Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1783033
  • 博文数量: 297
  • 博客积分: 285
  • 博客等级: 二等列兵
  • 技术积分: 3006
  • 用 户 组: 普通用户
  • 注册时间: 2010-03-06 22:04
个人简介

Linuxer, ex IBMer. GNU https://hmchzb19.github.io/

文章分类

全部博文(297)

文章存档

2020年(11)

2019年(15)

2018年(43)

2017年(79)

2016年(79)

2015年(58)

2014年(1)

2013年(8)

2012年(3)

分类: 大数据

2020-04-07 14:13:55

记录下最近在看的两个博客
刘建平Pinard

参考了

数据文件仍然使用了上一篇中的数据, 参考了他的文章才有了我这篇文章. 在他的4种method之外, 我多加了一个RandomForestClassifier, 但是RandomForestClassifier和ExtraTreesClassifier结果非常接近.

代码如下:

点击(此处)折叠或打开

  1. # coding: utf-8
  2. import pandas as pd, numpy as np
  3. import statsmodels.api as sm
  4. import statsmodels.formula.api as smf
  5. import seaborn as sns
  6. import matplotlib.pyplot as plt
  7. from sklearn.model_selection import train_test_split
  8. from sklearn.svm import SVC
  9. from sklearn.neighbors import KNeighborsClassifier
  10. from sklearn.preprocessing import StandardScaler
  11. from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix)
  12. from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier)
  13. sns.set()

  14. #Handle data
  15. col_names=['preg','plas','pres','skin','insu','mass','pedi','age','class']
  16. data=pd.read_csv('data/pima-indians-diabetes.csv',header=None,names=col_names)
  17. X=data.drop('class', axis=1)
  18. y=data['class']
  19. #pring X.shape and y.shape
  20. print(X.shape, y.shape,'\n')

  21. #using PCA -- Principal Component Analysis
  22. def pca_select():
  23.     from sklearn.decomposition import PCA

  24.     pca=PCA()
  25.     pca.fit(X)
  26.     #sum of this list almost == 1
  27.     print(sum(pca.explained_variance_ratio_))

  28.     #print the features according to their importance
  29.     print("Print the feature importance from biggest to smallest")
  30.     for i in pca.explained_variance_ratio_:
  31.         print('{:.6f}'.format(i))
  32.     
  33.     #or use map
  34.     '''
  35.     for i in map('{:.6f}'.format , pca.explained_variance_ratio_):
  36.         print(i,end='\t')
  37.     '''

  38.     print()
  39. pca_select()


  40. from sklearn.feature_selection import (f_classif,SelectKBest)
  41. def univariate_select():
  42.     
  43.     test=SelectKBest(score_func=f_classif, k=4)
  44.     fit=test.fit(X, y)
  45.     
  46.     print("feature scores: {}".format(fit.scores_))
  47.     
  48.     #a higher score means higher importance
  49.     for score, feature in sorted(zip(fit.scores_, list(X))):
  50.         print('score {:<20} of feature {}'.format(score, feature))

  51.     features=fit.transform(X)
  52.     #print first 5 line of these 4 features
  53.     print(features[0:5, :])
  54.     
  55.     #confirm the 4 features being selected are "preg", "plas", "mass", "age",
  56.     assert np.array_equal(features, np.array(X[["preg", "plas", "mass", "age", ]]))
  57.     
  58.     

  59. univariate_select()
  60.     
  61. from sklearn.feature_selection import RFE
  62. from sklearn.linear_model import LogisticRegression
  63. def RFE_select():
  64.     model = LogisticRegression(solver='lbfgs')
  65.     rfe=RFE(model, 4)
  66.     fit=rfe.fit(X, y)
  67.     
  68.     print('\n')
  69.     print("Num Features: {}".format(fit.n_features_))
  70.     print("Selected Features: {}".format(fit.support_))
  71.     print("Feature ranking, those with rank 1 will be choosen:\n{}".format(fit.ranking_))

  72.     #print the 4 feature names to be choosen
  73.     print("The 4 features to be choosen:")
  74.     for idx,value in enumerate(fit.ranking_):
  75.         if value==1:
  76.             print(list(X)[idx], end='\t')
  77.         
  78.         
  79. RFE_select()


  80. def rlf_select():
  81.     rnd_clf = RandomForestClassifier(random_state=0, n_estimators=100)
  82.     rnd_clf.fit(X, y)
  83.     rnd_name=rnd_clf.__class__.__name__
  84.     
  85.     feature_importances = rnd_clf.feature_importances_
  86.     importance = sorted(zip(feature_importances, list(X)), reverse=True)
  87.     
  88.     print('\n\n{} most important features ( {} )'.format(4, rnd_name))
  89.     print("The 4 features to be choosen:")
  90.     [print(row) for i, row in enumerate(importance) if i < 4]

  91. rlf_select()

  92. def et_select():
  93.     et_clf = ExtraTreesClassifier(random_state=0, n_estimators=100)
  94.     et_clf.fit(X, y)
  95.     et_name=et_clf.__class__.__name__
  96.     
  97.     feature_importances = et_clf.feature_importances_
  98.     importance = sorted(zip(feature_importances, list(X)), reverse=True)
  99.     
  100.     print('\n\n{} most important features ( {} )'.format(4, et_name))
  101.     print("The 4 features to be choosen:")
  102.     [print(row) for i, row in enumerate(importance) if i < 4]

  103. et_select()

阅读(922) | 评论(0) | 转发(0) |
0

上一篇:Standardization_example

下一篇:logistic_regression

给主人留下些什么吧!~~