I've been reading the sklearn documentation these past few days and noticed it offers quite a few classifiers, so here are some brief notes.
Broadly, these classifiers fall into two categories: 1) single classifiers, and 2) ensemble classifiers.
The following example compares the performance of several single classifiers:
```python
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older releases
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# meta-estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = {
    'KN': KNeighborsClassifier(3),
    'SVC_linear': SVC(kernel="linear", C=0.025),  # the two SVCs need distinct keys;
    'SVC_rbf': SVC(gamma=2, C=1),                 # a duplicated 'SVC' key silently drops one of them
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1),  # clf.feature_importances_
    'ET': ExtraTreesClassifier(n_estimators=10, max_depth=None),                 # clf.feature_importances_
    'AB': AdaBoostClassifier(n_estimators=100),
    'GB': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0),               # clf.feature_importances_
    'GNB': GaussianNB(),
    'LD': LinearDiscriminantAnalysis(),
    'QD': QuadraticDiscriminantAnalysis()}

X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y)
    print(name, '\t--> ', scores.mean())
```
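The `clf.feature_importances_` comments above refer to an attribute that the tree-based models expose after fitting. A minimal sketch of inspecting it, reusing the `X, y` generated above:

```python
# Fit one of the tree-based models from the dict and read its importances.
rf = classifiers['RF']
rf.fit(X, y)
print(rf.feature_importances_)  # one non-negative score per feature, summing to 1.0
```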
(Figure from the original post: mean cross-validation score for each classifier; not reproduced here.)
Four meta-estimator techniques come next: Bagging, Voting, GridSearch, and Pipeline. Strictly speaking, only the first two are ensemble classifiers; GridSearch is a hyperparameter-search wrapper around an estimator, and Pipeline is a chaining (pipelining) technique. First, the Bagging example:
```python
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Bagging: train many KNN models, each on a random half of the samples and features
meta_clf = KNeighborsClassifier()
bg_clf = BaggingClassifier(meta_clf, max_samples=0.5, max_features=0.5)
```
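The snippet only constructs the ensemble; a minimal usage sketch (the iris dataset here is an illustrative choice, not from the original post):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
scores = cross_val_score(bg_clf, X, y, cv=5)  # fit and score the bagged KNN via cross-validation
print(scores.mean())
```

Next, the Voting example: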
```python
from sklearn import datasets
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older releases
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

# Voting: majority vote ('hard') over the three base classifiers, with per-estimator weights
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                        voting='hard', weights=[2, 1, 2])

for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
```
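For comparison, `voting='soft'` averages the predicted class probabilities instead of counting majority votes; all three base estimators here implement `predict_proba`, so it is a one-line change. A sketch, reusing `clf1`/`clf2`/`clf3` from above:

```python
# Soft voting: weighted average of class probabilities rather than of hard votes.
eclf_soft = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                             voting='soft', weights=[2, 1, 2])
scores = cross_val_score(eclf_soft, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [Soft Ensemble]" % (scores.mean(), scores.std()))
```

Next, GridSearch; the example below runs both a randomized search and an exhaustive grid search over a random forest: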
```python
from time import time
from scipy.stats import randint as sp_randint
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV        # sklearn.grid_search in older releases
from sklearn.model_selection import RandomizedSearchCV

# load the data
digits = load_digits()
X, y = digits.data, digits.target

# base estimator to tune
meta_clf = RandomForestClassifier(n_estimators=20)

# =================================================================
# parameter distributions (min_samples_split must be >= 2 in current sklearn)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run the randomized search
n_iter_search = 20
rs_clf = RandomizedSearchCV(meta_clf, param_distributions=param_dist,
                            n_iter=n_iter_search)
start = time()
rs_clf.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time() - start), n_iter_search))
print(rs_clf.cv_results_)   # grid_scores_ in older releases

# =================================================================
# parameter grid
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run the exhaustive grid search
gs_clf = GridSearchCV(meta_clf, param_grid=param_grid)
start = time()
gs_clf.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(gs_clf.cv_results_['params'])))
print(gs_clf.cv_results_)
```
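After fitting, both search objects expose the best configuration found; these attributes are standard sklearn API:

```python
print(rs_clf.best_params_, rs_clf.best_score_)   # best setting found by random search
print(gs_clf.best_params_, gs_clf.best_score_)   # best setting found by grid search
best_model = gs_clf.best_estimator_              # refit on the full data (refit=True by default)
```

The last technique is Pipeline.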
The first Pipeline example:
```python
from sklearn import svm
from sklearn.datasets import make_classification  # sklearn.datasets.samples_generator in older releases
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

# generate data
X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)

# define the Pipeline: ANOVA feature selection first, then SVM
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')
pipe = Pipeline([('anova', anova_filter), ('svc', clf)])

# set anova's k=10 and svc's C=0.1 (step name and parameter joined by a double underscore "__"!)
pipe.set_params(anova__k=10, svc__C=.1)
pipe.fit(X, y)

prediction = pipe.predict(X)
pipe.score(X, y)

# boolean mask of the features selected by anova_filter
s = pipe.named_steps['anova'].get_support()
print(s)
```
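`get_support()` returns a boolean mask over the input features; passing `indices=True` yields the column indices directly. A small sketch:

```python
import numpy as np

print(np.where(s)[0])                                       # indices where the mask is True
print(pipe.named_steps['anova'].get_support(indices=True))  # equivalent, via the indices flag
```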
The second Pipeline example:
```python
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older releases

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# define the pipeline: dimensionality reduction (PCA) first, then logistic regression
pca = decomposition.PCA()
logistic = linear_model.LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# the pipeline itself becomes the estimator passed to GridSearchCV
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))
estimator.fit(X_digits, y_digits)
```
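Once fitted, the search object can report which `n_components`/`C` combination won; a minimal inspection sketch (attribute names are standard sklearn, the rest reuses the objects above):

```python
print(estimator.best_params_)    # e.g. {'logistic__C': ..., 'pca__n_components': ...}
print(estimator.best_score_)
best_pipe = estimator.best_estimator_
print(best_pipe.named_steps['pca'].n_components)  # the winning PCA dimensionality
```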