https://mlnote.wordpress.com/2015/12/16/python%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E8%B7%B5%E4%B8%8Ekaggle%E5%AE%9E%E6%88%98-machine-learning-for-kaggle-competition-in-python/php
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
# Load the data
iris = load_iris()
# Pick out the features and labels
X_iris, y_iris = iris.data, iris.target
# Use only the first two columns as features
X, y = X_iris[:, :2], y_iris
# Hold out a portion, 25% of the data, as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
# Standardize the original features; this step matters quite a bit but is often skipped by competitors
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.linear_model import SGDClassifier
# Use an SGD classifier: well suited to large datasets, estimates parameters by stochastic gradient descent
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Import the evaluation utilities
from sklearn import metrics
y_train_predict = clf.predict(X_train)
# In-sample check: accuracy on the training samples
print metrics.accuracy_score(y_train, y_train_predict)
# Proper out-of-sample check: accuracy on the held-out test samples
y_predict = clf.predict(X_test)
print metrics.accuracy_score(y_test, y_predict)
0.660714285714
0.684210526316
# For a more detailed performance report (precision, recall, f1-score), use the following function.
print metrics.classification_report(y_test, y_predict, target_names=iris.target_names)
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         8
 versicolor       0.43      0.27      0.33        11
  virginica       0.65      0.79      0.71        19

avg / total       0.66      0.68      0.66        38
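# As a side sketch, the per-class precision and recall in the report above can also be recovered
# from the confusion matrix (rows are true classes, columns are predicted classes):
cm = metrics.confusion_matrix(y_test, y_predict)
print cm
for i, name in enumerate(iris.target_names):
    # precision = correct predictions of this class / all predictions of this class
    # recall    = correct predictions of this class / all true members of this class
    print name, float(cm[i, i]) / cm[:, i].sum(), float(cm[i, i]) / cm[i, :].sum()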
# To examine the SGDClassifier's performance more thoroughly, we need to make full use of the data: split it into N parts and use each part once for testing.
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Pipeline keeps the model building concise; in general, before fit, the data needs feature_extraction, preprocessing, and other necessary steps.
# Here we use the default parameter settings
clf = Pipeline([('scaler', StandardScaler()), ('sgd_classifier', SGDClassifier())])
# 5-fold cross-validation over the entire dataset
cv = KFold(X.shape[0], 5, shuffle=True, random_state = 33)
scores = cross_val_score(clf, X, y, cv=cv)
print scores
# Summarize the overall model performance: mean accuracy and standard deviation
print scores.mean(), scores.std()
from scipy.stats import sem
import numpy as np
# The error estimate used here is slightly different; see http://www.graphpad.com/guides/prism/6/statistics/index.htm?stat_semandsdnotsame.htm
print np.mean(scores), sem(scores)
[ 0.56666667  0.73333333  0.83333333  0.76666667  0.8       ]
0.74 0.0928559218479
0.74 0.0464279609239
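# The two error bars above differ only in normalization: the standard error of the mean is the
# sample standard deviation divided by sqrt(n). A one-line sketch to verify:
print scores.std(ddof=1) / np.sqrt(len(scores))   # matches sem(scores)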
from sklearn.datasets import fetch_olivetti_faces
# This dataset is not bundled with the package; functions like this download it on demand
faces = fetch_olivetti_faces()
# 這裏證實,數據是以Dict的形式存儲的,與多數實驗性數據的格式一致
faces.keys()
['images', 'data', 'target', 'DESCR']
# Check the data size with the shape attribute
print faces.data.shape
print faces.target.shape
(400L, 4096L)
(400L,)
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
# As before, split off 25% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0)
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

# A helper function (module) for cross-validated performance evaluation
def evaluate_cross_validation(clf, X, y, K):
    # KFold needs: the number of samples, the number of folds, and whether to shuffle
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # Cross-validate with these splits; for classification the default score is accuracy, but other metrics can be used
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print 'Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores))

# SVC with a linear kernel (different kernels, discussed later, can give very different results)
svc_linear = SVC(kernel='linear')
# 5-fold cross-validation, K = 5
evaluate_cross_validation(svc_linear, X_train, y_train, 5)
[ 0.93333333  0.86666667  0.91666667  0.93333333  0.91666667]
Mean score: 0.913 (+/-0.012)
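# To complement the cross-validated scores, a quick sketch of how the same linear-kernel SVC does
# on the held-out 25% test split (fit once on the training split, then score):
svc_linear.fit(X_train, y_train)
print svc_linear.score(X_test, y_test)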
from sklearn.datasets import fetch_20newsgroups
# As with the faces dataset, the 20 newsgroups data has to be downloaded on demand
news = fetch_20newsgroups(subset='all')
# Inspect the data: still a dict-like format, with 18846 samples in total
print len(news.data), len(news.target)
print news.target
18846 18846
[10  3 17 ...,  3  1  7]
# Check the newsgroup category names and how many categories there are
print news.target_names
print len(news.target_names)
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20
# Again, hold out 25% of the data to evaluate model performance
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25)
print len(X_train)
print len(y_train)
print len(X_test)
14134
14134
4712
# Much raw data cannot be fed to a classifier directly; images can use pixel values as-is, but text must first be turned into numeric features
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import *
from scipy.stats import sem

# With the multinomial naive Bayes classifier as the base model, compare the performance of several
# feature extraction methods, using Pipeline to keep the training workflow simple
clf_1 = Pipeline([('count_vec', CountVectorizer()), ('mnb', MultinomialNB())])
clf_2 = Pipeline([('hash_vec', HashingVectorizer(non_negative=True)), ('mnb', MultinomialNB())])
clf_3 = Pipeline([('tfidf_vec', TfidfVectorizer()), ('mnb', MultinomialNB())])

# A helper function (module) for cross-validated performance evaluation
def evaluate_cross_validation(clf, X, y, K):
    # KFold needs: the number of samples, K, and whether to shuffle
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # Cross-validate with these splits; for classification the default score is accuracy, but other metrics can be used
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print 'Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores))
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, X_train, y_train, 5)
[ 0.83516095  0.83374602  0.84471171  0.83622214  0.83227176]
Mean score: 0.836 (+/-0.002)
[ 0.76052352  0.72727273  0.77538026  0.74778918  0.75194621]
Mean score: 0.753 (+/-0.008)
[ 0.84435798  0.83409975  0.85496993  0.84082066  0.83227176]
Mean score: 0.841 (+/-0.004)
# The results above show that the two common feature extraction methods perform comparably.
# Pick one of them and push performance further with more careful feature filtering.
clf_4 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB())])
evaluate_cross_validation(clf_4, X_train, y_train, 5)
[ 0.87053414  0.86664308  0.887867    0.87371772  0.86553432]
Mean score: 0.873 (+/-0.004)
# Tuning the naive Bayes smoothing parameter may lift performance yet another notch.
clf_5 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB(alpha=0.01))])
evaluate_cross_validation(clf_5, X_train, y_train, 5)
[ 0.90060134  0.89741776  0.91651928  0.90909091  0.90410474]
Mean score: 0.906 (+/-0.003)
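# As a final sanity check (a quick sketch), the tuned pipeline can also be fitted on the full
# training split and scored on the held-out 25% test data:
clf_5.fit(X_train, y_train)
print clf_5.score(X_test, y_test)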
# To make the data handling easier, we bring in a new toolkit: pandas
import pandas as pd
import numpy as np
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Take a look at the data: there is a bit of everything, including numeric features, categorical ones, strings, and even missing values.
titanic.head()
# With pandas, the data is loaded into its own DataFrame format (a two-dimensional table); call info() to see its basic characteristics
titanic.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 123.1+ KB
# This is real survey data on the Titanic's passengers and their boarding information, which helps us predict whether each passenger survived.
# There are 1313 rows in total; some features are complete (e.g. pclass, survived, name), others have gaps; some are numeric (age: float64), others are strings.
# Feature selection is a step that beginners tend to underrate: it is time-consuming but very important, and it relies on background knowledge. Given what we know about the disaster, sex, age and pclass are very likely key factors in survival.
# we keep pclass, age, sex.
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
X.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       633 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 41.0+ KB
# A few data-processing tasks remain:
# 1) the age column has only 633 values
# 2) sex and pclass are categorical and must be converted into numeric (0/1) features
# First fill in the missing ages; using the mean or the median is the strategy that distorts the model the least
X['age'].fillna(X['age'].mean(), inplace=True)
C:\Anaconda2\lib\site-packages\pandas\core\generic.py:2748: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
X.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 41.0+ KB
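# A small aside on the SettingWithCopyWarning shown above: it appears because X was created as a
# slice of the titanic DataFrame. A sketch of one common way to avoid it is to take an explicit
# copy before filling in place:
X = titanic[['pclass', 'age', 'sex']].copy()
X['age'].fillna(X['age'].mean(), inplace=True)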
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
# Use scikit-learn's feature_extraction module
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
print vec.feature_names_
# Notice that every categorical feature has been split out into its own column, while the numeric feature is left unchanged
['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
X_test = vec.transform(X_test.to_dict(orient='record'))
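# To make the one-hot expansion above more concrete, here is a tiny sketch with two made-up
# passenger records: each categorical value gets its own 0/1 column, numeric values pass through.
toy = [{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
       {'pclass': '3rd', 'age': 2.0, 'sex': 'male'}]
toy_vec = DictVectorizer(sparse=False)
print toy_vec.fit_transform(toy)
print toy_vec.feature_names_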
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
dtc.fit(X_train, y_train)
dtc.score(X_test, y_test)
0.79331306990881456
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=3, min_samples_leaf=5)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)
0.77203647416413379
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(max_depth=3, min_samples_leaf=5)
gbc.fit(X_train, y_train)
gbc.score(X_test, y_test)
0.79027355623100304
from sklearn.metrics import classification_report
y_predict = gbc.predict(X_test)
print classification_report(y_predict, y_test)
# classification_report conveniently produces a per-class performance report (precision, recall), metrics that are especially informative in a binary classification setting like this one.
             precision    recall  f1-score   support

          0       0.93      0.78      0.84       241
          1       0.57      0.83      0.68        88

avg / total       0.83      0.79      0.80       329
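# A compact sketch that gathers the three tree-based models' test scores in one loop
# (the same numbers that were printed individually above):
for model in [dtc, rfc, gbc]:
    print model.__class__.__name__, model.score(X_test, y_test)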
# First, load the house-price data
from sklearn.datasets import load_boston
boston = load_boston()
# Check the data size
print boston.data.shape
(506L, 13L)
# Taking the time to understand what each feature means is also a good habit
print boston.feature_names
print boston.DESCR
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
'B' 'LSTAT']
Boston House Prices dataset
Notes
------
Data Set Characteristics:
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive
:Median Value (attribute 14) is usually the target
:Attribute Information (in order):
    - CRIM       per capita crime rate by town
    - ZN         proportion of residential land zoned for lots over 25,000 sq.ft.
    - INDUS      proportion of non-retail business acres per town
    - CHAS       Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    - NOX        nitric oxides concentration (parts per 10 million)
    - RM         average number of rooms per dwelling
    - AGE        proportion of owner-occupied units built prior to 1940
    - DIS        weighted distances to five Boston employment centres
    - RAD        index of accessibility to radial highways
    - TAX        full-value property-tax rate per $10,000
    - PTRATIO    pupil-teacher ratio by town
    - B          1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    - LSTAT      % lower status of the population
    - MEDV       Median value of owner-occupied homes in $1000's

:Missing Attribute Values: None

:Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression problems.

**References**

- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
# One extra step here: check whether the target values are normalized (usually they are not)
import numpy as np
print np.max(boston.target)
print np.min(boston.target)
print np.mean(boston.target)
50.0
5.0
22.5328063241
from sklearn.cross_validation import train_test_split
# As before, we split the data
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25, random_state=33)
from sklearn.preprocessing import StandardScaler
# Standardization keeps large differences in raw feature scales from skewing the learned parameter weights
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)
scalery = StandardScaler().fit(y_train)
y_train = scalery.transform(y_train)
y_test = scalery.transform(y_test)
# Write the evaluation helper first: still 5-fold cross-validation by default, but the score is no longer accuracy; it is R^2, which roughly tells what fraction of the variation in the regression targets the model can cover and explain
from sklearn.cross_validation import *
def train_and_evaluate(clf, X_train, y_train):
    cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print 'Average coefficient of determination using 5-fold cross validation:', np.mean(scores)

# Finally, let's see how many kinds of regression models are available (there are in fact more).
# Three representative ones are tried below.
# Start with a linear model: SGDRegressor
from sklearn import linear_model
# There is a regularization option, penalty; with only 13 features here it probably makes little difference
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.710809853468
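# A side note on the score above: the coefficient of determination R^2 is 1 - SS_res / SS_tot,
# i.e. the fraction of target variance the model explains. A tiny sketch with made-up numbers:
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.9, 3.2])
ss_res = ((y_true - y_pred) ** 2).sum()
ss_tot = ((y_true - y_true.mean()) ** 2).sum()
print 1 - ss_res / ss_tot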
# Switch the SGDRegressor penalty to l2; the result barely changes, because with so few features regularization has little effect
clf_sgd_l2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd_l2, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.71081206667
# Now let's see how the SVM regressor does (all with default parameters)
from sklearn.svm import SVR
# The linear kernel brings no real improvement, but since there are so few features it is worth mapping them to a higher dimension
clf_svr = SVR(kernel='linear')
train_and_evaluate(clf_svr, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.707838419194
clf_svr_poly = SVR(kernel='poly')
# Raising the dimensionality helps noticeably, but use this trick with care: with many features the CPU will struggle (memory is the lesser problem). By this point even we can no longer directly interpret what the individual features mean.
train_and_evaluate(clf_svr_poly, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.779288545488
clf_svr_rbf = SVR(kernel='rbf')
# The RBF (radial basis function) kernel is even stronger!
train_and_evaluate(clf_svr_rbf, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.833662221567
# And something even more powerful: the extremely randomized trees regressor.
from sklearn import ensemble
clf_et = ensemble.ExtraTreesRegressor()
train_and_evaluate(clf_et, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.853006383633
# Finally, look at the performance on the test set
clf_et.fit(X_train, y_train)
clf_et.score(X_test, y_test)
0.83781467779895469
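# As a quick follow-up sketch, tree ensembles such as ExtraTreesRegressor also expose per-feature
# importances after fitting, which can be paired with the Boston feature names from earlier:
for name, importance in zip(boston.feature_names, clf_et.feature_importances_):
    print name, importance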
import numpy as np
# Warm up with a small example
M = np.array([[1, 2], [2, 4]])
M
np.linalg.matrix_rank(M, tol=None)   # the rank of M is 1
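# The same rank-1 structure shows up in the singular values: since the second row of M is exactly
# twice the first, only one singular value is non-zero. A quick sketch with numpy's SVD:
U, s, Vt = np.linalg.svd(M)
print s   # roughly [ 5.  0.]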
# Load the handwritten digit image data. For image processing, aside from the various heuristic feature extraction methods used later,
# the most direct and commonly used features are the raw pixels, each a numeric value encoding intensity.
from sklearn.datasets import load_digits
digits = load_digits()
# These classic datasets are stored in a very consistent format; a good habit that unifies the interface and makes them quick to use.
digits
# The usual routine
X_digits, y_digits = digits.data, digits.target
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

# The key parameter is n_components = 2 principal components
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(X_digits)
# scikit-learn's interfaces are designed very consistently.

# Clustering problems often need an intuitive view of the data, and dimensionality reduction exists partly for that purpose, so plot the projected points, one color per digit, for a more intuitive picture.
def plot_pca_scatter():
    colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
    for i in xrange(len(colors)):
        px = X_pca[:, 0][y_digits == i]
        py = X_pca[:, 1][y_digits == i]
        plt.scatter(px, py, c=colors[i])
    plt.legend(digits.target_names)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()

plot_pca_scatter()
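# As a quick check on how much of the original pixel variance the 2-D projection keeps,
# the fitted PCA estimator exposes the variance ratio of each retained component:
print estimator.explained_variance_ratio_
print estimator.explained_variance_ratio_.sum()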
# This code achieves the same result as chapter 4 of the original book, but making full use of pandas is much more concise, so I rewrote it to be clearer and simpler.
import pandas as pd
import numpy as np
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
print titanic.info()   # the same dataset as before
titanic.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 123.1+ KB
None
# Drop the columns that are too individual-specific to help find common patterns (row.names, name), and separate out the column to predict.
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)
# For the continuous numeric feature, fill in the missing values
X['age'].fillna(X['age'].mean(), inplace=True)
X.fillna('UNKNOWN', inplace=True)
# Vectorize the remaining categorical features directly; this way, a blank value in a column is also treated as a feature of its own
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
X_test = vec.transform(X_test.to_dict(orient='record'))
print len(vec.feature_names_)
474
X_train.toarray()
array([[ 31.19418104,   0.        ,   0.        , ...,   0.        ,   0.        ,   1.        ],
       [ 31.19418104,   0.        ,   0.        , ...,   0.        ,   0.        ,   0.        ],
       [ 31.19418104,   0.        ,   0.        , ...,   0.        ,   0.        ,   1.        ],
       ...,
       [ 12.        ,   0.        ,   0.        , ...,   0.        ,   0.        ,   1.        ],
       [ 18.        ,   0.        ,   0.        , ...,   0.        ,   0.        ,   1.        ],
       [ 31.19418104,   0.        ,   0.        , ...,   0.        ,   0.        ,   1.        ]])
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
dt.score(X_test, y_test)   # test accuracy using all features
0.81762917933130697
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)   # test accuracy using the top 20% most predictive features
0.82370820668693012
from sklearn.cross_validation import cross_val_score
percentiles = range(1, 100, 2)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())
print results

opt = np.where(results == results.max())[0]
print 'Optimal number of features %d' % percentiles[opt]

import pylab as pl
pl.plot(percentiles, results)
pl.show()
[ 0.85063904  0.85673057  0.87501546  0.88622964  0.86590394  0.87097506
  0.87303649  0.86997526  0.87097506  0.87300557  0.86997526  0.86893424
  0.87098536  0.86490414  0.86385281  0.86791383  0.86488353  0.86892393
  0.86791383  0.86284271  0.86487322  0.86792414  0.86894455  0.87303649
  0.86892393  0.86998557  0.86689342  0.86488353  0.86895485  0.86689342
  0.87198516  0.8638322   0.86488353  0.87402597  0.87299526  0.87098536
  0.86997526  0.86892393  0.86794475  0.86486291  0.87096475  0.86587302
  0.86387343  0.86083282  0.86589363  0.8608019   0.86492476  0.85774067
  0.8608122   0.85779221]
Optimal number of features 7
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)   # test accuracy at the best feature percentile found by the search
0.8571428571428571
# This shows that, for practical purposes, this technique can be quite helpful for improving accuracy.
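# As a supplementary sketch, the fitted selector can also tell us which of the 474 vectorized
# columns survived the 7% filter, by combining its support mask with the DictVectorizer's names:
selected = np.array(vec.feature_names_)[fs.get_support()]
print len(selected)
print selected[:10]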
from sklearn.datasets import fetch_20newsgroups
import numpy as np
news = fetch_20newsgroups(subset='all')
# We first use the single-core version of grid search
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
# We try 4 values for svc__gamma (10^-2, 10^-1, ...) and 3 for svc__C,
# which gives 12 hyperparameter combinations, i.e. 12 differently configured models
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
# Each model is also cross-validated 3 times, so 36 fits in total; from the output below, each fit takes about 5 seconds on a single thread.
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)
%time _=gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
print gs.score(X_test, y_test)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] svc__gamma=0.01, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=0.1 -   5.1s
[CV] svc__gamma=0.01, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=0.1 -   5.3s
[CV] svc__gamma=0.01, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=0.1 -   5.2s
[CV] svc__gamma=0.1, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=0.1 -   5.1s
[CV] svc__gamma=0.1, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=0.1 -   5.2s
[CV] svc__gamma=0.1, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=0.1 -   5.3s
[CV] svc__gamma=1.0, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=0.1 -   5.7s
[CV] svc__gamma=1.0, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=0.1 -   5.8s
[CV] svc__gamma=1.0, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=0.1 -   5.9s
[CV] svc__gamma=10.0, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=0.1 -   5.4s
[CV] svc__gamma=10.0, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=0.1 -   5.5s
[CV] svc__gamma=10.0, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=0.1 -   5.5s
[CV] svc__gamma=0.01, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=1.0 -   5.2s
[CV] svc__gamma=0.01, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=1.0 -   5.3s
[CV] svc__gamma=0.01, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=1.0 -   5.3s
[CV] svc__gamma=0.1, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=1.0 -   5.2s
[CV] svc__gamma=0.1, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=1.0 -   5.3s
[CV] svc__gamma=0.1, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=1.0 -   5.4s
[CV] svc__gamma=1.0, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=1.0 -   5.3s
[CV] svc__gamma=1.0, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=1.0 -   5.4s
[CV] svc__gamma=1.0, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=1.0 -   5.5s
[CV] svc__gamma=10.0, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=1.0 -   5.4s
[CV] svc__gamma=10.0, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=1.0 -   5.3s
[CV] svc__gamma=10.0, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=1.0 -   5.4s
[CV] svc__gamma=0.01, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=0.01, svc__C=10.0 -   5.2s
[CV] svc__gamma=0.01, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=0.01, svc__C=10.0 -   5.2s
[CV] svc__gamma=0.01, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=0.01, svc__C=10.0 -   5.3s
[CV] svc__gamma=0.1, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=0.1, svc__C=10.0 -   5.3s
[CV] svc__gamma=0.1, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=0.1, svc__C=10.0 -   5.4s
[CV] svc__gamma=0.1, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=0.1, svc__C=10.0 -   5.4s
[CV] svc__gamma=1.0, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=1.0, svc__C=10.0 -   5.3s
[CV] svc__gamma=1.0, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=1.0, svc__C=10.0 -   5.5s
[CV] svc__gamma=1.0, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=1.0, svc__C=10.0 -   5.7s
[CV] svc__gamma=10.0, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=10.0, svc__C=10.0 -   5.6s
[CV] svc__gamma=10.0, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=10.0, svc__C=10.0 -   5.6s
[CV] svc__gamma=10.0, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=10.0, svc__C=10.0 -   5.9s
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  3.3min finished
Wall time: 3min 27s
0.822666666667
# Now run the same search in parallel across several processes and see how much the timing improves
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)
%time _=gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
print gs.score(X_test, y_test)
# The parallel search finds the same optimal hyperparameter configuration, but the training time drops roughly in proportion to the number of CPU cores.
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  22 out of  36 | elapsed:   30.3s remaining:   19.2s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   46.8s finished
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Wall time: 56.5 s
0.822666666667
# To give readers a better feel for the parallel speed-up, here is the configuration of the machine these results were obtained on:
'''
CPU: i7 quad-core, 2.4GHz
Memory: DDR3 1600, 32GB
'''