並行處理、流水線處理、自動化調參和持久化,是使用sklearn優雅地進行數據挖掘的核心。
from numpy import array, hstack, median, nan, vstack
from numpy.random import choice
from sklearn.datasets import load_iris

# Load the iris data set; the statements below extend it in place so that the
# later examples have a categorical column and a missing-value row to process.
iris = load_iris()

# --- Feature matrix ---------------------------------------------------------
# Append one extra sample whose measurements are all missing (NaN).  The row
# width follows the data instead of being hard-coded to four columns.
missing_row = array([nan] * iris.data.shape[1]).reshape(1, -1)
measurements = vstack((iris.data, missing_row))

# Prepend a "flower colour" column (0 = white, 1 = yellow, 2 = red).  The
# colour is drawn at random, i.e. it carries no information about the class.
colour = choice([0, 1, 2], size=measurements.shape[0]).reshape(-1, 1)
iris.data = hstack((colour, measurements))

# --- Target vector ----------------------------------------------------------
# Add a label for the extra (all-NaN) sample: the median of the existing
# labels.  median() returns a float, so cast it back to the labels' own dtype;
# otherwise hstack would silently turn the whole label vector into floats.
extra_label = array([median(iris.target)]).astype(iris.target.dtype)
iris.target = hstack((iris.target, extra_label))
下表列出上述技術在sklearn中所對應的方法或者類,以便於查詢,具體使用方法將在後面部分詳細展開。
| 包 | 類或方法 | 說明 |
| --- | --- | --- |
| sklearn.pipeline | Pipeline | 流水線處理 |
| sklearn.pipeline | FeatureUnion | 並行處理 |
| sklearn.model_selection | GridSearchCV | 網格搜索調參 |
| sklearn.externals.joblib | dump | 數據持久化 |
| sklearn.externals.joblib | load | 從文件系統中加載數據至內存 |
並行處理可以分爲整體並行處理和部分並行處理,其區別如下:整體並行處理中,每個並行工作的輸入都是整個特徵矩陣;部分並行處理中,可以指定每個並行工作所讀取的特徵矩陣的列。
整體並行處理的代碼如下:
from numpy import log1p
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Binarizer, FunctionTransformer

# Whole-matrix parallel processing: both transformers are handed to a single
# FeatureUnion, which is then wrapped as a named pipeline step.
step2_1 = ('ToLog', FunctionTransformer(log1p))   # log(1 + x) transform
step2_2 = ('ToBinary', Binarizer())               # binarise with defaults
step2 = (
    'FeatureUnion',
    FeatureUnion(transformer_list=[step2_1, step2_2]),
)
在某些特定場景下,我們只需要對特徵矩陣的某些列進行轉換,而不是所有列,此時可以使用部分並行處理,代碼如下:
from sklearn.pipeline import FeatureUnion, _fit_one_transformer, _fit_transform_one, _transform_one
from sklearn.externals.joblib import Parallel, delayed
from scipy import sparse
import numpy as np


class FeatureUnionExt(FeatureUnion):
    """FeatureUnion in which every transformer reads only selected columns.

    Compared with ``FeatureUnion`` there is one extra parameter, ``idx_list``:
    its i-th entry lists the columns of the feature matrix that the i-th
    transformer in ``transformer_list`` is fitted on / applied to.

    Parameters
    ----------
    transformer_list : iterable of (name, transformer) pairs
    idx_list : sequence of column index lists, one per transformer
    n_jobs : number of parallel jobs handed to joblib's ``Parallel``
    transformer_weights : forwarded unchanged to ``FeatureUnion``

    NOTE(review): this relies on private helpers of ``sklearn.pipeline``
    (``_fit_one_transformer``, ``_fit_transform_one``, ``_transform_one``)
    with the call signatures used below; confirm them against the installed
    scikit-learn release.
    """

    def __init__(self, transformer_list, idx_list, n_jobs=1,
                 transformer_weights=None):
        self.idx_list = idx_list
        # Hand the base class a real list.  On Python 3, map() would yield a
        # one-shot iterator that is empty after its first traversal and cannot
        # be slice-assigned or compared for equality.
        FeatureUnion.__init__(
            self,
            transformer_list=[(trans[0], trans[1])
                              for trans in transformer_list],
            n_jobs=n_jobs,
            transformer_weights=transformer_weights)

    def _transformers_with_columns(self):
        """Return ``[(name, transformer, column_indices), ...]``."""
        if len(self.transformer_list) != len(self.idx_list):
            raise ValueError(
                'transformer_list and idx_list must have the same length '
                '(got %d and %d)'
                % (len(self.transformer_list), len(self.idx_list)))
        return [(trans[0], trans[1], idx)
                for trans, idx in zip(self.transformer_list, self.idx_list)]

    @staticmethod
    def _stack_columns(Xs):
        """Concatenate transformer outputs column-wise.

        The result is a CSR matrix if any output is sparse, otherwise a dense
        array.
        """
        if any(sparse.issparse(f) for f in Xs):
            return sparse.hstack(Xs).tocsr()
        return np.hstack(Xs)

    # fit is overridden because each transformer only reads part of X.
    def fit(self, X, y=None):
        """Fit every transformer on its own column subset of ``X``."""
        transformers = Parallel(n_jobs=self.n_jobs)(
            # Only the selected columns are passed to the transformer's fit.
            delayed(_fit_one_transformer)(trans, X[:, idx], y)
            for _name, trans, idx in self._transformers_with_columns())
        self._update_transformer_list(transformers)
        return self

    # fit_transform is overridden because each transformer only reads part
    # of X.
    def fit_transform(self, X, y=None, **fit_params):
        """Fit every transformer on its columns and return the stacked output."""
        result = Parallel(n_jobs=self.n_jobs)(
            # Only the selected columns are passed to fit_transform.
            delayed(_fit_transform_one)(trans, name, X[:, idx], y,
                                        self.transformer_weights,
                                        **fit_params)
            for name, trans, idx in self._transformers_with_columns())
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        return self._stack_columns(Xs)

    # transform is overridden because each transformer only reads part of X.
    def transform(self, X):
        """Apply every fitted transformer to its columns; stack the outputs."""
        Xs = Parallel(n_jobs=self.n_jobs)(
            # Only the selected columns are passed to transform.
            delayed(_transform_one)(trans, name, X[:, idx],
                                    self.transformer_weights)
            for name, trans, idx in self._transformers_with_columns())
        return self._stack_columns(Xs)
我們對特徵矩陣的第1列進行定性特徵編碼,對第2、3、4列進行對數函數轉換,對第5列進行定量特徵二值化處理,代碼如下:
from numpy import log1p
from sklearn.preprocessing import Binarizer, FunctionTransformer, OneHotEncoder

# Partial parallel processing: each transformer is paired, by position, with
# the column indices it should read (see idx_list below).
step2_1 = ('OneHotEncoder', OneHotEncoder(sparse=False))  # column 0
step2_2 = ('ToLog', FunctionTransformer(log1p))           # columns 1, 2, 3
step2_3 = ('ToBinary', Binarizer())                       # column 4
step2 = (
    'FeatureUnionExt',
    FeatureUnionExt(
        transformer_list=[step2_1, step2_2, step2_3],
        idx_list=[[0], [1, 2, 3], [4]],
    ),
)
流水線上除了最後一個工作外,其餘工作都要執行fit_transform方法,上一個工作的輸出作爲下一個工作的輸入;最後一個工作必須實現fit方法,其輸入爲上一個工作的輸出,代碼如下:
from numpy import log1p
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    Binarizer,
    FunctionTransformer,
    Imputer,
    MinMaxScaler,
    OneHotEncoder,
)

# Step 1: imputation with the Imputer defaults.
step1 = ('Imputer', Imputer())

# Step 2: partial parallel processing; each transformer reads only the
# columns given at the same position in idx_list.
step2_1 = ('OneHotEncoder', OneHotEncoder(sparse=False))
step2_2 = ('ToLog', FunctionTransformer(log1p))
step2_3 = ('ToBinary', Binarizer())
step2 = (
    'FeatureUnionExt',
    FeatureUnionExt(
        transformer_list=[step2_1, step2_2, step2_3],
        idx_list=[[0], [1, 2, 3], [4]],
    ),
)

# Steps 3-5: scaling, chi2-based selection of 3 features, PCA down to 2
# components.
step3 = ('MinMaxScaler', MinMaxScaler())
step4 = ('SelectKBest', SelectKBest(chi2, k=3))
step5 = ('PCA', PCA(n_components=2))

# Step 6: the final estimator of the pipeline.
step6 = ('LogisticRegression', LogisticRegression(penalty='l2'))

# Chain the steps in order into one pipeline object.
pipeline = Pipeline(steps=[step1, step2, step3, step4, step5, step6])
使用網格搜索調參,代碼如下:
from sklearn.model_selection import GridSearchCV

# Create the grid-search object.
#   * first argument: the model to be trained (here the whole pipeline)
#   * param_grid: dict describing the grid; each key is a parameter path of
#     the form "object name__sub-object name__parameter name" and each value
#     is the list of candidate values for that parameter
grid_search = GridSearchCV(
    pipeline,
    param_grid={
        'FeatureUnionExt__ToBinary__threshold': [1.0, 2.0, 3.0, 4.0],
        'LogisticRegression__C': [0.1, 0.2, 0.4, 0.8],
    },
)

# Run the search on the prepared feature matrix and target vector.
grid_search.fit(iris.data, iris.target)
持久化(保存並重新加載訓練好的網格搜索對象)的代碼如下:
# dump/load were used without being imported; take them from the same joblib
# copy that the rest of this file already uses.
from sklearn.externals.joblib import dump, load

# Persist the fitted grid-search object:
#   * first argument: the in-memory object to save
#   * second argument: the file name on disk
#   * compress: the compression level passed to joblib (here 3)
dump(grid_search, 'grid_search.dmp', compress=3)

# Load the object back from the file system into memory.
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code; only load files that come from a trusted source.
grid_search = load('grid_search.dmp')