Importing data with the standard Python library
```python
from csv import reader
from io import TextIOWrapper
from urllib.request import urlopen
import numpy as np

filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
# open() cannot read a URL, so fetch the file with urlopen instead
with urlopen(filename) as raw_data:
    readers = reader(TextIOWrapper(raw_data, encoding='utf-8'), delimiter=',')
    x = list(readers)
    data = np.array(x)
    # Note: the flags data mixes strings (country name, colour names) with
    # numbers; drop the string columns before converting with astype('float')
    print(data.shape)
```
Importing data with NumPy
```python
from urllib.request import urlopen
from numpy import loadtxt

filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
# loadtxt expects purely numeric data, so this pattern suits an all-numeric
# CSV; the flags file would first need its string columns removed
with urlopen(filename) as raw_data:
    data = loadtxt(raw_data, delimiter=',')
print(data.shape)
```
Importing data with Pandas
```python
from pandas import read_csv

filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
names = ['name', 'landmass', 'zone', 'area', 'population', 'language',
         'religion', 'bars', 'stripes', 'colours', 'red', 'green', 'blue',
         'gold', 'white', 'black', 'orange', 'mainhue', 'circles', 'crosses',
         'saltires', 'quarters', 'sunstars', 'crescent', 'triangle', 'icon',
         'animate', 'text', 'topleft', 'botright']
data = read_csv(filename, names=names)
print(data.shape)
```
Descriptive statistics: analyzing the data
```python
from pandas import set_option

# A quick look at the data
print(data.head(10))
# Dimensions of the data
print(data.shape)
# Attribute types
print(data.dtypes)
# Descriptive statistics
set_option('display.width', 100)    # output width
set_option('display.precision', 4)  # numeric precision
print(data.describe())
# Class distribution (replace 'class' with the label column of your dataset)
print(data.groupby('class').size())
# Pairwise correlations
set_option('display.precision', 2)
print(data.corr(method='pearson'))
# Skew: how far each distribution deviates from a Gaussian
print(data.skew())
```
Data visualization: looking at the data
```python
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Histograms
data.hist()
# Density plots
data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
# Box-and-whisker plots
data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False)
# Correlation matrix plot
correlations = data.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0, len(names), 1)  # one tick per attribute
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
# Scatter-plot matrix
scatter_matrix(data)
plt.show()
```
Data cleaning
Clean the data by removing duplicate records, marking erroneous values, and flagging invalid inputs, as in the sketch below.
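A minimal sketch of these cleaning steps, assuming a Pandas DataFrame; the toy frame and the rule that a negative area counts as an error are hypothetical:

```python
import numpy as np
import pandas as pd

# Toy frame standing in for real data (hypothetical values)
data = pd.DataFrame({'area': [100, 100, -5, 7000],
                     'population': [10, 10, 3, np.nan]})

# Remove duplicate records
data = data.drop_duplicates()
# Mark impossible values as missing (here: a negative area)
data.loc[data['area'] < 0, 'area'] = np.nan
# Flag rows that still contain missing or erroneous inputs
print(data[data.isnull().any(axis=1)])
```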
Feature selection
Remove redundant feature attributes and add new ones; a small sketch of both operations comes first, followed by the selection recipes.
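For instance, dropping an attribute and deriving a new one from existing columns might look like this; the toy frame and the column choices are purely illustrative:

```python
import pandas as pd

# Toy frame standing in for the flags data (hypothetical values)
data = pd.DataFrame({'bars': [3, 0, 2], 'stripes': [0, 3, 1], 'zone': [1, 4, 2]})

# Remove an attribute judged redundant (the choice of 'zone' is illustrative)
data = data.drop(columns=['zone'])
# Derive a new attribute from existing ones
data['bars_plus_stripes'] = data['bars'] + data['stripes']
print(data)
```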
```python
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# Split the data into input features and output labels
array = data.values
x = array[:, 0:8]
y = array[:, 8]

# Univariate feature selection via the chi-squared test: it judges each
# feature by how far the observed values deviate from the theoretically
# expected values (the chi-squared statistic); the smaller the chi-squared
# value, the smaller the deviation and the closer the fit
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x, y)
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(x)
print(features)

# Select features via recursive feature elimination (RFE)
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(x, y)
print('Number of features:', fit.n_features_)
print('Selected features:', fit.support_)
print('Feature ranking:', fit.ranking_)

# Select features via principal component analysis (PCA)
pca = PCA(n_components=3)
fit = pca.fit(x)
print('Explained variance: %s' % fit.explained_variance_ratio_)
print(fit.components_)

# Feature importance: computed with decision trees (extra trees)
model = ExtraTreesClassifier()
fit = model.fit(x, y)
print(fit.feature_importances_)
```
Data transformation
Rescale the data or adjust its distribution to better expose the structure of the problem.
```python
from numpy import set_printoptions

# Split the data into input features and output labels
array = data.values
x = array[:, 0:8]
y = array[:, 8]

# Rescaling: measures all attributes on the same scale; useful for gradient
# descent, regression, neural networks, and k-nearest neighbors
from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler(feature_range=(0, 1))
newX = transformer.fit_transform(x)

# Standardization: transforms attributes to a Gaussian with mean 0 and
# variance 1, suitable as input to Gaussian-based algorithms such as linear
# regression, logistic regression, and linear discriminant analysis
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(x)
newX = transformer.transform(x)

# Normalization: rescales each row to unit length; well suited to sparse
# matrices, weight-based neural networks, and distance-based k-NN
from sklearn.preprocessing import Normalizer
transformer = Normalizer().fit(x)
newX = transformer.transform(x)

# Binarization: values above the threshold become 1, the rest become 0;
# useful for crisp values or when engineering new attributes
from sklearn.preprocessing import Binarizer
transformer = Binarizer(threshold=0.0).fit(x)
newX = transformer.transform(x)

# Set the print format and show the result
set_printoptions(precision=3)
print(newX)
```
Splitting the dataset
```python
from sklearn.linear_model import LogisticRegression

# Split into a training set and an evaluation set
from sklearn.model_selection import train_test_split
test_size = 0.33
seed = 4
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(x_train, y_train)
result = model.score(x_test, y_test)
print('Evaluation result: %.3f%%' % (result * 100))

# K-fold cross-validation: split the data into K groups, use each subset in
# turn as the validation set with the remaining K-1 subsets as the training
# set; this yields K models, and the mean validation accuracy of the K models
# serves as the classifier's performance metric
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
result = cross_val_score(model, x, y, cv=kfold)
print('Evaluation result: %.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))

# Leave-one-out cross-validation: each sample serves once as the validation
# set with the remaining N-1 samples as the training set; the mean validation
# accuracy over the N models is the metric.
# Compared with K-fold cross-validation, its advantages are: 1. nearly all
# samples train the model in every round; 2. no random factor affects the
# experiment, so the procedure is reproducible
from sklearn.model_selection import LeaveOneOut
loocv = LeaveOneOut()
model = LogisticRegression()
result = cross_val_score(model, x, y, cv=loocv)
print('Evaluation result: %.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))

# Repeated random splits into training and evaluation sets
from sklearn.model_selection import ShuffleSplit
n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression()
result = cross_val_score(model, x, y, cv=kfold)
print('Evaluation result: %.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))
```
Classification algorithm metrics
```python
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()

# Classification accuracy
result = cross_val_score(model, x, y, cv=kfold)
print('Accuracy: %.3f (%.3f)' % (result.mean(), result.std()))

# Logarithmic loss
result = cross_val_score(model, x, y, cv=kfold, scoring='neg_log_loss')
print('Logloss: %.3f (%.3f)' % (result.mean(), result.std()))

# Area under the ROC curve (for binary classification)
result = cross_val_score(model, x, y, cv=kfold, scoring='roc_auc')
print('AUC: %.3f (%.3f)' % (result.mean(), result.std()))

# Confusion matrix
test_size = 0.33
seed = 4
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(x_train, y_train)
predicted = model.predict(x_test)
matrix = confusion_matrix(y_test, predicted)
classes = ['0', '1']
dataframe = pd.DataFrame(data=matrix, index=classes, columns=classes)
print(dataframe)

# Classification report
# Precision: the proportion of retrieved items that should have been retrieved
# Recall: the proportion of items that should have been retrieved that
# actually were retrieved
report = classification_report(y_test, predicted)
print(report)
```
Regression algorithm metrics
```python
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

n_splits = 10
seed = 7
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
model = LinearRegression()

# Pick one scoring metric:
# Mean absolute error: the mean of the absolute differences between
# predictions and actual values
scoring = 'neg_mean_absolute_error'
# Mean squared error: the mean of the squared differences between predictions
# and actual values (its arithmetic square root is the RMSE)
scoring = 'neg_mean_squared_error'
# Coefficient of determination (R^2): the proportion of the variance in the
# dependent variable that the regression explains through the independent
# variables
scoring = 'r2'

result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
print('%.3f (%.3f)' % (result.mean(), result.std()))
```
Spot-checking classification algorithms
```python
from sklearn.model_selection import KFold, cross_val_score

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Linear algorithms
# Logistic regression: fits a logistic function to predict the probability of
# an event; the output lies between 0 and 1, which suits binary classification
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# Linear discriminant analysis: projects high-dimensional samples onto the
# optimal discriminant vector space to extract class information and compress
# the feature space; after projection the classes have maximal separability.
# Like PCA, LDA is widely used for dimensionality reduction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()

# Nonlinear algorithms
# K-nearest neighbors: if most of the k most similar samples in feature space
# belong to one class, the sample belongs to that class too
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
# Naive Bayes: uses an object's prior probability and Bayes' theorem to
# compute its posterior probability over all classes, then picks the class
# with the largest posterior
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
# Classification and regression tree (CART): recursively bisects each
# feature, partitioning the input space into a finite set of cells on which
# the predicted distribution is determined
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
# Support vector machine: analyzes data and recognizes patterns, for both
# classification and regression
from sklearn.svm import SVC
model = SVC()

# Evaluate the chosen model
result = cross_val_score(model, x, y, cv=kfold)
print(result.mean())
```
Spot-checking regression algorithms
```python
from sklearn.model_selection import KFold, cross_val_score

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Linear algorithms
# Linear regression: a statistical method that uses regression analysis to
# determine the quantitative relationship between two or more interdependent
# variables
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Ridge regression: a biased-estimation method designed for collinear data
# (an improved version of ordinary least squares)
from sklearn.linear_model import Ridge
model = Ridge()
# Lasso regression: similar to ridge regression, but the penalty uses
# absolute values instead of squares
from sklearn.linear_model import Lasso
model = Lasso()
# ElasticNet regression: a blend of lasso and ridge regression, useful when
# there are several correlated features
from sklearn.linear_model import ElasticNet
model = ElasticNet()

# Nonlinear algorithms
# K-nearest neighbors: predicts from the nearest samples by distance
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
# Classification and regression tree
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
# Support vector machine
from sklearn.svm import SVR
model = SVR()

# Evaluate the chosen model
scoring = 'neg_mean_squared_error'
result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
print('%.3f' % result.mean())
```
Comparing algorithms
```python
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['SVM'] = SVC()
models['NB'] = GaussianNB()

results = []
for name in models:
    result = cross_val_score(models[name], x, y, cv=kfold)
    results.append(result)
    msg = '%s: %.3f (%.3f)' % (name, result.mean(), result.std())
    print(msg)

# Plot the comparison
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()
```
Grid search for parameter tuning
```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Instantiate the algorithm
model = Ridge()
# Parameters to traverse
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}
# Search for the best parameters with grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(x, y)
# Search results
print('Best score: %.3f' % grid.best_score_)
print('Best parameter: %s' % grid.best_estimator_.alpha)
```
Random search for parameter tuning
```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

model = Ridge()
# Parameter distribution to sample from
param_grid = {'alpha': uniform()}
# Search for the best parameters with randomized search
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          n_iter=100, random_state=7)
grid.fit(x, y)
# Search results
print('Best score: %.3f' % grid.best_score_)
print('Best parameter: %s' % grid.best_estimator_.alpha)
```
Ensemble algorithms
```python
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
num_tree = 100

# Bagging: obtains the result by combining the sub-models' votes
# Bagged decision trees
from sklearn.ensemble import BaggingClassifier
cart = DecisionTreeClassifier()
model = BaggingClassifier(estimator=cart, n_estimators=num_tree, random_state=seed)
# Random forest: builds a forest of many decision trees in a random way, with
# no dependence between the individual trees
from sklearn.ensemble import RandomForestClassifier
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed,
                               max_features=max_features)
# Extra trees: similar to a random forest, with these differences:
# 1. a random forest uses bagging (bootstrap samples), while every tree in
#    extra trees is trained on the same full training set
# 2. a random forest finds the best split feature within a random subset,
#    while extra trees chooses the split feature completely at random
from sklearn.ensemble import ExtraTreesClassifier
max_features = 7
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed,
                             max_features=max_features)

# Boosting: improves the accuracy of weak classifiers, and more generally of
# any given learning algorithm
# AdaBoost: an iterative algorithm that trains different weak classifiers on
# the same training set, then combines them into a stronger final classifier
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
# Stochastic gradient boosting: builds the model stage by stage along the
# gradient of the loss function, using a random subsample at each iteration
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, x, y, cv=kfold)

# Voting: wraps two or more models and combines the sub-models' predictions
# by vote
from sklearn.ensemble import VotingClassifier
models = []
models.append(('logistic', LogisticRegression()))
models.append(('cart', DecisionTreeClassifier()))
models.append(('svm', SVC()))
ensemble_model = VotingClassifier(estimators=models)
result = cross_val_score(ensemble_model, x, y, cv=kfold)
print(result.mean())
```
Implementation: saving and loading the model
```python
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Serialize and deserialize the model with pickle
from pickle import dump
from pickle import load
# Alternatively, serialize and deserialize the model with joblib:
# from joblib import dump, load

test_size = 0.33
seed = 4
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(x_train, y_train)

model_file = 'finalized_model.sav'
with open(model_file, 'wb') as model_f:
    dump(model, model_f)
with open(model_file, 'rb') as model_f:
    loaded_model = load(model_f)
result = loaded_model.score(x_test, y_test)
print('Evaluation result: %.3f%%' % (result * 100))
```
Note: this article was compiled and summarized from 《機器學習 Python實踐》.
If you repost this article, please credit the source: http://www.javashuo.com/article/p-qgwkoowa-ka.html