import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
font = FontProperties(fname='/Library/Fonts/Heiti.ttc')
# load the handwritten digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform')
knn.score(X_train, y_train)
0.9866369710467706
pca = PCA(n_components=2)
pca.fit(X_train)
X_train_reduction = pca.transform(X_train)
X_test_reduction = pca.transform(X_test)
begin = time.time()
knn = KNeighborsClassifier()
knn.fit(X_train_reduction, y_train)
end = time.time()
print('Training time: {}'.format(end - begin))
Training time: 0.0011568069458007812
knn.score(X_test_reduction, y_test)
0.6266666666666667
pca.explained_variance_ratio_
array([0.14566794, 0.13448185])
pca = PCA(n_components=X_train.shape[1])
pca.fit(X_train)
pca.explained_variance_ratio_
array([1.45667940e-01, 1.34481846e-01, 1.19590806e-01, 8.63833775e-02, 5.90548655e-02, 4.89518409e-02, 4.31561171e-02, 3.63466115e-02, 3.41098378e-02, 3.03787911e-02, 2.38923779e-02, 2.24613809e-02, 1.81136494e-02, 1.81125785e-02, 1.51771863e-02, 1.39510696e-02, 1.32079987e-02, 1.21938163e-02, 9.95264723e-03, 9.39755156e-03, 9.02644073e-03, 7.96537048e-03, 7.64762648e-03, 7.10249621e-03, 7.04448539e-03, 5.89513570e-03, 5.65827618e-03, 5.08671500e-03, 4.97354466e-03, 4.32832415e-03, 3.72181436e-03, 3.42451450e-03, 3.34729452e-03, 3.20924019e-03, 3.03301292e-03, 2.98738373e-03, 2.61397965e-03, 2.28591480e-03, 2.21699566e-03, 2.14081498e-03, 1.86018920e-03, 1.57568319e-03, 1.49171335e-03, 1.46157540e-03, 1.17829304e-03, 1.06805854e-03, 9.41934676e-04, 7.76116004e-04, 5.59378443e-04, 3.65463486e-04, 1.71625943e-04, 8.78242589e-05, 5.20662123e-05, 5.19689192e-05, 4.16826522e-05, 1.50475650e-05, 4.42917130e-06, 3.53610879e-06, 7.14554374e-07, 6.80092943e-07, 3.48757835e-07, 8.17776361e-34, 8.17776361e-34, 7.97764241e-34])
plt.plot([i for i in range(X_train.shape[1])],
         [np.sum(pca.explained_variance_ratio_[:i+1]) for i in range(X_train.shape[1])], c='r')
plt.xlabel('first n principal components', fontproperties=font)
plt.ylabel('cumulative explained variance ratio of the first n components', fontproperties=font)
plt.show()
The figure above shows how to choose the number of principal components (equivalently, the fraction of explained variance to keep) that balances model accuracy against training speed.
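As a minimal sketch (reusing the full-dimensional pca fitted above), the smallest number of components needed to reach a target variance ratio can also be computed directly rather than read off the plot:

# cumulative explained variance ratio of the first n components
cum_ratio = np.cumsum(pca.explained_variance_ratio_)
# smallest n such that the first n components explain at least 80% of the variance
n = int(np.argmax(cum_ratio >= 0.80)) + 1
print('components needed for 80% variance: {}'.format(n))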
# PCA(0.80) keeps the smallest number of components whose explained variance ratio sums to at least 80%
pca = PCA(0.80)
pca.fit(X_train)
PCA(copy=True, iterated_power='auto', n_components=0.8, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
pca.n_components_
13
X_train_reduction = pca.transform(X_train)
X_test_reduction = pca.transform(X_test)
begin = time.time()
knn = KNeighborsClassifier()
knn.fit(X_train_reduction, y_train)
end = time.time()
print('Training time: {}'.format(end - begin))
Training time: 0.004214048385620117
knn.score(X_test_reduction, y_test)
0.9844444444444445
PCA is useful for dimensionality reduction, but reducing too aggressively hurts: with only 2 components the model's accuracy is very low, while keeping enough components to explain about 80% of the variance barely affects accuracy. Because this dataset is small, the main benefit of reduction, faster model training, is not apparent here; in industrial settings, however, PCA always trades away some information (and thus some model accuracy) in exchange for faster training.
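As a rough illustration of that trade-off, the sketch below (reusing the variables defined above; the absolute timings will be tiny on this small dataset) times the prediction phase, which is the expensive part of KNN, on the original 64-dimensional features and on the 13-dimensional PCA features:

# KNN on the full 64-dimensional features
knn_full = KNeighborsClassifier()
knn_full.fit(X_train, y_train)
begin = time.time()
score_full = knn_full.score(X_test, y_test)
print('64 dims: accuracy={:.4f}, predict time={:.4f}s'.format(score_full, time.time() - begin))

# KNN on the reduced PCA features
knn_pca = KNeighborsClassifier()
knn_pca.fit(X_train_reduction, y_train)
begin = time.time()
score_pca = knn_pca.score(X_test_reduction, y_test)
print('13 dims: accuracy={:.4f}, predict time={:.4f}s'.format(score_pca, time.time() - begin))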