import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#新學的庫
# 下載MNIST數據集
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
# 設置種子
from sklearn.utils import check_random_state
# 記錄運行時間
import time
# Start of the wall-clock measurement for the whole example.
t0 = time.time()
# Number of samples used for training (passed as train_size to the split).
train_samples = 5000

# as_frame=False makes fetch_openml return NumPy arrays. Recent scikit-learn
# releases return a pandas DataFrame by default for this dataset, for which
# the positional indexing `X[permutation]` below raises a KeyError and
# `X.reshape` does not exist.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Shuffle samples and labels with the same seeded permutation of row indices,
# so the pairing between X and y is preserved.
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
# Flatten every sample into a single row of features.
X = X.reshape(X.shape[0], -1)
解釋:我感覺上面這步頗有技巧:permutation 會對 0 到 X.shape[0]-1 的索引進行隨機重排,重排後的數組可以直接當作 Index,用同一組索引同時打亂 X 和 y。
# Report the dimensions of the feature matrix and the label vector,
# one per line.
print(X.shape, y.shape, sep='\n')
這裏將訓練數據劃分爲 5000 筆,而測試數據則爲 10000 筆。
# Draw `train_samples` samples for training and 10000 samples for testing.
split = train_test_split(X, y, train_size=train_samples, test_size=10000)
X_train, X_test, y_train, y_test = split

# Fit the scaler on the training split only, then reuse the fitted scaler
# to transform the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
注:之前在 Pipeline 那個章節已經說過這種轉換,當時沒有理解;現在明白了:先對訓練數據做 fit_transform,之後再用已經擬合好的標準化參數(即訓練數據的均值和方差)對測試數據做 transform。
# L1-penalised logistic regression fitted with the saga solver.
# `multi_class` is intentionally left at its default: with saga the default
# already selects the multinomial loss for multiclass targets, and passing
# the argument explicitly is deprecated in recent scikit-learn releases.
# tol=0.1 is a loose stopping tolerance for the solver.
clf = LogisticRegression(
    C=50. / train_samples,
    penalty='l1',
    solver='saga',
    tol=0.1,
)
clf.fit(X_train, y_train)

# Sparsity: percentage of weight coefficients that are exactly zero.
# The mean of a boolean array is the fraction of True entries.
sparsity = np.mean(clf.coef_ == 0) * 100
# Mean accuracy on the held-out test split.
score = clf.score(X_test, y_test)
# NOTE: a tuned `C_` attribute only exists on LogisticRegressionCV, so there
# is no "best C" value to report for this estimator.
print('L1正則化的的稀疏度:%.2f%%' % sparsity)
print('L1正則化的測試分數:%.4f' % score)
這裏有個編程技巧非常好:對布爾數組求均值 np.mean(clf.coef_==0),就能直接得到值爲 0 的權重所佔的比例。
其中:score 計算的是模型在測試數據上的準確率。
首先,Logistic 迴歸做的是分類的任務,但是輸出的形式與迴歸類似。
本例中有 784 個像素,即 784 個屬性特徵,每一個屬性配一個權重。對於每張照片,每一個像素都有值:
$$ z = w_0 + w_1 x_1 + w_2 x_2 + \cdots + w_{784} x_{784} $$
這裏 coef_ 返回的就是 $w_1,\dots,w_{784}$ 這些權重值(截距 $w_0$ 存放在 intercept_ 中);之後將這個線性迴歸值再通過類似於階梯的判斷函數進行分類(二分類時爲 sigmoid 函數,本例多分類時爲 softmax 函數):
# Work on a copy of the fitted weights (one row per class).
coef = clf.coef_.copy()
print(coef)
# Largest absolute weight; used as a symmetric colour range around zero.
scale = np.abs(coef).max()
print(scale)
print(coef.shape)

# Font settings so that the Chinese labels and the minus sign render
# correctly when the figure is drawn.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One panel per class, laid out on a 2 x 5 grid.
fig, axes = plt.subplots(2, 5, figsize=(14, 7))
for digit, l1_plot in enumerate(axes.ravel()):
    # Show the class's weight vector as a 28 x 28 image.
    weights_image = coef[digit].reshape(28, 28)
    l1_plot.imshow(
        weights_image,
        interpolation='bilinear',
        cmap=plt.cm.RdBu,
        vmin=-scale,
        vmax=scale,
    )
    # Hide the axis ticks.
    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l1_plot.set_xlabel('類別:%i' % digit)
fig.suptitle('分類向量爲')

# Elapsed time is taken before the blocking plt.show() call.
run_time = time.time() - t0
print('該案例運行時間爲:%.3f s' % run_time)
plt.show()