Notes : Chapter 3

Chapter 3 - Classification

 

 

 

MNIST

 
  • MNIST is a dataset of 70,000 small images of handwritten digits
  • "Hello World" of Machine Learning
In [1]:
# fetch MNIST, 
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
# The download from mldata.org kept failing, so download mnist-original.mat manually
# (search mldata.org / Google) and place it in ~/scikit_learn_data/mldata/.
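If fetch_mldata keeps failing (it was removed from recent scikit-learn releases), a minimal alternative sketch, assuming scikit-learn >= 0.22, is to pull the same data from OpenML. Note that fetch_openml returns the labels as strings and the rows in a different order than mnist-original.mat.

# Alternative fetch (assumes scikit-learn >= 0.22), not part of the original run:
import numpy as np
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"].astype(np.uint8)  # labels come back as strings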
 
  • A DESCR key describing the dataset
  • A data key containing an array with one row per instance and one column per feature
  • A target key containing an array with the labels
In [2]:
X, y = mnist["data"],mnist["target"]
print(X.shape,y.shape) # 784 = 28 x 28 pixels, each valued 0 (white) to 255 (black)
 
(70000, 784) (70000,)
In [3]:
#show
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

some_digit = X[12345]
some_digit_image = some_digit.reshape(28,28)

plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis('off')
plt.show()
 
In [4]:
# EXTRA
import numpy as np
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances] # reshape each instance into a 28x28 image
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))      # concatenate this row's images horizontally
    image = np.concatenate(row_images, axis=0)  
    plt.imshow(image, cmap = matplotlib.cm.binary, **options)
    plt.axis("off")

plt.figure(figsize=(9,9))
example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]  # pick 100 sample digits spread across the training set
plot_digits(example_images, images_per_row=10)
plt.show()
 
In [5]:
y[12345]
Out[5]:
1.0
 
  • shuffle the training set, so that
    • all cross-validation folds are similar (none is missing some digits)
    • algorithms that are sensitive to the order of the training instances do not see many similar instances in a row, which makes them perform poorly
In [6]:
import numpy as np
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
 

Train a Binary Classifier

 
  • Deciding whether an image is a particular digit is a binary classification problem, e.g., 5 vs. not-5.
  • Stochastic Gradient Descent (SGD) classifier
    • handles training instances independently, one at a time (well suited to online learning)
In [7]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state = 42)
sgd_clf.fit(X_train, y_train_5)

sgd_clf.predict([X[36000]])
Out[7]:
array([ True], dtype=bool)
 

Performance Measures

 

Measuring Accuracy Using Cross-Validation

  1. When you need more control over the cross-validation process than cross_val_score offers, implement it yourself with StratifiedKFold
In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
#StratifiedKFold performs stratified sampling
skfolds = StratifiedKFold(n_splits=3, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train_5[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train_5[test_index])
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct/len(y_pred))
 
0.953
0.9525
0.95515
In [9]:
# use cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')
Out[9]:
array([ 0.953  ,  0.9525 ,  0.95515])
 

This does not mean the classifier is good: even a dumb classifier that always predicts not-5 reaches about 90% accuracy under cross-validation, because only about 10% of the images are 5s.

In [10]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)
    
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring='accuracy')
Out[10]:
array([ 0.90825,  0.9112 ,  0.9095 ])
 

Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, y_train_pred)
Out[11]:
array([[53207,  1372],
       [ 1415,  4006]])
In [12]:
y_train_perfect_predictions = y_train_5
confusion_matrix(y_train_5, y_train_perfect_predictions)
Out[12]:
array([[54579,     0],
       [    0,  5421]])
 

Precision and Recall

In [13]:
from sklearn.metrics import precision_score, recall_score

print(precision_score(y_train_5, y_train_pred))
print(recall_score(y_train_5, y_train_pred))
 
0.744886574935
0.738978048331
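These two scores follow directly from the confusion matrix above (TN=53207, FP=1372, FN=1415, TP=4006); a quick sanity check:

# precision = TP / (TP + FP), recall = TP / (TP + FN)
tp, fp, fn = 4006, 1372, 1415
print(tp / (tp + fp))  # ~0.745, matches precision_score
print(tp / (tp + fn))  # ~0.739, matches recall_score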
In [14]:
# F1 is the harmonic mean of precision and recall
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
Out[14]:
0.7419205481989074
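A quick check of the harmonic mean, plugging in the precision and recall printed above:

# F1 = 2 / (1/p + 1/r) = 2 * p * r / (p + r)
p, r = 0.744886574935, 0.738978048331
print(2 * p * r / (p + r))  # ~0.7419, matches f1_score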
 

F1 favors classifiers that have similar precision and recall, but that is not always what you want. Sometimes you prefer high precision even at the cost of low recall ("better to wrongly block a hundred than let one through"), e.g., rating videos as safe for kids; in other cases you prefer high recall even if precision is low, e.g., catching shoplifters.

 

Precision / Recall Tradeoff

 
  1. lowering the threshold increases recall and reduces precision
  2. sklearn does not let you set the threshold directly, but it gives you access to the decision scores it uses to make predictions
In [15]:
some_digit_index = 36000
some_digit = X[some_digit_index]
y_scores = sgd_clf.decision_function([some_digit])
y_scores
Out[15]:
array([ 45981.28253526])
In [16]:
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
Out[16]:
array([ True], dtype=bool)
In [17]:
threshold = 200000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
Out[17]:
array([False], dtype=bool)
 

Deciding which threshold to use

In [18]:
# get decision scores for every training instance via cross-validation
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
In [19]:
from sklearn.metrics import precision_recall_curve
# compute precision and recall for all possible thresholds
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
In [20]:
# plot precision and recall as functions of the decision threshold
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0,1])
    
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
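Another view of the same tradeoff (a small sketch reusing the precisions and recalls arrays computed above) is to plot precision directly against recall:

# precision vs. recall (PR curve)
plt.plot(recalls, precisions, "b-")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.axis([0, 1, 0, 1])
plt.show()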
 
In [21]:
y_train_pred_90 = (y_scores > 250000)
precision_score(y_train_5, y_train_pred_90)
Out[21]:
0.96514161220043571
In [22]:
recall_score(y_train_5, y_train_pred_90)
Out[22]:
0.32687695997048516
 

Just set a high enough threshold and you can create a classifier with virtually any precision you want (but recall may become very low).

 

ROC (receiver operating characteristic) curve: plots the true positive rate against the false positive rate
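Here TPR = recall = TP / (TP + FN), and FPR = FP / (FP + TN), the fraction of negative instances incorrectly classified as positive (equal to 1 − TNR, where TNR is the specificity).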

In [23]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
plot_roc_curve(fpr, tpr)
plt.show()
 
In [24]:
# compute the area under the curve(AUC)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)
Out[24]:
0.9568006259068953
 
  1. prefer the PR curve when the positive class is rare or when you care more about the false positives than the false negatives
  2. otherwise use the ROC curve and the ROC AUC score
  3. sklearn classifiers provide either a decision_function() or a predict_proba() method (which returns an array with one row per instance and one column per class, each containing the probability that the instance belongs to that class)
In [25]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state = 42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
In [26]:
# use the probability of the positive class as the score
y_scores_forest = y_probas_forest[:,1]
fprs_forest, tprs_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
In [27]:
plt.plot(fpr, tpr, 'b:', label="SGD")
plot_roc_curve(fprs_forest, tprs_forest, "Random forest")
plt.legend(loc='lower right')
plt.show()
 
 
In [28]:
roc_auc_score(y_train_5, y_scores_forest)
Out[28]:
0.99114321301880992
 
  1. how to train a binary classifier
  2. how to choose the appropriate metric for the task
  3. how to evaluate classifiers using cross-validation
  4. how to select the precision/recall tradeoff that fits your needs, and compare models using ROC curves and ROC AUC scores
 

Multiclass Classification

 
  1. Some algorithms can handle multiple classes directly.
  2. Others are strictly binary, but there are strategies to do multiclass classification with several binary classifiers:
    1. Train one binary classifier per class; to classify an image, get a decision score from each classifier and pick the class with the highest score (one-versus-all, OvA).
    2. Train one binary classifier for every pair of classes (1 vs 2, 1 vs 3, ..., 9 vs 8, ...), N(N-1)/2 classifiers in total (one-versus-one, OvO).
    3. OvO is preferred for algorithms that scale poorly with the size of the training set (e.g., SVM); for most other algorithms OvA is preferred.
    4. When you use a binary classifier for a multiclass task, sklearn automatically runs OvA (or OvO for SVMs).
In [29]:
#try SGDClassifier
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
Out[29]:
array([ 5.])
In [30]:
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
Out[30]:
array([[-305117.56076994, -572405.6562905 , -386686.20587505,
        -198578.92561098, -312977.5748752 ,   45981.28253526,
        -752588.92027703, -425193.41816061, -692575.39314386,
        -732446.97820597]])
In [31]:
np.argmax(some_digit_scores)
Out[31]:
5
In [32]:
sgd_clf.classes_
Out[32]:
array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])
In [33]:
sgd_clf.classes_[5]  # conveniently, the class at index 5 is the digit 5
Out[33]:
5.0
In [34]:
#force sklearn to use OvO or OvA: use OneVsOneClassifier or OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

ovo_clf=OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
Out[34]:
array([ 5.])
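OneVsRestClassifier forces OvA in the same way; a minimal sketch, not run in the original notes (for the OvO classifier above, len(ovo_clf.estimators_) is 45, one classifier per pair of digits):

# force one-versus-all instead: 10 binary classifiers, one per digit
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

ovr_clf = OneVsRestClassifier(SGDClassifier(random_state=42))
ovr_clf.fit(X_train, y_train)
ovr_clf.predict([some_digit])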
In [35]:
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
Out[35]:
array([ 5.])
In [36]:
forest_clf.predict_proba([some_digit])
Out[36]:
array([[ 0. ,  0. ,  0. ,  0. ,  0.1,  0.9,  0. ,  0. ,  0. ,  0. ]])
In [37]:
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
Out[37]:
array([ 0.87037592,  0.88059403,  0.84912737])
In [38]:
# simply scaling the inputs with StandardScaler improves accuracy
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
Out[38]:
array([ 0.91071786,  0.90684534,  0.91233685])
 

Error Analysis

 
  1. look at the confusion matrix
  2. plot the errors:
    1. divide each value in the confusion matrix by the number of images in the corresponding class
    2. fill the diagonal with zeros to keep only the errors
In [39]:
#1-1
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
Out[39]:
array([[5729,    2,   23,    8,   11,   50,   49,    9,   40,    2],
       [   1, 6505,   42,   21,    6,   40,    6,   10,  100,   11],
       [  53,   41, 5336,  102,   81,   26,   84,   67,  154,   14],
       [  45,   45,  140, 5359,    6,  220,   36,   49,  134,   97],
       [  16,   30,   38,   10, 5361,   11,   50,   33,   77,  216],
       [  73,   41,   34,  184,   73, 4588,  104,   32,  195,   97],
       [  31,   28,   51,    1,   51,   86, 5613,    8,   49,    0],
       [  22,   21,   70,   30,   55,   12,    5, 5815,   18,  217],
       [  49,  173,   74,  151,   14,  153,   55,   21, 5021,  140],
       [  43,   37,   25,   85,  166,   32,    3,  204,   83, 5271]])
In [40]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
 
 

Most images are on the main diagonal, which means they were classified correctly. The cell for 5s is slightly darker, which could mean there are fewer 5s in the dataset or that the classifier does not perform as well on 5s.

In [41]:
#2-1
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx/row_sums
In [42]:
#2-2
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
 
 
  1. rows represent the actual classes, columns the predicted classes
  2. effort could go into improving 8s and 9s, and the 3/5 confusion, e.g., by engineering extra features (see the sketch after this list), such as:
    1. counting the number of closed loops in each digit
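A rough sketch of the closed-loop idea (hypothetical preprocessing using scipy.ndimage, which is not used elsewhere in these notes): binarize the image, label the connected regions of the background, and subtract one for the outer background; an 8 has two loops, 0/6/9 have one, 3 and 5 have none.

# count closed loops in a digit image (hypothetical extra feature)
from scipy import ndimage

def count_loops(digit_row, threshold=128):
    background = digit_row.reshape(28, 28) < threshold  # True where the pixel is background
    labels, n_regions = ndimage.label(background)       # connected background regions
    return n_regions - 1                                # ignore the outer background

print(count_loops(X_train[0]))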
In [43]:
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

plt.figure(figsize=(8,8))
plt.subplot(221)
plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222)
plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223)
plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224)
plot_digits(X_bb[:25], images_per_row=5)
plt.show()
 
 

Look at how the top horizontal stroke joins the bottom arc: when the junction is shifted a little to the left the classifier tends to predict 5, and when it is shifted to the right it predicts 3. Since the SGDClassifier is a linear model that simply assigns one weight per pixel per class, shifted or rotated digits are easily confused.

 

Multilabel Classification

In [44]:
from sklearn.neighbors import KNeighborsClassifier  #support multilabel classification

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
Out[44]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
In [45]:
knn_clf.predict([some_digit])
Out[45]:
array([[False,  True]], dtype=bool)
In [46]:
# evaluate the multilabel classifier with the macro-averaged F1 score
from sklearn.metrics import f1_score

y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)

f1_score(y_multilabel, y_train_knn_pred, average='macro')
 
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
KeyboardInterrupt: 
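This evaluation is very slow on the full training set (hence the interrupt above). A hedged workaround sketch: evaluate on a smaller subset and parallelize the folds (the 5,000-instance cutoff is arbitrary):

# quicker, approximate evaluation on a subset, with parallel folds
n = 5000  # arbitrary subset size
y_knn_pred_small = cross_val_predict(knn_clf, X_train[:n], y_multilabel[:n], cv=3, n_jobs=-1)
f1_score(y_multilabel[:n], y_knn_pred_small, average='macro')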
 

Multioutput Classification

In [53]:
import numpy.random as rnd
noise = rnd.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
y_train_mod = X_train
noise = rnd.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_test_mod = X_test
In [54]:
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = matplotlib.cm.binary,
               interpolation="nearest")
    plt.axis("off")
    
some_index = 5500
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
plt.show()
 
In [57]:
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit=knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)
 