模型驗證

時間 2020-06-17

標籤模型驗證简体版

原文原文鏈接

對分類模型的檢驗

加載數據

 1 %matplotlib notebook  2 import numpy as np  3 import pandas as pd  4 import seaborn as sns  5 import matplotlib.pyplot as plt  6 from sklearn.model_selection import train_test_split  7 from sklearn.datasets import load_digits  8 
 9 dataset = load_digits() 10 X, y = dataset.data, dataset.target 11 #統計每一個種類的個數
12 for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)): 13     print(class_name,class_count)

1 # 進行一個數據之間的轉換 
2 # Negative class (0) is 'not digit 1' 
3 # Positive class (1) is 'digit 1'
4 y_binary_imbalanced = y.copy() 5 y_binary_imbalanced[y_binary_imbalanced != 1] = 0 6 
7 print('Original labels:\t', y[1:30]) 8 print('New binary labels:\t', y_binary_imbalanced[1:30])

Original labels:	 [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
New binary labels:	 [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]

1 #np.bincount:用於統計每一個索引的總個數
2 np.bincount(y_binary_imbalanced)    # Negative class (0) is the most frequent class

array([1615,  182])
（bincount 結果中，索引 0（負類）的個數爲 1615，索引 1（正類）的個數爲 182；在這種情況下，兩個類別的比例嚴重不平衡，即 imbalanced classes）


使用RBF核函數SVM來創建分類模型

1 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 2 
3 # Accuracy of Support Vector Machine classifier
4 from sklearn.svm import SVC 5 
6 svm = SVC(kernel='rbf', C=1).fit(X_train, y_train) 7 svm.score(X_test, y_test)

0.90888888888888886

DummyClassifier 是一個使用簡單規則進行預測的分類器，它可以用作與實際分類器進行比較
的基準，尤其是對於類別不平衡的數據。它不能用於解決實際問題。

1 from sklearn.dummy import DummyClassifier 2 
3 # Negative class (0) is most frequent
4 # 使用 most_frequent 策略（總是預測訓練集中出現頻率最高的類別）來進行擬合
5 dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train) 6 # Therefore the dummy 'most_frequent' classifier always predicts class 0
7 y_dummy_predictions = dummy_majority.predict(X_test) 8 
9 y_dummy_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

1 dummy_majority.score(X_test, y_test)

0.9044444444444445

1 svm = SVC(kernel='linear', C=1).fit(X_train, y_train) 2 svm.score(X_test, y_test)

0.97777777777777775

混淆矩陣

1 from sklearn.metrics import confusion_matrix 2 
3 # Negative class (0) is most frequent
4 dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train) 5 y_majority_predicted = dummy_majority.predict(X_test) 6 #產生混淆矩陣
7 confusion = confusion_matrix(y_test, y_majority_predicted) 8 
9 print('Most frequent class (dummy classifier)\n', confusion)

1 from sklearn.metrics import confusion_matrix 2 
3 # Negative class (0) is most frequent
4 dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train) 5 y_majority_predicted = dummy_majority.predict(X_test) 6 #產生混淆矩陣
7 confusion = confusion_matrix(y_test, y_majority_predicted) 8 
9 print('Most frequent class (dummy classifier)\n', confusion)

Most frequent class (dummy classifier)
 [[407   0]
 [ 43   0]]

1 # produces random predictions w/ same class proportion as training set
2 dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train) 3 y_classprop_predicted = dummy_classprop.predict(X_test) 4 confusion = confusion_matrix(y_test, y_classprop_predicted) 5 
6 print('Random class-proportional prediction (dummy classifier)\n', confusion)

Random class-proportional prediction (dummy classifier)
 [[361  46]
 [ 39   4]]

1 svm = SVC(kernel='linear', C=1).fit(X_train, y_train) 2 svm_predicted = svm.predict(X_test) 3 confusion = confusion_matrix(y_test, svm_predicted) 4 
5 print('Support vector machine classifier (linear kernel, C=1)\n', confusion)

Support vector machine classifier (linear kernel, C=1)
 [[402   5]
 [  5  38]]

1 from sklearn.linear_model import LogisticRegression 2 
3 lr = LogisticRegression().fit(X_train, y_train) 4 lr_predicted = lr.predict(X_test) 5 confusion = confusion_matrix(y_test, lr_predicted) 6 
7 print('Logistic regression classifier (default settings)\n', confusion)

Logistic regression classifier (default settings)
 [[401   6]
 [  6  37]]

1 from sklearn.tree import DecisionTreeClassifier 2 
3 dt = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train) 4 tree_predicted = dt.predict(X_test) 5 confusion = confusion_matrix(y_test, tree_predicted) 6 
7 print('Decision tree classifier (max_depth = 2)\n', confusion)

Decision tree classifier (max_depth = 2)
 [[400   7]
 [ 17  26]]

二元分類的評估

1 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 2 # Accuracy = (TP + TN) / (TP + TN + FP + FN)
3 # Precision = TP / (TP + FP)
4 # Recall = TP / (TP + FN) Also known as sensitivity, or True Positive Rate
5 # F1 = 2 * Precision * Recall / (Precision + Recall) 
6 print('Accuracy: {:.2f}'.format(accuracy_score(y_test, tree_predicted))) 7 print('Precision: {:.2f}'.format(precision_score(y_test, tree_predicted))) 8 print('Recall: {:.2f}'.format(recall_score(y_test, tree_predicted))) 9 print('F1: {:.2f}'.format(f1_score(y_test, tree_predicted)))

Accuracy: 0.95
Precision: 0.79
Recall: 0.60
F1: 0.68

綜合報告

1 # Combined report with all above metrics
2 from sklearn.metrics import classification_report 3 
4 print(classification_report(y_test, tree_predicted, target_names=['not 1', '1']))

              precision    recall  f1-score   support

      not 1       0.96      0.98      0.97       407
          1       0.79      0.60      0.68        43

avg / total       0.94      0.95      0.94       450

1 print('Random class-proportional (dummy)\n', 2       classification_report(y_test, y_classprop_predicted, target_names=['not 1', '1'])) 3 print('SVM\n', 4       classification_report(y_test, svm_predicted, target_names = ['not 1', '1'])) 5 print('Logistic regression\n', 6       classification_report(y_test, lr_predicted, target_names = ['not 1', '1'])) 7 print('Decision tree\n', 8       classification_report(y_test, tree_predicted, target_names = ['not 1', '1']))

Random class-proportional (dummy)
              precision    recall  f1-score   support

      not 1       0.90      0.89      0.89       407
          1       0.08      0.09      0.09        43

avg / total       0.82      0.81      0.82       450

SVM
              precision    recall  f1-score   support

      not 1       0.99      0.99      0.99       407
          1       0.88      0.88      0.88        43

avg / total       0.98      0.98      0.98       450

Logistic regression
              precision    recall  f1-score   support

      not 1       0.99      0.99      0.99       407
          1       0.86      0.86      0.86        43

avg / total       0.97      0.97      0.97       450

Decision tree
              precision    recall  f1-score   support

      not 1       0.96      0.98      0.97       407
          1       0.79      0.60      0.68        43

avg / total       0.94      0.95      0.94       450


Decision functions（決策函數：爲每個測試樣本給出一個預測分數，分數越大表示越傾向於正類）

1 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 2 y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test) 3 y_score_list = list(zip(y_test[0:20], y_scores_lr[0:20])) 4 
5 # show the decision_function scores for first 20 instances
6 y_score_list

[(0, -23.172292973469549),
 (0, -13.542576515500066),
 (0, -21.717588760007864),
 (0, -18.903065133316442),
 (0, -19.733169947138638),
 (0, -9.7463217496747667),
 (1, 5.2327155658831117),
 (0, -19.308012306288916),
 (0, -25.099330209728528),
 (0, -21.824312362996),
 (0, -24.143782750720494),
 (0, -19.578811099762504),
 (0, -22.568371393280199),
 (0, -10.822590225240777),
 (0, -11.907918741521936),
 (0, -10.977026853802803),
 (1, 11.206811164226373),
 (0, -27.644157619807473),
 (0, -12.857692102545419),
 (0, -25.848149140240199)]

#predict_proba()預測爲1的可能性
1 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 2 y_proba_lr = lr.fit(X_train, y_train).predict_proba(X_test) 3 y_proba_list = list(zip(y_test[0:20], y_proba_lr[0:20,1])) 4 
5 # show the probability of positive class for first 20 instances
6 y_proba_list

[(0, 8.6377579220606466e-11),
 (0, 1.3138118599563736e-06),
 (0, 3.6997386039099659e-10),
 (0, 6.1730972504865241e-09),
 (0, 2.6914925394345074e-09),
 (0, 5.8506057771143608e-05),
 (1, 0.99468934644404694),
 (0, 4.1175302368500096e-09),
 (0, 1.2574750894253029e-11),
 (0, 3.3252290754668869e-10),
 (0, 3.269552979937297e-11),
 (0, 3.1407283576084996e-09),
 (0, 1.5800864117150149e-10),
 (0, 1.9943442430612578e-05),
 (0, 6.7368003023859777e-06),
 (0, 1.7089540581641637e-05),
 (1, 0.9999864188091131),
 (0, 9.8694940340196163e-13),
 (0, 2.6059983600823614e-06),
 (0, 5.9469113009063784e-12)]

Precision-recall curves

 1 from sklearn.metrics import precision_recall_curve  2 
 3 precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)  4 closest_zero = np.argmin(np.abs(thresholds))  5 closest_zero_p = precision[closest_zero]  6 closest_zero_r = recall[closest_zero]  7 
 8 plt.figure()  9 plt.xlim([0.0, 1.01]) 10 plt.ylim([0.0, 1.01]) 11 plt.plot(precision, recall, label='Precision-Recall Curve') 12 plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3) 13 plt.xlabel('Precision', fontsize=16) 14 plt.ylabel('Recall', fontsize=16) 15 plt.axes().set_aspect('equal') 16 plt.show()

ROC curves, Area-Under-Curve (AUC)

 1 from sklearn.metrics import roc_curve, auc  2 
 3 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)  4 
 5 y_score_lr = lr.fit(X_train, y_train).decision_function(X_test)  6 fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)  7 roc_auc_lr = auc(fpr_lr, tpr_lr)  8 
 9 plt.figure() 10 plt.xlim([-0.01, 1.00]) 11 plt.ylim([-0.01, 1.01]) 12 plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr)) 13 plt.xlabel('False Positive Rate', fontsize=16) 14 plt.ylabel('True Positive Rate', fontsize=16) 15 plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16) 16 plt.legend(loc='lower right', fontsize=13) 17 plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--') 18 plt.axes().set_aspect('equal') 19 plt.show()

 1 from matplotlib import cm  2 
 3 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)  4 
 5 plt.figure()  6 plt.xlim([-0.01, 1.00])  7 plt.ylim([-0.01, 1.01])  8 for g in [0.01, 0.1, 0.20, 1]:  9     svm = SVC(gamma=g).fit(X_train, y_train) 10     y_score_svm = svm.decision_function(X_test) 11     fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm) 12     roc_auc_svm = auc(fpr_svm, tpr_svm) 13     accuracy_svm = svm.score(X_test, y_test) 14     print("gamma = {:.2f} accuracy = {:.2f} AUC = {:.2f}".format(g, accuracy_svm, 15  roc_auc_svm)) 16     plt.plot(fpr_svm, tpr_svm, lw=3, alpha=0.7, 17              label='SVM (gamma = {:0.2f}, area = {:0.2f})'.format(g, roc_auc_svm)) 18 
19 plt.xlabel('False Positive Rate', fontsize=16) 20 plt.ylabel('True Positive Rate (Recall)', fontsize=16) 21 plt.plot([0, 1], [0, 1], color='k', lw=0.5, linestyle='--') 22 plt.legend(loc="lower right", fontsize=11) 23 plt.title('ROC curve: (1-of-10 digits classifier)', fontsize=16) 24 plt.axes().set_aspect('equal') 25 
26 plt.show()

gamma = 0.01  accuracy = 0.91   AUC = 1.00
gamma = 0.10  accuracy = 0.90   AUC = 0.98
gamma = 0.20  accuracy = 0.90   AUC = 0.66
gamma = 1.00  accuracy = 0.90   AUC = 0.50

對多分類模型的驗證方法

多分類模型的混淆矩陣

 1 dataset = load_digits()  2 X, y = dataset.data, dataset.target  3 X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y, random_state=0)  4 
 5 
 6 svm = SVC(kernel = 'linear').fit(X_train_mc, y_train_mc)  7 svm_predicted_mc = svm.predict(X_test_mc)  8 confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc)  9 df_cm = pd.DataFrame(confusion_mc, 10                      index = [i for i in range(0,10)], columns = [i for i in range(0,10)]) 11 
12 plt.figure(figsize=(5.5,4)) 13 sns.heatmap(df_cm, annot=True) 14 plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, 15  svm_predicted_mc))) 16 plt.ylabel('True label') 17 plt.xlabel('Predicted label') 18 
19 
20 svm = SVC(kernel = 'rbf').fit(X_train_mc, y_train_mc) 21 svm_predicted_mc = svm.predict(X_test_mc) 22 confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc) 23 df_cm = pd.DataFrame(confusion_mc, index = [i for i in range(0,10)], 24                   columns = [i for i in range(0,10)]) 25 
26 plt.figure(figsize = (5.5,4)) 27 sns.heatmap(df_cm, annot=True) 28 plt.title('SVM RBF Kernel \nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, 29  svm_predicted_mc))) 30 plt.ylabel('True label') 31 plt.xlabel('Predicted label');

多分類模型的報告

1 print(classification_report(y_test_mc, svm_predicted_mc))

              precision    recall  f1-score   support

          0       1.00      0.65      0.79        37
          1       1.00      0.23      0.38        43
          2       1.00      0.39      0.56        44
          3       1.00      0.93      0.97        45
          4       0.14      1.00      0.25        38
          5       1.00      0.33      0.50        48
          6       1.00      0.54      0.70        52
          7       1.00      0.35      0.52        48
          8       1.00      0.02      0.04        48
          9       1.00      0.55      0.71        47

avg / total       0.93      0.49      0.54       450

微觀平均指標與宏觀平均指標函數

1 print('Micro-averaged precision = {:.2f} (treat instances equally)'
2       .format(precision_score(y_test_mc, svm_predicted_mc, average = 'micro'))) 3 print('Macro-averaged precision = {:.2f} (treat classes equally)'
4       .format(precision_score(y_test_mc, svm_predicted_mc, average = 'macro')))

Micro-averaged precision = 0.49 (treat instances equally)
Macro-averaged precision = 0.91 (treat classes equally)

1 print('Micro-averaged f1 = {:.2f} (treat instances equally)'
2       .format(f1_score(y_test_mc, svm_predicted_mc, average = 'micro'))) 3 print('Macro-averaged f1 = {:.2f} (treat classes equally)'
4       .format(f1_score(y_test_mc, svm_predicted_mc, average = 'macro')))

Micro-averaged f1 = 0.49 (treat instances equally)
Macro-averaged f1 = 0.54 (treat classes equally)

迴歸模型評估指標優化

 1 %matplotlib notebook  2 import matplotlib.pyplot as plt  3 import numpy as np  4 from sklearn.model_selection import train_test_split  5 from sklearn import datasets  6 from sklearn.linear_model import LinearRegression  7 from sklearn.metrics import mean_squared_error, r2_score  8 from sklearn.dummy import DummyRegressor  9 
10 diabetes = datasets.load_diabetes() 11 
12 X = diabetes.data[:, None, 6] 13 y = diabetes.target 14 
15 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 16 
17 lm = LinearRegression().fit(X_train, y_train) 18 lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train) 19 
20 y_predict = lm.predict(X_test) 21 y_predict_dummy_mean = lm_dummy_mean.predict(X_test) 22 
23 print('Linear model, coefficients: ', lm.coef_) 24 print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, 25  y_predict_dummy_mean))) 26 print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, y_predict))) 27 print("r2_score (dummy): {:.2f}".format(r2_score(y_test, y_predict_dummy_mean))) 28 print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict))) 29 
30 # Plot outputs
31 plt.scatter(X_test, y_test,  color='black') 32 plt.plot(X_test, y_predict, color='green', linewidth=2) 33 plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle = 'dashed', 34          linewidth=2, label = 'dummy') 35 
36 plt.show()

Linear model, coefficients:  [-698.80206267]
Mean squared error (dummy): 4965.13
Mean squared error (linear model): 4646.74
r2_score (dummy): -0.00
r2_score (linear model): 0.06

使用評估指標進行模型選擇

交叉驗證例子

 1 from sklearn.model_selection import cross_val_score  2 from sklearn.svm import SVC  3 
 4 dataset = load_digits()  5 # again, making this a binary problem with 'digit 1' as positive class 
 6 # and 'not 1' as negative class
 7 X, y = dataset.data, dataset.target == 1
 8 clf = SVC(kernel='linear', C=1)  9 
10 # accuracy is the default scoring metric
11 print('Cross-validation (accuracy)', cross_val_score(clf, X, y, cv=5)) 12 # use AUC as scoring metric
13 print('Cross-validation (AUC)', cross_val_score(clf, X, y, cv=5, scoring = 'roc_auc')) 14 # use recall as scoring metric
15 print('Cross-validation (recall)', cross_val_score(clf, X, y, cv=5, scoring = 'recall'))

Cross-validation (accuracy) [ 0.91944444  0.98611111  0.97214485  0.97493036  0.96935933]
Cross-validation (AUC) [ 0.9641871   0.9976571   0.99372205  0.99699002  0.98675611]
Cross-validation (recall) [ 0.81081081  0.89189189  0.83333333  0.83333333  0.83333333]

網格搜索示例

 1 from sklearn.svm import SVC  2 from sklearn.model_selection import GridSearchCV  3 from sklearn.metrics import roc_auc_score  4 
 5 dataset = load_digits()  6 X, y = dataset.data, dataset.target == 1
 7 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)  8 
 9 clf = SVC(kernel='rbf') 10 grid_values = {'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100]} 11 
12 # default metric to optimize over grid parameters: accuracy
13 grid_clf_acc = GridSearchCV(clf, param_grid = grid_values) 14 grid_clf_acc.fit(X_train, y_train) 15 y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test) 16 
17 print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_) 18 print('Grid best score (accuracy): ', grid_clf_acc.best_score_) 19 
20 # alternative metric to optimize over grid parameters: AUC
21 grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc') 22 grid_clf_auc.fit(X_train, y_train) 23 y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) 24 
25 print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc)) 26 print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_) 27 print('Grid best score (AUC): ', grid_clf_auc.best_score_)

Grid best parameter (max. accuracy):  {'gamma': 0.001}
Grid best score (accuracy):  0.996288047513
Test set AUC:  0.999828581224
Grid best parameter (max. AUC):  {'gamma': 0.001}
Grid best score (AUC):  0.99987412783

1 #Evaluation metrics supported for model selection
2 from sklearn.metrics.scorer import SCORERS 3 
4 print(sorted(list(SCORERS.keys())))

['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 
'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 
'median_absolute_error', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_square
d_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_mic
ro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 
'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']

使用數字數據集的雙特徵分類示例

使用不同的評估指標優化分類器

 1 from sklearn.datasets import load_digits  2 from sklearn.model_selection import train_test_split  3 from adspy_shared_utilities import plot_class_regions_for_classifier_subplot  4 from sklearn.svm import SVC  5 from sklearn.model_selection import GridSearchCV  6 
 7 
 8 dataset = load_digits()  9 X, y = dataset.data, dataset.target == 1
10 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 11 
12 # Create a two-feature input vector matching the example plot above
13 # We jitter the points (add a small amount of random noise) in case there are areas
14 # in feature space where many instances have the same features.
15 jitter_delta = 0.25
16 X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta 17 X_twovar_test  = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta 18 
19 clf = SVC(kernel = 'linear').fit(X_twovar_train, y_train) 20 grid_values = {'class_weight':['balanced', {1:2},{1:3},{1:4},{1:5},{1:10},{1:20},{1:50}]} 21 plt.figure(figsize=(9,6)) 22 for i, eval_metric in enumerate(('precision','recall', 'f1','roc_auc')): 23     grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric) 24  grid_clf_custom.fit(X_twovar_train, y_train) 25     print('Grid best parameter (max. {0}): {1}'
26  .format(eval_metric, grid_clf_custom.best_params_)) 27     print('Grid best score ({0}): {1}'
28  .format(eval_metric, grid_clf_custom.best_score_)) 29     plt.subplots_adjust(wspace=0.3, hspace=0.3) 30  plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None, 31                                              None, None,  plt.subplot(2, 2, i+1)) 32     
33     plt.title(eval_metric+'-oriented SVC') 34 plt.tight_layout() 35 plt.show()

Grid best parameter (max. precision): {'class_weight': {1: 2}}
Grid best score (precision): 0.5379994354058584
Grid best parameter (max. recall): {'class_weight': {1: 50}}
Grid best score (recall): 0.921184706893106
Grid best parameter (max. f1): {'class_weight': {1: 3}}
Grid best score (f1): 0.5079935126308859
Grid best parameter (max. roc_auc): {'class_weight': {1: 20}}
Grid best score (roc_auc): 0.8889416320163174

默認SVC分類器的精確率-召回率曲線（平衡類別權重）

 1 from sklearn.model_selection import train_test_split  2 from sklearn.metrics import precision_recall_curve  3 from adspy_shared_utilities import plot_class_regions_for_classifier  4 from sklearn.svm import SVC  5 
 6 dataset = load_digits()  7 X, y = dataset.data, dataset.target == 1
 8 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)  9 
10 # create a two-feature input vector matching the example plot above
11 jitter_delta = 0.25
12 X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta 13 X_twovar_test  = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta 14 
15 clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train, y_train) 16 
17 y_scores = clf.decision_function(X_twovar_test) 18 
19 precision, recall, thresholds = precision_recall_curve(y_test, y_scores) 20 closest_zero = np.argmin(np.abs(thresholds)) 21 closest_zero_p = precision[closest_zero] 22 closest_zero_r = recall[closest_zero] 23 
24 plot_class_regions_for_classifier(clf, X_twovar_test, y_test) 25 plt.title("SVC, class_weight = 'balanced', optimized for accuracy") 26 plt.show() 27 
28 plt.figure() 29 plt.xlim([0.0, 1.01]) 30 plt.ylim([0.0, 1.01]) 31 plt.title ("Precision-recall curve: SVC, class_weight = 'balanced'") 32 plt.plot(precision, recall, label = 'Precision-Recall Curve') 33 plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3) 34 plt.xlabel('Precision', fontsize=16) 35 plt.ylabel('Recall', fontsize=16) 36 plt.axes().set_aspect('equal') 37 plt.show() 38 print('At zero threshold, precision: {:.2f}, recall: {:.2f}'
39       .format(closest_zero_p, closest_zero_r))

At zero threshold, precision: 0.22, recall: 0.74

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。