import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # sklearn.linear_model.logistic is a removed private path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

df = pd.read_csv('./sms.csv')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'], random_state=11)

# TF-IDF features: fit on the training text only, then transform the test text
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training set
scores = cross_val_score(classifier, X_train, y_train, cv=5)
print('Accuracies: %s' % scores)
print('Mean accuracy: %s' % np.mean(scores))
Accuracies: [ 0.95221027  0.95454545  0.96172249  0.96052632  0.95209581]
Mean accuracy: 0.956220068309
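The roc_curve, auc, and matplotlib imports above are never used in this listing. As a minimal sketch, assuming the labels in sms.csv are encoded as binary 0/1 (1 = spam), the held-out ROC curve could be plotted like this:

# Minimal ROC sketch, assuming binary 0/1 labels; continues the script above
probabilities = classifier.predict_proba(X_test)[:, 1]  # estimated P(label == 1)
fpr, tpr, thresholds = roc_curve(y_test, probabilities)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--')  # chance diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()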
# Precision, recall, and F1 scorers require binary labels encoded as 0/1
precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision')
print('Precision: %s' % np.mean(precisions))
recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
print('Recall: %s' % np.mean(recalls))
f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('F1 score: %s' % np.mean(f1s))
Precision: 0.992542742398
Recall: 0.683605030275
F1 score: 0.809067846627
F1 is the harmonic mean of precision and recall: if precision is 1 but recall is 0, F1 is 0. There are also F0.5 and F2 scores, which weight precision and recall more heavily, respectively. In some scenarios, recall matters even more than precision.
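As a quick illustration (the labels below are made-up toy data), sklearn.metrics.f1_score and fbeta_score compute these directly; beta=0.5 weights precision more heavily and beta=2 weights recall more heavily:

from sklearn.metrics import f1_score, fbeta_score

y_true = [1, 1, 1, 1, 0, 0, 0, 0]  # toy ground truth
y_pred = [1, 1, 0, 0, 0, 0, 0, 1]  # toy predictions: precision 2/3, recall 1/2
print('F1:   %.3f' % f1_score(y_true, y_pred))               # harmonic mean of P and R
print('F0.5: %.3f' % fbeta_score(y_true, y_pred, beta=0.5))  # favours precision
print('F2:   %.3f' % fbeta_score(y_true, y_pred, beta=2))    # favours recall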
Comparison of commonly used classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Synthetic binary classification data: 5000 samples, 100 features (20 informative)
X, y = make_classification(
    n_samples=5000, n_features=100, n_informative=20,
    n_clusters_per_class=2, random_state=11)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

print('Decision tree')
clf = DecisionTreeClassifier(random_state=11)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

print('Random forest')
clf = RandomForestClassifier(n_estimators=10, random_state=11)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

print('Logistic regression')
clf = LogisticRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

print('AdaBoost')
clf = AdaBoostClassifier(n_estimators=50, random_state=11)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

print('KNN')
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

print('SVM')
# The original fit the SVM on the full (X, y), leaking the test set into
# training; fit on the training split only.
clf = SVC(kernel='rbf', C=100, gamma=0.1).fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
Results
Decision tree
              precision    recall  f1-score   support

           0       0.80      0.76      0.78       634
           1       0.76      0.80      0.78       616

    accuracy                           0.78      1250
   macro avg       0.78      0.78      0.78      1250
weighted avg       0.78      0.78      0.78      1250

Random forest
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       634
           1       0.84      0.76      0.80       616

    accuracy                           0.81      1250
   macro avg       0.82      0.81      0.81      1250
weighted avg       0.82      0.81      0.81      1250

Logistic regression
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       634
           1       0.84      0.81      0.83       616

    accuracy                           0.83      1250
   macro avg       0.83      0.83      0.83      1250
weighted avg       0.83      0.83      0.83      1250

AdaBoost
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       634
           1       0.84      0.82      0.83       616

    accuracy                           0.83      1250
   macro avg       0.83      0.83      0.83      1250
weighted avg       0.83      0.83      0.83      1250

KNN
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       634
           1       0.93      0.93      0.93       616

    accuracy                           0.93      1250
   macro avg       0.93      0.93      0.93      1250
weighted avg       0.93      0.93      0.93      1250

SVM
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       634
           1       1.00      1.00      1.00       616

    accuracy                           1.00      1250
   macro avg       1.00      1.00      1.00      1250
weighted avg       1.00      1.00      1.00      1250

Note that the perfect SVM scores are an artifact of the original script fitting the SVM on the full (X, y): the model had already seen every test sample during training. With the corrected fit on the training split only, its scores will be lower.
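Because a single train/test split is noisy and, as noted, a leaky fit can look perfect, here is a hedged sketch reusing cross_val_score from the first section to compare some of the same models on the training folds only (it assumes X_train and y_train from the comparison script above):

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Assumes X_train, y_train from the comparison script above
for name, model in [
        ('Decision tree', DecisionTreeClassifier(random_state=11)),
        ('Random forest', RandomForestClassifier(n_estimators=10, random_state=11)),
        ('SVM (rbf)', SVC(kernel='rbf', C=100, gamma=0.1))]:
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print('%s: mean CV accuracy %.3f (+/- %.3f)' % (name, np.mean(scores), np.std(scores)))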