數據來自UCI機器學習倉庫中的垃圾信息數據集。
數據可從 http://archive.ics.uci.edu/ml/datasets/sms+spam+collection 下載。
轉成csv後載入數據:
# Configure matplotlib so the Chinese plot labels render: select the SimHei
# font and disable the Unicode minus sign for axis tick labels.
import matplotlib
matplotlib.rcParams['font.sans-serif'] = [u'simHei']
matplotlib.rcParams['axes.unicode_minus'] = False

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package path; the private
# `sklearn.linear_model.logistic` module is no longer importable in
# current scikit-learn releases.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# The CSV is read without a header row, so columns are addressed by position;
# column 0 holds the label ('spam' or 'ham').
df = pd.read_csv('data/SMSSpamCollection.csv', header=None)
# Call head() so the first rows are printed rather than the bound method object.
print(df.head())
print("垃圾郵件個數:%s" % df[df[0] == 'spam'][0].count())
print("正常郵件個數:%s" % df[df[0] == 'ham'][0].count())
垃圾郵件個數:747
正常郵件個數:4825
建立TfidfVectorizer實例,將訓練文本和測試文本都進行轉換:
# Column 0 of the frame supplies the labels, column 1 the raw message texts.
y = df[0].values
X = df[1].values

# Split into train and test parts using train_test_split's default settings.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y)

# Fit the TF-IDF vectorizer on the training texts only, then apply that same
# fitted transform to the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
創建邏輯迴歸模型,進行訓練和預測:
# Train a logistic-regression classifier (default settings) on the TF-IDF
# features and predict labels for the test features.
LR = LogisticRegression()
LR.fit(X_train, y_train)
predictions = LR.predict(X_test)

# Print the first five predictions, each next to the raw message it belongs to.
for prediction, message in zip(predictions[:5], X_test_raw):
    print("預測爲 %s ,信件爲 %s" % (prediction, message))
預測爲 ham ,信件爲 Send to someone else :-) 預測爲 ham ,信件爲 Easy ah?sen got selected means its good.. 預測爲 ham ,信件爲 Sorry da. I gone mad so many pending works what to do. 預測爲 ham ,信件爲 What not under standing. 預測爲 spam ,信件爲 SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info
二元分類性能指標:混淆矩陣
# In[2] Binary-classification metrics: confusion matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Compare the true test labels with the predictions.
# The result gets its own name so the imported confusion_matrix function is
# not overwritten; otherwise re-running this cell would fail because the name
# would refer to an array instead of the function.
cm = confusion_matrix(y_test, predictions)
print(cm)

# Draw the matrix as an image; the y axis is labelled as the true value and
# the x axis as the predicted value.
plt.matshow(cm)
plt.title("混淆矩陣")
plt.colorbar()
plt.ylabel("真實值")
plt.xlabel("預測值")
plt.show()
[[1217 1]
[ 52 123]]
準確率、召回率、精準率、F1值
# In[3] Report precision, recall, f1-score and support
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

# Imported here for the ROC-curve step that follows this cell.
from sklearn.metrics import roc_curve, auc

# Accuracy: 5-fold cross-validation on the training data with the default scorer.
scores = cross_val_score(LR, X_train, y_train, cv=5)
print("準確率爲: ", scores)
print("平均準確率爲: ", np.mean(scores))

# The labels are converted to integers before the precision/recall/f1 scorers
# are used below.
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y_train_n = class_le.fit_transform(y_train)
# Reuse the mapping learned from the training labels. Refitting on the test
# labels could assign different integers if the two label sets differ;
# transform() raises instead when it meets a label it has not seen.
y_test_n = class_le.transform(y_test)

# Precision
precision = cross_val_score(LR, X_train, y_train_n, cv=5, scoring='precision')
print("平均精準率爲: ", np.mean(precision))

# Recall
recall = cross_val_score(LR, X_train, y_train_n, cv=5, scoring='recall')
print("平均召回率爲: ", np.mean(recall))

# F1 score
f1 = cross_val_score(LR, X_train, y_train_n, cv=5, scoring='f1')
print("平均F1值爲: ", np.mean(f1))
準確率爲: [0.96654719 0.95459976 0.95449102 0.9508982 0.96047904] 平均準確率爲: 0.9574030433756144 平均精準率爲: 0.9906631114805584 平均召回率爲: 0.6956979405034325 平均F1值爲: 0.8162874707978786
畫出ROC曲線,AUC爲ROC曲線以下部分的面積:
# In[4] ROC curve (y_test_n holds the numeric test labels)
predictions_pro = LR.predict_proba(X_test)

# Column 1 of the predicted probabilities is used as the score passed to roc_curve.
false_positive_rate, recall, thresholds = roc_curve(y_test_n, predictions_pro[:, 1])
roc_auc = auc(false_positive_rate, recall)

plt.title("受試者操做特徵曲線(ROC)")
# Blue ROC curve, with the AUC value shown in the legend.
plt.plot(false_positive_rate, recall, 'b', label='AUC = % 0.2f' % roc_auc)
plt.legend(loc='lower right')
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlabel('假陽性率')
plt.ylabel('召回率')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.show()
全部代碼:
# -*- coding: utf-8 -*-
# Spam/ham SMS classification: TF-IDF features, logistic regression, and
# binary-classification metrics (confusion matrix, cross-validated scores, ROC).

# Configure matplotlib so the Chinese plot labels render: select the SimHei
# font and disable the Unicode minus sign for axis tick labels.
import matplotlib
matplotlib.rcParams['font.sans-serif'] = [u'simHei']
matplotlib.rcParams['axes.unicode_minus'] = False

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package path; the private
# `sklearn.linear_model.logistic` module is no longer importable in
# current scikit-learn releases.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# The CSV is read without a header row, so columns are addressed by position:
# column 0 holds the label ('spam' or 'ham'), column 1 the message text.
df = pd.read_csv('data/SMSSpamCollection.csv', header=None)
# Call head() so the first rows are printed rather than the bound method object.
print(df.head())
print("垃圾郵件個數:%s" % df[df[0] == 'spam'][0].count())
print("正常郵件個數:%s" % df[df[0] == 'ham'][0].count())

# In[1] Split, vectorise, train and predict
X = df[1].values
y = df[0].values
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y)

# Fit the TF-IDF vectorizer on the training texts only, then apply that same
# fitted transform to the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

LR = LogisticRegression()
LR.fit(X_train, y_train)
predictions = LR.predict(X_test)

# Print the first five predictions, each next to the raw message it belongs to.
for i, prediction in enumerate(predictions[:5]):
    print("預測爲 %s ,信件爲 %s" % (prediction, X_test_raw[i]))

# In[2] Binary-classification metrics: confusion matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Compare the true test labels with the predictions.
# The result gets its own name so the imported confusion_matrix function is
# not overwritten; otherwise re-running this cell would fail because the name
# would refer to an array instead of the function.
cm = confusion_matrix(y_test, predictions)
print(cm)
plt.matshow(cm)
plt.title("混淆矩陣")
plt.colorbar()
plt.ylabel("真實值")
plt.xlabel("預測值")
plt.show()

# In[3] Report precision, recall, f1-score and support
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

from sklearn.metrics import roc_curve, auc

# Accuracy: 5-fold cross-validation on the training data with the default scorer.
scores = cross_val_score(LR, X_train, y_train, cv=5)
print("準確率爲: ", scores)
print("平均準確率爲: ", np.mean(scores))

# The labels must be converted to integers for the scorers used below.
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y_train_n = class_le.fit_transform(y_train)
# Reuse the mapping learned from the training labels. Refitting on the test
# labels could assign different integers if the two label sets differ;
# transform() raises instead when it meets a label it has not seen.
y_test_n = class_le.transform(y_test)

# Precision
precision = cross_val_score(LR, X_train, y_train_n, cv=5, scoring='precision')
print("平均精準率爲: ", np.mean(precision))

# Recall
recall = cross_val_score(LR, X_train, y_train_n, cv=5, scoring='recall')
print("平均召回率爲: ", np.mean(recall))

# F1 score
f1 = cross_val_score(LR, X_train, y_train_n, cv=5, scoring='f1')
print("平均F1值爲: ", np.mean(f1))

# In[4] ROC curve (y_test_n holds the numeric test labels)
predictions_pro = LR.predict_proba(X_test)
# Column 1 of the predicted probabilities is used as the score passed to roc_curve.
false_positive_rate, recall, thresholds = roc_curve(y_test_n, predictions_pro[:, 1])
roc_auc = auc(false_positive_rate, recall)
plt.title("受試者操做特徵曲線(ROC)")
plt.plot(false_positive_rate, recall, 'b', label='AUC = % 0.2f' % roc_auc)
plt.legend(loc='lower right')
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('假陽性率')
plt.ylabel('召回率')
plt.show()