Text classification is one of the core tasks in natural language processing. A key step in this post is vectorizing the text; rather than covering vectorization in depth, we simply apply TF-IDF to turn each document into a vector, and then classify the documents with four methods: SVM, naive Bayes, random forest, and a BP neural network.
The training corpus is stored in an Excel file; each row contains the document's class label and its raw text.
The corpus contains 10,000 documents spread over 10 categories: 體育 (sports), 娛樂 (entertainment), 家居 (home), 房產 (real estate), 教育 (education), 時尚 (fashion), 時政 (politics), 遊戲 (gaming), 科技 (technology), and 財經 (finance).
The overall classification pipeline is as follows:
1. Tokenize the corpus and remove stopwords

```python
import jieba
import pandas as pd

# Load the raw corpus; the file has no header row
data = pd.read_excel('../corpus.xlsx', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load a custom dictionary so jieba keeps domain-specific terms intact
jieba.load_userdict('../dict_out.csv')

# Load the stopword list (one word per line, UTF-8) and also filter plain spaces
with open('../stopwords.dat', encoding='utf-8') as f:
    stopkey = [line.strip() for line in f]
stopkey.append(' ')

# Segment each document with jieba and drop stopwords; non-string cells
# (e.g. numbers) are coerced to str so every row keeps a token entry
tokens = []
for text in data['text']:
    words = [w for w in jieba.cut(str(text)) if w not in stopkey]
    tokens.append(','.join(words).strip())

# Store the comma-joined tokens and save the intermediate result
data['tokens'] = tokens
data.to_excel('1data.xls', header=False, index=False)
```
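As a quick sanity check on the segmentation step, here is a minimal sketch on a made-up sentence (not from the real corpus), assuming jieba is installed:

```python
import jieba

sample = '自然語言處理是人工智能的一個重要方向'
print(','.join(jieba.cut(sample)))
# Typical output (the exact segmentation depends on the loaded dictionaries):
# 自然語言,處理,是,人工智能,的,一個,重要,方向
```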
2. Split the corpus into training and test sets
```python
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Reload the tokenized corpus produced in step 1
data = pd.read_excel('1data.xls', header=None)
data.columns = ['class_label', 'text', 'tokens']

# Encode the string labels as integers; LabelEncoder sorts the class names,
# so taking `categories` from le.classes_ keeps it aligned with the ids
le = preprocessing.LabelEncoder()
data['class_label'] = le.fit_transform(data['class_label'])
categories = list(le.classes_)
print(categories)

# Hold out 20% of the corpus as the test set
X_train, X_test, y_train, y_test = train_test_split(
    data['tokens'], data['class_label'], test_size=0.2, random_state=1)
```
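One design note: LabelEncoder maps each class name to an integer id, and the mapping can be inspected or reversed. A small illustration, assuming the `le` fitted above:

```python
# Integer id for each class name (ids follow the sorted order of le.classes_)
for idx, name in enumerate(le.classes_):
    print(idx, name)

# Map a few predicted integer ids back to their class names
print(le.inverse_transform([0, 3, 9]))
```

Passing `stratify=data['class_label']` to `train_test_split` would additionally keep the class proportions equal across the two splits.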
3. Apply TF-IDF to the tokenized documents, turning each one into a weighted term vector. The underlying theory is covered in many references and is not elaborated here; the gist is that a term's weight grows with its frequency in a document and shrinks with the number of documents that contain it.
```python
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(data):
    """Fit a TF-IDF vectorizer on the training text and return both results."""
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

# Fit the vocabulary on the training set only, then reuse it on the test set
# so the two matrices share the same feature space
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
```
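To make the output concrete, here is a minimal sketch on two made-up comma-joined documents (not from the corpus), assuming a recent scikit-learn that provides `get_feature_names_out`:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['體育,比賽,冠軍', '科技,手機,比賽']
vec = TfidfVectorizer()
m = vec.fit_transform(docs)
print(vec.get_feature_names_out())  # the learned vocabulary
print(m.shape)                      # (2 documents, vocabulary size)
print(m.toarray())                  # '比賽' occurs in both docs, so its idf weight is lower
```

Note that TfidfVectorizer's default token_pattern only keeps tokens of two or more characters, so single-character words in the comma-joined token strings are silently dropped.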
4. Train and test each of the four classifiers and inspect the results
```python
import joblib
import numpy as np
from keras.layers import Activation, Dense
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

def get_metrics(y_test, y_predicted):
    """y_test: ground-truth labels; y_predicted: predicted labels."""
    # precision = TP / (TP + FP), weighted by per-class support
    precision = precision_score(y_test, y_predicted, average='weighted')
    # recall = TP / (TP + FN), weighted by per-class support
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

def report(y_test, y_predicted):
    """Shared evaluation: overall metrics, per-class report, confusion matrix."""
    accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" %
          (accuracy, precision, recall, f1))
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted, target_names=categories))
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, y_predicted))

def BayesClassify():
    clf = MultinomialNB(alpha=0.01)
    clf.fit(X_train_tfidf, y_train)
    joblib.dump(clf, "BayesModel.m")

def BayesTest():
    clf = joblib.load("BayesModel.m")
    report(y_test, clf.predict(X_test_tfidf))

def SVMClassify():
    clf = SVC(gamma=1, kernel='rbf', probability=True)
    clf.fit(X_train_tfidf, y_train)
    joblib.dump(clf, "SVMModel.m")

def SVMTest():
    clf = joblib.load("SVMModel.m")
    report(y_test, clf.predict(X_test_tfidf))

def RandomForestClassify():
    clf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0)
    clf.fit(X_train_tfidf, y_train)
    joblib.dump(clf, "RandomForestModel.m")

def RandomForestTest():
    clf = joblib.load("RandomForestModel.m")
    report(y_test, clf.predict(X_test_tfidf))

def BPClassify(inputPoint):
    # A simple BP (feed-forward) network: 128 ReLU hidden units and a softmax
    # output over the 10 categories (single-label, multi-class)
    net = Sequential()
    net.add(Dense(128, input_shape=(inputPoint,)))
    net.add(Activation('relu'))
    net.add(Dense(len(categories)))
    net.add(Activation('softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy')
    # One-hot encode the integer labels; densify the sparse TF-IDF matrices
    # for Keras (memory-hungry if the vocabulary is very large)
    y_train_onehot = to_categorical(y_train, num_classes=len(categories))
    net.fit(X_train_tfidf.toarray(), y_train_onehot, batch_size=128, epochs=2)
    # Pick the most probable class for each test document
    y_predicted = np.argmax(net.predict(X_test_tfidf.toarray()), axis=1)
    report(y_test, y_predicted)
```
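Assuming the previous steps have all run in the same script (so `X_train_tfidf`, `y_train`, and friends are in scope), the models can be trained and evaluated with a driver like this hypothetical one:

```python
# Train each model, then evaluate it on the held-out 20%
BayesClassify()
BayesTest()

SVMClassify()
SVMTest()

RandomForestClassify()
RandomForestTest()

# The BP network takes the TF-IDF feature count as its input dimension
BPClassify(X_train_tfidf.shape[1])
```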
The final classification results are fairly satisfactory: accuracy and recall are above 90% for each model, with SVM taking somewhat longer to train than the others.