中文文本分類

        將文本進行分類是自然語言處理當中最主要的工作之一。文本處理中很重要的一項工作就是對文本進行向量化,本文不做詳細的介紹,只採用TF-IDF的方法對文本進行向量化,然後分別採用SVM、Bayes、RandomForest、BP四種方法對文本進行分類。

訓練語料是在excel中存儲的,格式見下圖:

# Load the raw corpus: column 0 = class label, column 1 = text.
# (`encoding=` was removed from pd.read_excel in modern pandas — Excel files
# carry their own encoding.)
data = pd.read_excel('../corpus.xlsx', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load a user-defined dictionary so domain terms are segmented as one token.
jieba.load_userdict('../dict_out.csv')
# Load the stop-word list; the file is read as bytes, hence the decode.
with open("../stopwords.dat", "rb") as f:
    stopkey = [line.strip().decode('utf-8') for line in f]
stopkey.append(" ")
list1 = []
list2 = []
for i in data["text"]:
    try:
        jiebas = jieba.cut(i)
        jiebas = [w for w in jiebas if w not in stopkey]
        fenci_key = ",".join(jiebas)
    except AttributeError:
        # Non-string cell (e.g. a numeric value): record an empty result so
        # list1 stays aligned row-for-row with `data` (the original used
        # `finally` + `continue`, which appended stale or undefined values).
        jiebas, fenci_key = [], ""
    list2.append(jiebas)
    list1.append(fenci_key.strip())
# Write the segmentation results back into data and persist them.
data["tokens"] = list1
data.to_excel("1data.xls", header=None, index=False)

該文本訓練庫共有10000條數據,分爲:'體育', '娛樂', '家居', '房產', '教育', '時尚', '時政', '遊戲', '科技', '財經'這10個類別。

本文的分類主要流程如下:

  1. 對文本內容進行分詞處理,刪除停用詞,只留下有意義的詞語。
# Load the raw corpus: column 0 = class label, column 1 = text.
# (`encoding=` was removed from pd.read_excel in modern pandas — Excel files
# carry their own encoding.)
data = pd.read_excel('../corpus.xlsx', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load a user-defined dictionary so domain terms are segmented as one token.
jieba.load_userdict('../dict_out.csv')
# Load the stop-word list; the file is read as bytes, hence the decode.
with open("../stopwords.dat", "rb") as f:
    stopkey = [line.strip().decode('utf-8') for line in f]
stopkey.append(" ")
list1 = []
list2 = []
for i in data["text"]:
    try:
        jiebas = jieba.cut(i)
        jiebas = [w for w in jiebas if w not in stopkey]
        fenci_key = ",".join(jiebas)
    except AttributeError:
        # Non-string cell (e.g. a numeric value): record an empty result so
        # list1 stays aligned row-for-row with `data` (the original used
        # `finally` + `continue`, which appended stale or undefined values).
        jiebas, fenci_key = [], ""
    list2.append(jiebas)
    list1.append(fenci_key.strip())
# Write the segmentation results back into data and persist them.
data["tokens"] = list1
data.to_excel("1data.xls", header=None, index=False)

2.將語料庫分爲訓練集和測試集

# Reload the segmented corpus produced by the preprocessing step.
data = pd.read_excel('1data.xls', header=None)
data.columns = ['class_label', 'text', 'tokens']

# Encode the string labels as integers. Keep the fitted encoder so we can
# recover the class-name order: LabelEncoder assigns codes in SORTED order,
# so `encoder.classes_` (not first-appearance order) is the only ordering
# that is safe to pass as target_names to classification_report later.
encoder = preprocessing.LabelEncoder()
data["class_label"] = encoder.fit_transform(data["class_label"])
categories = list(encoder.classes_)
print(categories)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data["tokens"],
                                                    data["class_label"],
                                                    test_size=0.2,
                                                    random_state=1)

3.對詞組進行TF-IDF處理,將各個詞組轉換成詞向量。具體理論可查看其他相關資料,這裏不再做詳細的闡述

def tfidf(data):
    """Fit a TfidfVectorizer on `data` and return (tfidf_matrix, vectorizer)."""
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

# Text feature extraction. The definition must precede the call (the original
# called tfidf() before defining it — NameError in a top-to-bottom script).
# Fit on the training set only, then reuse the fitted vocabulary/IDF weights
# on the test set to avoid information leakage.
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

4.分別採用上面提到的分類方法進行訓練和測試,並查看測試結果

def get_metrics(y_test, y_predicted):
    """Return (accuracy, precision, recall, f1) for the given predictions.

    y_test: ground-truth labels
    y_predicted: predicted labels

    Precision/recall/F1 are support-weighted averages, which accounts for
    class imbalance across the 10 categories. `pos_label` is dropped from
    the calls: it only applies to binary `average='binary'` scoring and is
    ignored (or rejected by newer scikit-learn) with `average='weighted'`.
    """
    # precision = TP / (TP + FP)
    precision = precision_score(y_test, y_predicted, average='weighted')
    # recall = TP / (TP + FN)
    recall = recall_score(y_test, y_predicted, average='weighted')
    # F1 = harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, average='weighted')
    # accuracy = correct predictions / all predictions
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1
def BayesClassify():
    """Train a multinomial naive Bayes model on the TF-IDF features and persist it."""
    model = MultinomialNB(alpha=0.01)
    model.fit(X_train_tfidf, y_train)
    joblib.dump(model, "BayesModel.m")

def BayesTest():
    """Evaluate the persisted naive Bayes model on the held-out test set."""
    model = joblib.load("BayesModel.m")
    predictions = model.predict(X_test_tfidf)
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy, precision, recall, f1))
    # Per-class evaluation report
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def SVMClassify():
    """Train an RBF-kernel SVM on the TF-IDF features and persist it."""
    model = SVC(gamma=1, kernel='rbf', probability=True)
    model.fit(X_train_tfidf, y_train)
    joblib.dump(model, "SVMModel.m")

def SVMTest():
    """Evaluate the persisted SVM model on the held-out test set."""
    model = joblib.load("SVMModel.m")
    predictions = model.predict(X_test_tfidf)
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy, precision, recall, f1))
    # Per-class evaluation report
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def RandomForestClassify():
    """Train a random forest on the TF-IDF features and persist it.

    Fixes the original's duplicated assignment (`clf_tfidf = clf_tfidf = ...`).
    """
    clf_tfidf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0)
    clf_tfidf.fit(X_train_tfidf, y_train)
    joblib.dump(clf_tfidf, "RandomForestModel.m")

def RandomForestTest():
    """Evaluate the persisted random-forest model on the held-out test set."""
    model = joblib.load("RandomForestModel.m")
    predictions = model.predict(X_test_tfidf)
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy, precision, recall, f1))
    # Per-class evaluation report
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def BPClassify(inputPoint):
    """Train and evaluate a simple feed-forward (BP) network on TF-IDF features.

    inputPoint: dimensionality of the TF-IDF feature vectors.

    Relies on module-level X_train_tfidf, y_train_onehot, X_test_tfidf,
    y_test and categories.
    """
    net = Sequential()
    net.add(Dense(128, input_shape=(inputPoint,)))
    net.add(Activation('relu'))

    # `input_shape` on a non-first layer is ignored by Keras, so it is dropped.
    net.add(Dense(len(categories)))
    # softmax + categorical_crossentropy is the correct pairing for a
    # single-label multi-class problem; the original sigmoid +
    # binary_crossentropy treats each class as an independent binary task.
    net.add(Activation('softmax'))

    net.compile(optimizer='adam', loss='categorical_crossentropy')
    net.fit(X_train_tfidf, y_train_onehot, batch_size=128, epochs=2)

    # Collapse the per-class probabilities to predicted label indices.
    y_predicted = np.argmax(net.predict(X_test_tfidf), axis=1)

    accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy, precision, recall, f1))
    # Per-class evaluation report
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

最終的分類效果較爲理想,準確率和召回率都在90%以上。其中SVM耗時稍長。

文本分類    svm  貝葉斯  隨機森林  神經網絡