Training and testing a Chinese word segmenter on the 1998 People's Daily corpus.
The 1998 People's Daily corpus (1998-01-105-帶音.txt) is used: 80% of the data serves as the training set and 20% as the validation set.
Jupyter Notebook, Python 3
Implements word segmentation with the forward maximum matching (FMM) algorithm.
https://github.com/YanqiangWang/NLP-Summer-Course
1. Preprocess the corpus: remove the leading paragraph IDs and the part-of-speech tags.
# Read the raw corpus file
in_path = '1998-01-105-帶音.txt'
file = open(in_path, encoding='gbk')
in_data = file.readlines()

# Output file for the preprocessed corpus
curpus_path = 'curpus.txt'
curpusfile = open(curpus_path, 'w', encoding='utf-8')

# Drop the leading paragraph ID, then strip [], {} and the POS tags (shortest match)
for sentence in in_data:
    words = sentence.strip().split(' ')
    words.pop(0)  # the paragraph ID is always the first token
    for word in words:
        if word.strip() != '':
            if word.startswith('['):
                word = word[1:]
            elif ']' in word:
                word = word[0:word.index(']')]
            if '{' in word:
                word = word[0:word.index('{')]
            w_c = word.split('/')
            # Write only the word itself, dropping the POS tag
            curpusfile.write(w_c[0] + ' ')
    curpusfile.write('\n')
curpusfile.close()  # flush before the corpus is read back below
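To make the cleaning concrete, here is a minimal, self-contained sketch of the same rules applied to a single line. The sample line is invented for illustration and only approximates the corpus format (a leading paragraph ID, word/POS pairs, and bracketed named entities):

# Hypothetical corpus-style line, for illustration only
line = '19980101-01-001-001/m  迈向/v  [中央/n  人民/n  广播/vn  电台/n]nt  。/w'

tokens = line.strip().split()
tokens.pop(0)  # drop the leading paragraph ID
cleaned = []
for word in tokens:
    if word.startswith('['):
        word = word[1:]                     # opening bracket of a named entity
    elif ']' in word:
        word = word[:word.index(']')]       # closing bracket plus entity tag
    cleaned.append(word.split('/')[0])      # keep the word, drop the POS tag
print(' '.join(cleaned))  # 迈向 中央 人民 广播 电台 。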
2. Randomly split the corpus into an 80% training set and a 20% validation set.
from sklearn.model_selection import train_test_split

# Random split
curpus = open(curpus_path, encoding='utf-8').readlines()
train_data, test_data = train_test_split(
    curpus, test_size=0.2, random_state=10)

# Check the sizes after the split
print(len(curpus))
print(len(train_data) / len(curpus))
print(len(test_data) / len(curpus))
22787
0.7999736691973494
0.20002633080265064
3. Implementation of the forward maximum matching (FMM) algorithm.
# Build the dictionary from the training set
from tqdm import tqdm_notebook

dic = []
for sentence in tqdm_notebook(train_data):
    words = sentence.strip().split(' ')
    for word in words:
        if word.strip() != '':
            if word not in dic:
                dic.append(word)
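One performance note: testing membership with "word in dic" on a Python list is a linear scan, so both the dictionary build and the matching loop below slow down as the dictionary grows. A set has the same membership semantics with constant-time lookups; a minimal alternative sketch:

# Same dictionary as a set: identical membership tests, O(1) lookup
dic = set()
for sentence in train_data:
    for word in sentence.strip().split(' '):
        if word.strip() != '':
            dic.add(word)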
# Maximum word length considered when matching
max_dic_len = 5
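The window size of 5 is a hardcoded assumption; any dictionary entry longer than that can never be matched. If you would rather derive it from the dictionary itself (not what the notebook above does), one line suffices:

# Assumption: size the matching window to the longest dictionary entry
max_dic_len = max(len(word) for word in dic)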
# Build the unsegmented test text by joining the words of each validation sentence
test_text = []
for sentence in test_data:
    words = sentence.strip().split(' ')
    test_text.append(''.join(words))
# Save the validation set (the reference segmentation)
test_path = 'test.txt'
testfile = open(test_path, 'w', encoding='utf-8')
for sentence in test_data:
    testfile.write(sentence)
testfile.close()
# Output file for the segmentation results
result_path = 'result.txt'
resultfile = open(result_path, 'w', encoding='utf-8')
# Forward matching: try the longest prefix first, shrinking by one
# character until a dictionary word (or a single character) is taken
for sentence in tqdm_notebook(test_text):
    sent = sentence
    words = []
    while len(sent) > 0:
        word_len = max_dic_len
        for i in range(0, max_dic_len):
            word = sent[0:word_len]
            if word_len == 1 or word in dic:
                sent = sent[word_len:]
                words.append(word)
                break
            word_len -= 1
    resultfile.write(' '.join(words) + '\n')
resultfile.close()  # flush before the results are read back below
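To see the matching rule in isolation, here is a self-contained sketch of the same forward-matching idea as a function, run on a toy dictionary (the function name and dictionary are invented for illustration):

def fmm(sentence, dic, max_len=5):
    # Greedily take the longest dictionary prefix,
    # falling back to a single character
    words = []
    while sentence:
        word_len = max_len
        while word_len > 1 and sentence[:word_len] not in dic:
            word_len -= 1
        words.append(sentence[:word_len])
        sentence = sentence[word_len:]
    return words

toy_dic = {'研究', '研究生', '生命', '起源'}
print(fmm('研究生命起源', toy_dic))  # ['研究生', '命', '起源']

The classic weakness of greedy forward matching shows up here: 研究生 is taken greedily, even though 研究 / 生命 / 起源 is the intended segmentation.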
4. Precision, recall, and F-measure.
Precision = (number of words correctly segmented) / (number of words in the system output) * 100%
Recall = (number of words correctly segmented) / (number of words in the reference) * 100%
F-measure = 2 * P * R / (P + R)
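As a worked example: if the reference segmentation of a sentence is 他 / 来到 / 北京 (3 words) and the system outputs 他 / 来 / 到 / 北京 (4 words), then 2 words (他 and 北京) are correct, so P = 2/4 = 50%, R = 2/3 ≈ 66.7%, and F = 2 * 0.5 * (2/3) / (0.5 + 2/3) = 4/7 ≈ 0.571.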
# Read back the segmentation output and the reference
def get_word(path):
    f = open(path, 'r', encoding='utf-8')
    lines = f.readlines()
    return lines

result_lines = get_word(result_path)
test_lines = get_word(test_path)

list_num = len(test_lines) if len(test_lines) < len(result_lines) else len(result_lines)
right_num = 0    # words segmented correctly
result_cnt = 0   # words in the system output
test_cnt = 0     # words in the reference

for i in tqdm_notebook(range(list_num)):
    result_sent = list(result_lines[i].split())
    test_sent = list(test_lines[i].split())
    result_cnt += len(result_sent)
    test_cnt += len(test_sent)
    # Walk both segmentations in parallel: count a word as correct when
    # the words at the current positions match; on a mismatch, advance
    # each side until the consumed character counts line up again.
    str_result = ''
    str_test = ''
    i_result = 0
    i_test = 0
    while i_result < len(result_sent) and i_test < len(test_sent):
        word_result = result_sent[i_result]
        word_test = test_sent[i_test]
        str_result += word_result
        str_test += word_test
        if word_result == word_test:
            right_num += 1
            i_result += 1
            i_test += 1
        else:
            while len(str_result) > len(str_test):
                i_test += 1
                if i_test >= len(test_sent):
                    break
                str_test += test_sent[i_test]
            while len(str_result) < len(str_test):
                i_result += 1
                if i_result >= len(result_sent):
                    break
                str_result += result_sent[i_result]
            i_test += 1
            i_result += 1
print("生成結果詞的個數:", result_cnt) print("驗證集結果詞個數:", test_cnt) p = right_num / result_cnt r = right_num / test_cnt f = 2 * p * r / (p + r) print("查準率:", p) print("查全率:", r) print("F度量:", f)
Words in the system output: 227640
Words in the reference: 219680
Precision: 0.8301748374626603
Recall: 0.8602558266569555
F-measure: 0.8449476884556917