作業來源:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/2822
1. 下載一長篇中文小說。
2. 從文件讀取待分析文本。
# Read the novel, register the domain word list with jieba, and load the
# Chinese stop-word table. Context managers guarantee the files are closed
# (the original left both file handles open).

# Full text of the novel "The Three-Body Problem".
with open(r'G:\aa\三體.txt', 'r', encoding='utf8') as novel_file:
    txt = novel_file.read()

# Three-Body-specific vocabulary so jieba segments domain terms correctly.
jieba.load_userdict(r'G:\aa\three.txt')

# Chinese stop-word list, one word per line.
with open(r'G:\aa\stops_chinese.txt', 'r', encoding='utf8') as stops_file:
    stops = stops_file.read().split('\n')
3. 安裝並使用jieba進行中文分詞。
4. 更新詞庫,加入所分析對象的專業詞彙。
# -*- coding: utf-8 -*-
"""Convert Sogou .scel cell dictionaries to plain-text word lists.

A .scel file is little-endian uint16 based: a header with metadata
strings, a pinyin syllable table at ``startPy``, then groups of
(same-pinyin word count, pinyin index table, [word, ext block] * count)
records starting at ``startChinese``.
"""
import struct
import os

# Offset of the pinyin table inside a .scel file.
startPy = 0x1540

# Offset of the Chinese word table.
startChinese = 0x2628

# Global pinyin table: uint16 index -> pinyin syllable string.
GPy_Table = {}


def byte2str(data):
    """Decode a little-endian UTF-16 byte block, skipping NUL characters.

    (Renamed the accumulator: the original shadowed the builtin ``str``.)
    """
    chars = []
    for pos in range(0, len(data), 2):
        c = chr(struct.unpack_from('<H', data, pos)[0])
        if c != '\x00':
            chars.append(c)
    return ''.join(chars)


def getPyTable(data):
    """Populate GPy_Table from the pinyin table section of the file."""
    data = data[4:]  # skip the 4-byte section header
    pos = 0
    while pos < len(data):
        index = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        lenPy = struct.unpack_from('<H', data, pos)[0]  # syllable byte length
        pos += 2
        GPy_Table[index] = byte2str(data[pos:pos + lenPy])
        pos += lenPy


def getWordPy(data):
    """Resolve a word's pinyin by joining GPy_Table entries for each index."""
    syllables = []
    for pos in range(0, len(data), 2):
        index = struct.unpack_from('<H', data, pos)[0]
        syllables.append(GPy_Table[index])
    return ''.join(syllables)


def getChinese(data):
    """Parse the word table; return a list of (count, pinyin, word) tuples."""
    GTable = []
    pos = 0
    while pos < len(data):
        # Number of words sharing the following pinyin.
        same = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        # Byte length of the pinyin index table.
        py_table_len = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        py = getWordPy(data[pos: pos + py_table_len])
        pos += py_table_len
        for _ in range(same):
            # Word byte length, then the word itself.
            c_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            word = byte2str(data[pos: pos + c_len])
            pos += c_len
            # Extension block length; its first uint16 is the frequency.
            ext_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            count = struct.unpack_from('<H', data, pos)[0]
            GTable.append((count, py, word))
            # ext_len covers the count field plus the remaining ext data.
            pos += ext_len
    return GTable


def scel2txt(file_name):
    """Parse one .scel file, print its metadata, and return its word list.

    (The original called getChinese twice and discarded the first result.)
    """
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    print("詞庫名:", byte2str(data[0x130:0x338]))
    print("詞庫類型:", byte2str(data[0x338:0x540]))
    print("描述信息:", byte2str(data[0x540:0xd40]))
    print("詞庫示例:", byte2str(data[0xd40:startPy]))

    getPyTable(data[startPy:startChinese])
    return getChinese(data[startChinese:])


if __name__ == '__main__':
    # Folder holding the .scel files to convert (edit to your own path).
    in_path = r"C:\Users\Administrator\Downloads"
    # Folder that receives the converted .txt word lists.
    out_path = r"C:\Users\Administrator\Downloads\text"
    os.makedirs(out_path, exist_ok=True)
    scel_files = [fname for fname in os.listdir(in_path)
                  if fname.endswith(".scel")]
    for fname in scel_files:
        try:
            words = scel2txt(os.path.join(in_path, fname))
            file_path = os.path.join(out_path, fname.split('.')[0] + '.txt')
            # Open the output once per dictionary instead of once per word.
            with open(file_path, 'a+', encoding='utf-8') as out_file:
                for entry in words:
                    out_file.write(entry[2] + '\n')
            # Delete the source only after a successful conversion.
            os.remove(os.path.join(in_path, fname))
        except Exception as e:
            print(e)
5. 生成詞頻統計。
# Count how often each word occurs. Single-character tokens are skipped
# (they are mostly particles and carry little meaning). The redundant
# `continue`/`else` of the original is flattened into one guard; behavior
# is identical.
for word in tokens:
    if len(word) != 1:
        wcdict[word] = wcdict.get(word, 0) + 1
6. 排序。
# Rank the (word, count) pairs from most to least frequent.
wcls = sorted(wcdict.items(), key=lambda item: item[1], reverse=True)
7. 排除語法型詞彙,代詞、冠詞、連詞等停用詞。
# Load the Chinese stop-word list (one word per line) and drop stop words
# from the token stream. The original never closed the file; a `with`
# block fixes that. Membership tests use a set for O(1) lookups — the
# filtered result is unchanged.
with open(r'G:\aa\stops_chinese.txt', 'r', encoding='utf8') as Filess:
    stops = Filess.read().split('\n')

stop_set = set(stops)
tokens = [token for token in wordsls if token not in stop_set]
print("過濾後中文內容對比:", len(tokens), len(wordsls))
8. 輸出詞頻最大TOP20,把結果存放到文件裏。
# Print the 25 most frequent words. The original indexed blindly and
# would raise IndexError on a short list; min() guards against that.
for rank in range(min(25, len(wcls))):
    print(wcls[rank])

# Persist the ranked word counts to csv for later reuse.
pd.DataFrame(wcls).to_csv('three.csv', encoding='utf-8')

# Read the csv back as the word-cloud source text (file closed via `with`;
# the original leaked the handle).
with open('three.csv', 'r', encoding='utf-8') as csv_file:
    txt = csv_file.read()
9. 生成詞雲。
# Rebuild the word-cloud source text from the saved csv.
with open('three.csv', 'r', encoding='utf-8') as csv_file:
    txt = csv_file.read()

# Segment the text and join the tokens WITH SPACES: WordCloud tokenizes
# its input on whitespace, so ""-joined Chinese text would collapse into
# one unusable blob. (The original comment said "space-separated" but the
# code used "".join — this fixes that bug.)
cut_text = ' '.join(jieba.lcut(txt))
mywc = WordCloud().generate(cut_text)

plt.imshow(mywc)
plt.axis("off")
plt.show()
默認形狀:
修改背景: