Assignment source: https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/2822
Chinese Word Frequency Statistics
1. Download a full-length Chinese novel.
2. Read the text to be analyzed from a file.
3. Install and use jieba for Chinese word segmentation.
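A minimal sketch for steps 1–2, assuming the downloaded novel has been saved locally as novel.txt (hypothetical filename):
# read the whole novel into a single string; use the encoding the file was saved with
with open('novel.txt', 'r', encoding='utf-8') as f:
    text = f.read()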
pip install jieba
import jieba
jieba.lcut(text)
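For instance, lcut takes a string and returns the segmented words as a Python list (a minimal illustration, with a placeholder string standing in for the novel text):
import jieba
text = '下載的長篇中文小說文本'   # placeholder for the novel text
words = jieba.lcut(text)          # precise-mode segmentation; returns a list of strings
print(words[:10])                 # first ten tokens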
4. Update the dictionary by adding domain-specific terms for the text being analyzed.
jieba.add_word('天罡北斗陣')   # add a single word
jieba.load_userdict(word_dict)   # load a user dictionary from a text file
Reference dictionary downloads: https://pinyin.sogou.com/dict/
Conversion script: scel_to_text
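For reference, load_userdict expects a UTF-8 text file with one term per line in the form "word [frequency] [part-of-speech]", where frequency and POS tag are optional; the Sogou .scel dictionaries have to be converted to this plain-text format first (e.g. with a scel_to_text script). A minimal sketch, assuming a hypothetical dictionary.txt:
# dictionary.txt contents (one entry per line), e.g.:
#   天罡北斗陣 5 n
import jieba
jieba.load_userdict('dictionary.txt')   # hypothetical path to the converted dictionary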
5. Generate the word frequency statistics.
6. Sort by frequency.
7. Exclude grammatical function words: pronouns, articles, conjunctions and other stopwords.
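Steps 5 and 6 can be written with a plain dict, as in the full program at the end, or more concisely with collections.Counter; a sketch assuming tokens is the list of segmented words:
from collections import Counter
counts = Counter(tokens)          # word -> number of occurrences
ranked = counts.most_common()     # (word, count) pairs sorted by count, descending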
stops
tokens=[token for token in wordsls if token not in stops]
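Here stops would normally be loaded from a stopword file with one word per line; a minimal sketch assuming a hypothetical stops.txt:
# building a set makes the membership test exact and fast
with open('stops.txt', 'r', encoding='utf-8') as f:
    stops = set(f.read().split())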
8. Output the top 20 most frequent words and save the results to a file.
9. Generate a word cloud. The complete program:
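A sketch for step 8, assuming ranked from the sorting step above and a hypothetical output file top20.txt:
with open('top20.txt', 'w', encoding='utf-8') as f:
    for word, count in ranked[:20]:
        f.write(f'{word}\t{count}\n')   # one "word<TAB>count" pair per line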
import chardet
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from bs4 import BeautifulSoup
import requests


def get_text(url):
    # fetch the novel page and detect its encoding before parsing
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    response = requests.get(url, headers=headers)
    response.encoding = chardet.detect(response.content)['encoding']
    soup = BeautifulSoup(response.text, 'lxml')
    # keep only the story body, drop full-width spaces and the header/footer lines
    a = soup.find(id='content').get_text().replace('\u3000', '').splitlines()[1:-3]
    a = ''.join(a)
    # strip common punctuation
    o = '!。,?'
    for n in o:
        a = a.replace(n, '')
    return a


def save_file(a):
    # save the cleaned text for later inspection
    with open(r'F:\pyprogram\out.text', 'w', encoding='utf-8') as f:
        f.write(a)


def jb(a):
    # extend the dictionary with domain-specific terms, then segment
    jieba.add_word('來了')
    jieba.load_userdict(r'F:\pyprogram\dictionary.txt')
    b = jieba.lcut(a)
    # load stopwords as a set so membership tests are exact, not substring matches
    with open(r'F:\pyprogram\stops.txt', 'r', encoding='utf8') as fa:
        stops = set(fa.read().split())
    tokens = [token for token in b if token not in stops]
    # count each remaining token and sort by frequency, descending
    bookdict = {}
    for i in tokens:
        bookdict[i] = b.count(i)
    dictionary = list(bookdict.items())
    dictionary.sort(key=lambda x: x[1], reverse=True)
    # print the 20 most frequent words
    for p in range(20):
        print(dictionary[p])
    return tokens


def wc(tokens):
    # WordCloud expects space-separated words; pass font_path pointing to a CJK font
    # if the Chinese characters render as boxes
    wl_split = ' '.join(tokens)
    mywc = WordCloud(background_color='#36f', width=400, height=300, margin=1).generate(wl_split)
    plt.imshow(mywc)
    plt.axis('off')
    plt.show()


if __name__ == '__main__':
    url = 'http://www.changpianxiaoshuo.com/jingpinwenzhang-commend/youlangchumo.html'
    a = get_text(url)
    tokens = jb(a)
    wc(tokens)