Chinese Word Frequency Statistics and Word Cloud Generation

Assignment source: https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/2822

Chinese word frequency statistics:

1. Download a full-length Chinese novel.

2. Read the text to be analysed from a file.
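
The full script below pulls the text from a web page instead, but if the novel has already been saved locally, reading it is a one-liner (the path is only an example):

with open(r'F:\pyprogram\novel.txt', 'r', encoding='utf-8') as f:
    text = f.read()   # the whole novel as one string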

3. Install and use jieba for Chinese word segmentation.

pip install jieba

import jieba

jieba.lcut(text)
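
As a quick sanity check, jieba.lcut takes a string and returns an ordinary Python list of tokens (the sample sentence is arbitrary, and the exact split depends on the dictionary in use):

import jieba

words = jieba.lcut('小說裏的人物來來去去')   # segment one sentence
print(words)                                 # a list of word strings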

4. Update the dictionary by adding domain-specific terms for the work being analysed.

jieba.add_word('天罡北斗陣')  # add custom words one at a time

jieba.load_userdict(word_dict)  # load a user dictionary from a plain-text file

Reference dictionaries can be downloaded from: https://pinyin.sogou.com/dict/

Conversion code: scel_to_text (converts Sogou .scel dictionaries to plain text)
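
jieba's user dictionary is a plain UTF-8 text file with one entry per line: the word, optionally followed by a frequency and a part-of-speech tag, separated by spaces. A minimal sketch (file name and sample entries are just examples):

# contents of F:\pyprogram\dictionary.txt
# 天罡北斗陣 10 n
# 來了 5

import jieba
jieba.load_userdict(r'F:\pyprogram\dictionary.txt')   # merge the custom entries into jieba's dictionary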

5. Generate the word frequency statistics.

6. Sort by frequency.
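
Steps 5 and 6 can be done with a plain dict, as in the full script below, or more concisely with collections.Counter; a sketch, assuming wordsls is the list returned by jieba.lcut:

from collections import Counter

counts = Counter(wordsls)        # word -> frequency
top20 = counts.most_common(20)   # (word, count) pairs, most frequent first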

7. Exclude grammatical words such as pronouns, articles, and conjunctions (stop words).

Read the stop-word list into stops, then drop any segmented word that appears in it:

tokens = [token for token in wordsls if token not in stops]

8. Output the top 20 words by frequency and save the result to a file.
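
The full script only prints the top 20; writing them to a file instead is a small change (the output path is just an example, and dictionary is the sorted (word, count) list built in the script):

with open(r'F:\pyprogram\top20.txt', 'w', encoding='utf-8') as f:
    for word, count in dictionary[:20]:
        f.write(f'{word} {count}\n')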

9. Generate the word cloud.

import chardet
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from bs4 import BeautifulSoup
import requests

def get_text(url):
    # Download the novel page, detect its encoding, and strip whitespace and punctuation.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    response = requests.get(url, headers=headers)
    response.encoding = chardet.detect(response.content)['encoding']
    soup = BeautifulSoup(response.text, 'lxml')
    lines = soup.find(id='content').get_text().replace('\u3000', '').splitlines()[1:-3]
    text = ''.join(lines)
    for punct in '!。,?':
        text = text.replace(punct, '')
    return text

def save_file(text):
    # Save the cleaned text to disk (raw strings keep the backslashes in Windows paths literal).
    with open(r'F:\pyprogram\out.text', 'w', encoding='utf-8') as file:
        file.write(text)

def jb(text):
    # Segment the text, drop stop words, and print the 20 most frequent words.
    jieba.add_word('來了')
    jieba.load_userdict(r'F:\pyprogram\dictionary.txt')
    words = jieba.lcut(text)
    # The stop-word file is read as one string; any token found in it is discarded.
    with open(r'F:\pyprogram\stops.txt', 'r', encoding='utf8') as fa:
        stops = fa.read()
    tokens = [token for token in words if token not in stops]
    book_dict = {}
    for token in tokens:
        book_dict[token] = tokens.count(token)
    dictionary = list(book_dict.items())
    dictionary.sort(key=lambda x: x[1], reverse=True)
    for p in range(20):
        print(dictionary[p])
    return tokens

def wc(tokens):
    # Join the tokens with spaces and render the word cloud.
    wl_split = ' '.join(tokens)
    mywc = WordCloud(background_color='#36f', width=400, height=300, margin=1).generate(wl_split)
    plt.imshow(mywc)
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    url = 'http://www.changpianxiaoshuo.com/jingpinwenzhang-commend/youlangchumo.html'
    text = get_text(url)
    tokens = jb(text)
    wc(tokens)
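
Note that WordCloud's bundled default font has no Chinese glyphs, so the rendered cloud may show empty boxes instead of words; pointing font_path at a local Chinese font avoids this (the font path below is only an example):

mywc = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf', background_color='#36f', width=400, height=300, margin=1).generate(wl_split)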
