"""Build word clouds from chapter 1 of 射鵰英雄傳 (Legend of the Condor Heroes).

Pipeline: (1) read the novel, one paragraph per row; (2) tag each paragraph
with its chapter number; (3) aggregate paragraphs per chapter; (4) render
several word clouds — from segmented text, from a hand-made frequency dict,
from an NLTK FreqDist, and finally one shaped by an image mask.
"""
import wordcloud
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from nltk.corpus import brown  # unused here; kept from the original file

# FIX: raw string — '\W', '\F', '\m' were invalid escape sequences in the
# original single-quoted path and only rendered correctly by accident.
font_path = r'C:\Windows\Fonts\msyh.ttc'

text = 'this is shanghai, 李帥, 郭靖, 成龍, 哀牢山 三十六劍'

# --- basic WordCloud demo (disabled) -------------------------------------
'''
wc = wordcloud.WordCloud(
    font_path=font_path,
    max_font_size=300,
    width=360,
    height=180,
    mode='RGBA',
    background_color=None,  # transparent word cloud
)
cloudobj = wc.generate(text)
print(cloudobj)
plt.imshow(cloudobj)   # display the cloud
plt.axis('off')        # hide the axes, they look ugly here
plt.show()
cloudobj.to_file('詞雲.png')  # save a high-resolution image
'''

# Load the novel: one paragraph per row, single column 'txt'.
raw = pd.read_table('./金庸-射鵰英雄傳txt精校版.txt', names=['txt'], encoding='GBK')

# --- chapter tagging ------------------------------------------------------
# Helper predicates used to recognise chapter-heading lines.
def m_head(tmpstr):
    """Return the first character of a paragraph."""
    return tmpstr[:1]


def m_mid(tmpstr):
    """Return the index of the chapter marker "回 " in the paragraph (-1 if absent)."""
    return tmpstr.find("回 ")


raw['head'] = raw.txt.apply(m_head)
raw['mid'] = raw.txt.apply(m_mid)
raw['len'] = raw.txt.apply(len)

# A chapter heading looks like "第…回 …" and is short (< 30 characters).
chapnum = 0
for i in range(len(raw)):
    if raw['head'][i] == "第" and raw['mid'][i] > 0 and raw['len'][i] < 30:
        chapnum += 1
    # From the appendix onwards, tag everything as chapter 0 (dropped below).
    if chapnum >= 40 and raw['txt'][i] == "附錄一:成吉思汗家族":
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum

# Drop the temporary helper columns.
del raw['head']
del raw['mid']
del raw['len']

# Aggregate paragraphs into one string per chapter.
# FIX: ''.join instead of the builtin sum — summing strings via groupby.agg
# is deprecated in modern pandas and quadratic; join produces the identical
# concatenation.
rawgrp = raw.groupby('chap')
chapter = rawgrp.agg({'txt': ''.join})
chapter = chapter[chapter.index != 0]  # discard front matter / appendix

t = chapter.txt[1]
print("*" * 100)
print(t)
print("*" * 100)

'''
Word cloud for chapter 1 of the novel.
'''
# Read the stop-word list into a data frame. sep='aaa' never matches,
# so every full line becomes one entry.
stoplist = list(pd.read_csv('./停用詞.txt', names=['w'], sep='aaa',
                            encoding='utf-8', engine='python').w)


def m_cut(intxt):
    """Segment text with jieba and drop stop words."""
    return [w for w in jieba.cut(intxt) if w not in stoplist]


cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    width=1200,
    height=800,
    mode='RGBA',
    background_color=None,
    # FIX: WordCloud expects a *set* of stop words; the original passed a
    # list, which works but costs O(n) per membership test.
    stopwords=set(stoplist),
).generate(' '.join(jieba.lcut(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis('off')
plt.show()

'''
Word cloud from an explicit token-frequency dict.
'''
txt_freq = {'張三': 100, '李四': 90, '王二麻子': 50}
cloudobj = wordcloud.WordCloud(
    font_path=font_path,
).fit_words(txt_freq)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()

'''
Word cloud of the novel built from NLTK frequency counts.
'''
import nltk
from nltk import FreqDist

tokens = m_cut(chapter.txt[1])  # segmented, stop words removed
fdist = FreqDist(tokens)        # complete token -> frequency mapping
print(type(fdist))  # <class 'nltk.probability.FreqDist'>

cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    background_color=None,
    width=1600,
    height=1000,
).fit_words(fdist)  # FreqDist is a dict subclass, accepted by fit_words
plt.imshow(cloudobj)
plt.axis("off")
plt.show()

'''
Styling the word cloud:
1. A background image (mask) controls the overall shape of the cloud.
   When a mask is given, the width/height settings are ignored and the
   cloud takes the shape of the image: pure-white areas are left blank,
   every other area is used for drawing — so the canvas of the background
   image must be white.
   Font sizes, layout and colours are also derived from the mask;
   adjust the colours if needed to improve readability.

# classic invocation (NOTE: scipy.misc.imread was removed from scipy;
# imageio.imread, used below, is the modern replacement):
# from scipy.misc import imread
# mask = imread('background image')
'''
from imageio import imread


def m_cut2(intxt):
    """Like m_cut, but additionally drop single-character tokens."""
    return [w for w in jieba.cut(intxt) if w not in stoplist and len(w) > 1]


cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    mask=imread('射鵰背景1.png'),
    mode='RGBA',
    background_color=None,
).generate(' '.join(m_cut2(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis("off")
plt.show()