WordCloud詞雲製作以及美化

相關截圖

 

 

 

 

 

練習代碼

import wordcloud
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from nltk.corpus import brown
# Font file handed to every WordCloud(...) call in this script.
# Raw string: the Windows path contains backslashes (\W, \F, \m) that are
# invalid escape sequences in an ordinary string literal.
font_path = r'C:\Windows\Fonts\msyh.ttc'
# Short mixed English/Chinese sample text for quick experiments.
text = 'this is shanghai, 李帥, 郭靖, 成龍, 哀牢山 三十六劍'

# Disabled demo: the code below is wrapped in a bare string literal, so it is
# parsed but never executed. It builds a WordCloud from `text` on an RGBA
# canvas with no background colour, prints the object, displays it with
# matplotlib (axes hidden) and writes it to a PNG file.
'''
wc = wordcloud.WordCloud(
    font_path=font_path,
    max_font_size=300,
    width=360,
    height=180,
    mode='RGBA',
    background_color=None, #透明的詞雲
)

cloudobj = wc.generate(text)

# cloudobj.show()
print(cloudobj)

#     展現詞雲圖
plt.imshow(cloudobj)
# 關閉座標軸,不然很醜
plt.axis('off')
plt.show()
# 保存高清圖片
cloudobj.to_file('詞雲.png')
'''
# Load the GBK-encoded novel into a DataFrame with a single column named 'txt'.
raw = pd.read_table(
    './金庸-射鵰英雄傳txt精校版.txt',
    encoding='GBK',
    names=['txt'],
)
# print(raw)  # uncomment to inspect the loaded frame

# Chapter tagging: per-line helper values consumed by the chapter detection.
def m_head(tmpstr):
    """Return the first character of *tmpstr* (empty result for empty input)."""
    leading = tmpstr[0:1]
    return leading

def m_mid(tmpstr):
    """Return the index of the chapter marker "回 " in *tmpstr*, or -1 if absent.

    The previous search string was empty, and ``str.find("")`` always returns
    0, so the ``mid > 0`` test in the chapter detection could never succeed.
    """
    # NOTE(review): marker inferred from "第X回 <title>" style headings —
    # confirm against the actual text file.
    return tmpstr.find("回 ")


# Attach the helper values to the frame, one column per attribute.
raw['head'] = raw.txt.apply(m_head)
raw['mid'] = raw.txt.apply(m_mid)
raw['len'] = raw.txt.apply(len)


# Chapter detection: a line opens a new chapter when it starts with "第",
# contains "回 " after the first character, and is shorter than 30 characters.
# Once at least 40 chapters have been seen, the appendix heading resets the
# counter to 0 so that trailing material is not attributed to a chapter.
# NOTE(review): the appendix literal must match the file's line exactly.
chapnum = 0
chap_ids = []
for head, mid, length, line in zip(raw['head'], raw['mid'], raw['len'], raw['txt']):
    if head == "第" and mid > 0 and length < 30:
        chapnum += 1
    if chapnum >= 40 and line == "附錄一:成吉思汗家族":
        chapnum = 0
    chap_ids.append(chapnum)
# Assign the whole column at once instead of one .loc write per row.
raw['chap'] = chap_ids


# Remove the temporary helper columns; only 'txt' and 'chap' are needed now.
raw.drop(columns=['head', 'mid', 'len'], inplace=True)

# Aggregate paragraphs per chapter.
rawgrp = raw.groupby('chap')

# Summing the string column concatenates each chapter's lines. The string
# alias 'sum' is used because passing the builtin `sum` is deprecated in
# pandas and is slated to stop mapping to the group-wise sum.
chapter = rawgrp.agg('sum')

# Group 0 holds the lines seen while the chapter counter was 0 (before the
# first chapter heading and after the appendix reset); discard it.
chapter = chapter[chapter.index != 0]

# Explicit label-based lookup of chapter 1.
t = chapter.loc[1, 'txt']
print("*"*100)
print(t)
print("*"*100)
'''
生成射鵰英雄傳第一章的詞雲
'''


# Read the stop-word file into a one-column frame, then flatten it to a list.
# sep='aaa' is presumably a separator that never occurs in the file, so each
# line stays whole in column 'w'; a multi-character separator requires the
# python parsing engine.
stop_frame = pd.read_csv(
    './停用詞.txt',
    names=['w'],
    sep='aaa',
    encoding='utf-8',
    engine='python',
)
stoplist = list(stop_frame.w)


# print(stoplist)
# print(' '.join(stoplist))

def m_cut(intxt):
    """Segment *intxt* with jieba and return the tokens not in ``stoplist``.

    Args:
        intxt: text to segment.

    Returns:
        List of tokens, in segmentation order, with stop words removed.
    """
    # Build a set once per call: each membership test is then O(1) instead of
    # a linear scan of the stop-word list for every token.
    stopset = set(stoplist)
    return [w for w in jieba.cut(intxt) if w not in stopset]


# Word cloud for chapter 1: configure the renderer first, then feed it the
# jieba-segmented text joined with spaces. generate() returns the WordCloud
# instance itself, so `cloudobj` is the configured object.
chapter_cloud = wordcloud.WordCloud(
    font_path=font_path,
    width=1200,
    height=800,
    mode='RGBA',
    background_color=None,
    stopwords=stoplist,
)
chapter_words = ' '.join(jieba.lcut(chapter.txt[1]))
cloudobj = chapter_cloud.generate(chapter_words)

# Display the cloud with the axes hidden.
plt.imshow(cloudobj)
plt.axis('off')
plt.show()



'''
基於分詞頻數繪製詞雲
'''

# Hand-written word -> frequency mapping used to demonstrate fit_words().
txt_freq = {
    '張三': 100,
    '李四': 90,
    '王二麻子': 50,
}

# Build the cloud directly from the frequencies instead of from raw text.
freq_cloud = wordcloud.WordCloud(font_path=font_path)
cloudobj = freq_cloud.fit_words(txt_freq)

# Display the cloud with the axes hidden.
plt.imshow(cloudobj)
plt.axis("off")
plt.show()


'''
基於分詞頻數繪製射鵰英雄傳的詞雲
'''
import nltk
from nltk import FreqDist

# Segment chapter 1 and drop stop words.
tokens = m_cut(chapter.txt[1])
# Build the complete token -> count mapping.
fdist = FreqDist(tokens)

print(type(fdist))
# <class 'nltk.probability.FreqDist'>

# mode='RGBA' is required for background_color=None to produce a transparent
# background; in the default RGB mode a None colour leaves the canvas
# uninitialised. This also matches the other clouds in this script.
cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    mode='RGBA',
    background_color=None,
    width=1600,
    height=1000,
).fit_words(fdist)

# Display the cloud with the axes hidden.
plt.imshow(cloudobj)
plt.axis("off")
plt.show()



'''
詞雲的美化:
1,設置背景圖片
    Mask/遮罩
    用於控制詞頻的總體形狀
    指定mask後,設置的高和寬將被忽略,遮罩形狀被指定圖形的形狀取代。除全白的部分仍然被保留外,
    其他部分會用於繪製詞雲。所以背景圖片的畫布必定要設置爲白色
    字體的大小,佈局和顏色也會基於mask生成
    必要時須要調整顏色以加強可視效果

    # 基本調用方式
    from imageio import imread  # scipy.misc.imread 已在新版 SciPy 中移除
    mask = imread('背景圖片')
'''


from imageio import imread

def m_cut2(intxt):
    """Segment *intxt* with jieba, dropping stop words and one-character tokens.

    Args:
        intxt: text to segment.

    Returns:
        List of tokens, in segmentation order, that are not in ``stoplist``
        and are longer than one character.
    """
    # Build a set once per call: each membership test is then O(1) instead of
    # a linear scan of the stop-word list for every token.
    stopset = set(stoplist)
    return [w for w in jieba.cut(intxt) if w not in stopset and len(w) > 1]

# Masked word cloud: the image loaded from disk is passed as `mask`, and the
# text is chapter 1 filtered through m_cut2. generate() returns the WordCloud
# instance itself.
shaped_cloud = wordcloud.WordCloud(
    font_path=font_path,
    mask=imread('射鵰背景1.png'),
    mode='RGBA',
    background_color=None,
)
filtered_words = ' '.join(m_cut2(chapter.txt[1]))
cloudobj = shaped_cloud.generate(filtered_words)


# Display the cloud with the axes hidden.
plt.imshow(cloudobj)
plt.axis("off")
plt.show()
相關文章
相關標籤/搜索