啓動命令行,輸入:pip install wordcloud
wordcloud 庫可以說是 Python 非常優秀的詞雲展示第三方庫。詞雲以詞語爲基本單位,更加直觀和藝術地展示文本。詞雲圖,也叫文字雲,是對文本中出現頻率較高的「關鍵詞」予以視覺化的展示;詞雲圖過濾掉大量低頻、低質的文本信息,使得瀏覽者只要一眼掃過文本就可領略文本的主旨。wordcloud 是基於 Python 的詞雲生成類庫,很好用,並且功能強大,在做統計分析的時候有着很好的應用,比較推薦。
官方地址:https://amueller.github.io/word_cloud/html
# Minimal example: build a word cloud from a plain-text file, show it, save it.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the whole source text. The context manager guarantees the file handle
# is closed; a bare open(...).read() leaves it open until garbage collection.
with open(r'C:\Users\JluTIger\Desktop\texten.txt', 'r') as source_file:
    text = source_file.read()

# width, height and margin set the geometry of the rendered image.
# background_color sets the canvas colour (the default is black).
# generate() tokenises the whole text automatically, but its support for
# Chinese is poor; pass font_path pointing at a Chinese-capable font file
# (usually .ttf or .otf) when the text contains Chinese.
wordcloud = WordCloud(
    background_color="white",
    width=1000,
    height=860,
    margin=2,
).generate(text)

plt.imshow(wordcloud)
plt.axis("off")  # hide the axes
plt.show()       # display the image

# Save the image. When a mask is used (see the mask example later in this
# article) the saved image takes the size of the mask instead.
wordcloud.to_file('test.png')
from wordcloud import WordCloud

fontpath = 'SourceHanSansCN-Regular.otf'

# All rendering options collected in one place, then unpacked into WordCloud.
cloud_options = dict(
    font_path=fontpath,        # font used to draw the words
    background_color="white",  # canvas colour
    max_words=1000,            # maximum number of words shown in the cloud
    max_font_size=500,         # largest font size
    min_font_size=20,          # smallest font size
    random_state=42,           # random seed
    collocations=False,        # avoid repeated words
    # Image width/height and word margin; these need to be combined with the
    # plt.figure(dpi=xx) scaling used when plotting to take visible effect.
    width=1600,
    height=1200,
    margin=10,
)
wc = WordCloud(**cloud_options)

# `cuted` is the space-joined, already segmented text (see the jieba snippets).
wc.generate(cuted)
import jieba

# `text` is the string / sentence you want to segment.
# jieba.cut yields the segmented words; join them with single spaces so the
# result can be handed to WordCloud.generate().
string = ' '.join(jieba.cut(text))
print(string)
Building prefix dict from the default dictionary ... Loading model from cache C:\Users\mengx7\AppData\Local\Temp\jieba.cache 這是 一個 簡單 的 例子 Loading model cost 0.978 seconds. Prefix dict has been built succesfully.
去除冗餘單詞
import jieba

# Words that carry no information for this corpus and should be kept out of
# the word cloud.
removes = ['熟悉', '技術', '職位', '相關', '工做', '開發', '使用', '能力', '優先', '描述', '任職']

# Option 1: delete the words from jieba's dictionary before segmenting.
# NOTE(review): del_word only changes jieba's dictionary; check on real output
# whether the characters still surface as smaller tokens.
for w in removes:
    jieba.del_word(w)
words = jieba.lcut(text)
cuted = ' '.join(words)
print(cuted[:100])

# Option 2 (alternative): segment first, then filter the unwanted tokens out.
# A set gives O(1) membership tests instead of scanning the list per token.
removes_set = set(removes)
words = [w for w in jieba.lcut(text) if w not in removes_set]
# Rebuild the joined text so the filtering actually reaches the word cloud.
cuted = ' '.join(words)
區分中英文
如果我們只關注英文技術點,比如 python、tensorflow 等,那就忽略中文內容。
使用正則表達式來匹配提取那些由 a-z 小寫字母、A-Z 大寫字母和 0-9 數字組成的單詞。
import re

import jieba

# Segment the text, then keep only the "English" tokens: those made up
# entirely of ASCII letters and digits (e.g. python, tensorflow, python3).
words = jieba.lcut(text)

# The digit range must be 0-9; the previous class `0-1` only accepted the
# digits 0 and 1, silently dropping tokens such as "python3" or "360".
# fullmatch() requires the whole token to match, so no ^/$ anchors are needed.
pattern = re.compile(r'[a-zA-Z0-9]+')
words = [w for w in words if pattern.fullmatch(w)]

cuted = ' '.join(words)
print(cuted[:100])
分好詞後就需要將詞做成詞雲了,使用的是 wordcloud。
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Sample text to visualise (adjacent literals are concatenated by Python).
string = (
    'Importance of relative word frequencies for font-size. '
    'With relative_scaling=0, only word-ranks are considered. '
    'With relative_scaling=1, a word that is twice as frequent will have twice the size. '
    'If you want to consider the word frequencies and not only their rank, '
    'relative_scaling around .5 often looks good.'
)
font = r'C:\Windows\Fonts\FZSTK.TTF'

# font_path is required for Chinese text; without it the words are rendered
# as empty boxes.
wc = WordCloud(
    font_path=font,
    background_color='white',
    width=1000,
    height=800,
)
wc.generate(string)

wc.to_file('ss.png')  # save the image

plt.imshow(wc)        # draw the cloud with matplotlib
plt.axis('off')       # hide the axes
plt.show()            # display the image
# cell-1: load the raw job-description text.
# The `with` block closes the file by itself, so no explicit f.close() is
# needed.
# NOTE(review): no encoding is passed, so the platform default is used;
# confirm it matches how the file was saved.
with open('./lagou-job1000-ai-details.txt', 'r') as f:
    text = f.read()
print(text[:100])

# cell-2: segment with jieba and keep only English/alphanumeric tokens.
import re

import jieba

words = jieba.lcut(text)
# The digit range must be 0-9; the previous class `0-1` only accepted the
# digits 0 and 1 and therefore dropped tokens such as "python3".
pattern = re.compile(r'[a-zA-Z0-9]+')
words = [w for w in words if pattern.fullmatch(w)]
cuted = ' '.join(words)
print(cuted[:500])

# cell-3: build the masked, image-coloured word cloud.
import numpy as np
from PIL import Image
from wordcloud import WordCloud
# ImageColorGenerator directly produces a color_func; it takes the image as
# an nd-array (N-dimensional array).
from wordcloud import ImageColorGenerator

fontpath = 'SourceHanSansCN-Regular.otf'

# Load the mask picture as an nd-array.
aimask = np.array(Image.open("ai-mask.png"))
genclr = ImageColorGenerator(aimask)

wc = WordCloud(
    font_path=fontpath,        # font used to draw the words
    background_color="white",  # canvas colour
    max_words=1000,            # maximum number of words shown in the cloud
    max_font_size=100,         # largest font size
    min_font_size=5,           # smallest font size
    random_state=42,           # random seed
    collocations=False,        # avoid repeated words
    mask=aimask,               # shape mask
    color_func=genclr,         # colour the words from the mask image
    # Image width/height and word margin; these need to be combined with the
    # plt.figure(dpi=xx) scaling below to take visible effect.
    width=1600,
    height=1200,
    margin=2,
)
wc.generate(cuted)

# cell-4: display the result.
import matplotlib.pyplot as plt

plt.figure(dpi=150)  # raise or lower dpi to enlarge or shrink the figure
plt.imshow(wc, interpolation='catrom', vmax=1000)
plt.axis("off")      # hide the axes
下段代碼來自 wordcloud 官方的 GitHub。
#!/usr/bin/env python
"""
Colored by Group Example
========================

Generating a word cloud that assigns colors to words based on
a predefined mapping from colors to words
"""

from wordcloud import (WordCloud, get_single_color_func)
import matplotlib.pyplot as plt


class SimpleGroupedColorFunc:
    """Color function that gives each listed word the EXACT color of its group.

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
      A dictionary that maps a color to the list of words.

    default_color : str
      Color that will be assigned to a word that's not a member
      of any value from color_to_words.
    """

    def __init__(self, color_to_words, default_color):
        # Invert the mapping into word -> color. A word listed under several
        # colors ends up with the color of the last group that contains it.
        lookup = {}
        for color, words in color_to_words.items():
            for word in words:
                lookup[word] = color
        self.word_to_color = lookup
        self.default_color = default_color

    def __call__(self, word, **kwargs):
        # Extra keyword arguments passed by the caller are accepted and ignored.
        if word in self.word_to_color:
            return self.word_to_color[word]
        return self.default_color


class GroupedColorFunc:
    """Color function that gives listed words DIFFERENT SHADES of their group color.

    Each group color is turned into a color function with
    wordcloud.get_single_color_func.

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
      A dictionary that maps a color to the list of words.

    default_color : str
      Color that will be assigned to a word that's not a member
      of any value from color_to_words.
    """

    def __init__(self, color_to_words, default_color):
        # One (color function, word set) pair per group, in mapping order.
        pairs = []
        for color, words in color_to_words.items():
            pairs.append((get_single_color_func(color), set(words)))
        self.color_func_to_words = pairs
        self.default_color_func = get_single_color_func(default_color)

    def get_color_func(self, word):
        """Returns a single_color_func associated with the word"""
        # The first group containing the word wins; otherwise use the default.
        matching = (
            func
            for func, group in self.color_func_to_words
            if word in group
        )
        return next(matching, self.default_color_func)

    def __call__(self, word, **kwargs):
        chosen = self.get_color_func(word)
        return chosen(word, **kwargs)


# `text` is the content to analyse.
text = """The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!"""

# Since the text is small collocations are turned off and text is lower-cased
wc = WordCloud(collocations=False).generate(text.lower())

# Custom colors for the listed words.
color_to_words = {
    # words below will be colored with a green single color function
    '#00ff00': ['beautiful', 'explicit', 'simple', 'sparse',
                'readability', 'rules', 'practicality',
                'explicitly', 'one', 'now', 'easy', 'obvious', 'better'],
    # will be colored with a red single color function
    'red': ['ugly', 'implicit', 'complex', 'complicated', 'nested',
            'dense', 'special', 'errors', 'silently', 'ambiguity',
            'guess', 'hard']
}

# Words that are not in any of the color_to_words values
# will be colored with a grey single color function
default_color = 'grey'

# Create a color function with single tone
# grouped_color_func = SimpleGroupedColorFunc(color_to_words, default_color)

# Create a color function with multiple tones
grouped_color_func = GroupedColorFunc(color_to_words, default_color)

# Apply our color function.
# color_func can also be derived from an image; see the next section.
wc.recolor(color_func=grouped_color_func)

# Plot
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
該段代碼主要來自於 wordcloud 的 GitHub,你同樣可以在 GitHub 下載該例子以及原圖片與效果圖。wordcloud 會把背景圖中白色區域去除,只在有色區域進行繪製。
#!/usr/bin/env python
"""
Image-colored wordcloud
=======================

You can color a word-cloud by using an image-based coloring strategy
implemented in ImageColorGenerator. It uses the average color of the region
occupied by the word in a source image. You can combine this with masking -
pure-white will be interpreted as 'don't occupy' by the WordCloud object when
passed as mask.
If you want white as a legal color, you can just pass a different image to
"mask", but make sure the image shapes line up.
"""

# Import the required libraries.
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Read the whole text. The context manager closes the file handle, which the
# bare open(...).read() form left open.
with open(r'C:\Users\JluTIger\Desktop\texten.txt') as text_file:
    text = text_file.read()

# read the mask / color image taken from
# http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
alice_coloring = np.array(Image.open(r"C:\Users\JluTIger\Desktop\alice.png"))

# Set the stop words.
stopwords = set(STOPWORDS)
stopwords.add("said")

# The mask argument sets the shape of the word cloud.
wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
               stopwords=stopwords, max_font_size=40, random_state=42)
# generate word cloud
wc.generate(text)

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# show
# With only the mask set, the cloud takes the shape of the picture.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()

# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
# This way the font colors follow the color layout of the given picture.
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.figure()

# Finally show the original picture for comparison.
plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()
原圖
less
效果:
dom
參考連結