6.中文詞頻統計

時間 2020-04-07

標籤：中文詞頻統計　简体版

原文：原文鏈接

from collections import Counter

import jieba

# Characters removed from the text before segmentation (the set includes the
# plain ASCII space at the end of the literal).
punctuation = '''，。‘’「」：；（）！？、 '''

# Stop words: tokens that are excluded from the frequency table. Besides
# common function words this also lists the newline and the ideographic
# space (U+3000), which would otherwise show up as tokens of their own.
a = {'的', '\n', '\u3000', '曰', '之', '不', '人', '軍', '操', '一', '將',
     '大', '馬', '來', '德', '有', '於', '下', '兵', '此',
     '玄', '公', '見', '爲', '何', '中', '而', '可', '吾',
     '出', '也', '以', '與', '上', '後', '今', '其', '去',
     '日', '明', '言'}

# Number of most frequent words written to the result file.
TOP_N = 20


def strip_punctuation(text, chars=punctuation):
    """Return ``text`` with every character listed in ``chars`` removed."""
    # One translate() pass replaces the per-character chain of replace() calls.
    return text.translate(str.maketrans('', '', chars))


def count_words(tokens, stop_words=a):
    """Count how often each token occurs, skipping tokens in ``stop_words``.

    Counting is done on the segmented tokens themselves. Counting with
    ``str.count`` on the raw text would match substrings, so a short word
    would also be counted every time it appears inside a longer word.

    Returns a ``collections.Counter`` mapping token -> occurrences.
    """
    return Counter(token for token in tokens if token not in stop_words)


def format_top(counts, limit=TOP_N):
    """Return ``word:count\\n`` lines for the ``limit`` most frequent words.

    Fewer lines are returned when ``counts`` holds fewer than ``limit``
    entries, so a short input cannot cause an ``IndexError``.
    """
    return ['{}:{}\n'.format(word, freq)
            for word, freq in counts.most_common(limit)]


def main():
    """Segment ``sanguoyanyi.txt`` and append its top words to ``zzzCount.txt``."""
    with open('sanguoyanyi.txt', 'r', encoding='utf-8') as source:
        text = source.read()

    # Register these names with jieba's dictionary before segmenting.
    for name in ('曹操', '諸葛亮', '孔明'):
        jieba.add_word(name)

    # Segment once and reuse the result for both the diagnostics and the count.
    tempwords = list(jieba.cut(strip_punctuation(text)))
    print(tempwords)

    counts = count_words(tempwords)
    print(list(counts))
    print(counts.most_common())

    # Append mode is kept from the original script: repeated runs accumulate
    # results in the same file. The explicit encoding keeps the Chinese text
    # writable regardless of the platform's default locale.
    with open('zzzCount.txt', 'a', encoding='utf-8') as report:
        report.writelines(format_top(counts))


if __name__ == '__main__':
    main()

相關標籤/搜索