"""Print the most frequent words of a Chinese text segmented with jieba.

The script reads ``article.txt``, blanks out punctuation, segments the text
with jieba, drops a fixed set of excluded tokens and prints the 20 most
frequent remaining words as ``(word, count)`` tuples, most frequent first.
"""
from collections import Counter

# Characters that are replaced by a space before segmentation.
# NOTE(review): the first character is the CJK character '一', not a dash;
# it may have been meant as '—'. Kept exactly as in the original script.
PUNCTUATION = '''一!「」,。?;’"',.、:\n'''

# Tokens that are left out of the frequency ranking.
STOP_WORDS = frozenset({
    '說', '有', '得', '沒', '的', '他', '了', '她', '是', '在', '—', '你', '走',
    '對', '他們', '着', '把', '不', '也', '我', '人', '而', '與', '就', '但是',
    '那', '要', '又', '想', '和', '一個', ' ', '呢', '很', '一點', '都', '去',
    '沒有', '個', '上', '給', '來', '還', '到', '這', '\u3000', '點', '小', '看',
})

# One translation table so the whole text is cleaned in a single pass
# instead of one ``str.replace`` scan per punctuation character.
_PUNCTUATION_TABLE = str.maketrans({char: ' ' for char in PUNCTUATION})


def strip_punctuation(text):
    """Return *text* with every character of ``PUNCTUATION`` turned into a space."""
    return text.translate(_PUNCTUATION_TABLE)


def top_words(words, exclude=STOP_WORDS, limit=20):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Args:
        words: iterable of tokens (a list or a generator).
        exclude: container of tokens that must not be counted.
        limit: maximum number of pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. The
        list is simply shorter than *limit* when there are fewer distinct
        words, where indexing a fixed ``range(20)`` would raise IndexError.
    """
    # Counter tallies every token in one pass; calling list.count() once per
    # distinct word rescans the whole token list each time.
    counts = Counter(word for word in words if word not in exclude)
    return counts.most_common(limit)


def main(path='article.txt', limit=20, encoding=None):
    """Segment the file at *path* and print its most frequent words.

    Args:
        path: text file to analyse.
        limit: number of ranking entries to print.
        encoding: passed to ``open``; ``None`` keeps the platform default,
            which is what the original script relied on.
    """
    # Imported here rather than at module level so the pure helpers above
    # can be imported and tested on a machine without jieba installed.
    import jieba

    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding=encoding) as source:
        text = strip_punctuation(source.read())

    # Register the name as a single dictionary word before segmenting.
    jieba.add_word('錢先生')

    for entry in top_words(jieba.cut(text), limit=limit):
        print(entry)


if __name__ == '__main__':
    main()
運行結果:
('日本', 665)
('本身', 647)
('什麼', 608)
('老人', 536)
('瑞宣', 422)
('好', 415)
('知道', 394)
('北平', 350)
('錢', 338)
('起來', 295)
('錢先生', 291)
('裏', 291)
('先生', 290)
('並', 286)
('象', 284)
('能', 282)
('似的', 280)
('那麼', 279)
('不能', 279)
('會', 267)