# CalHamletV1.py
# Print the ten most frequent words of hamlet.txt, one "word count" row each.
from collections import Counter

# Characters that are blanked out before the text is split into words.
# NOTE(review): the apostrophe is absent and a curly quote sits where a
# backtick might be expected; both are kept exactly as they were - confirm.
_PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
_PUNCTUATION_TO_SPACE = str.maketrans(_PUNCTUATION, " " * len(_PUNCTUATION))


def getText(path="hamlet.txt", encoding="utf-8"):
    """Return the content of *path*, lower-cased, with punctuation blanked.

    Args:
        path: Text file to read. Defaults to ``"hamlet.txt"``.
        encoding: Encoding used to decode the file. Pass ``None`` to fall
            back to the platform default.

    Returns:
        The file content in lower case, with every character listed in
        ``_PUNCTUATION`` replaced by a single space.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if read() fails.
    with open(path, "r", encoding=encoding) as source:
        text = source.read()
    # One translate() pass replaces all punctuation characters at once.
    return text.lower().translate(_PUNCTUATION_TO_SPACE)


def top_words(text, limit=10):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Words are the whitespace-separated pieces of *text*. Words with equal
    counts keep the order in which they first appear. Fewer than *limit*
    pairs are returned when the text has fewer distinct words.
    """
    return Counter(text.split()).most_common(limit)


def main():
    """Print the ten most frequent words of ``hamlet.txt``."""
    for word, count in top_words(getText()):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
the 948 and 855 to 650 of 581 you 494 a 468 my 447 i 443 in 373 hamlet 361
# CalThreeKingdomsV1.py
# Print the fifteen most frequent multi-character words of threekingdoms.txt.
from collections import Counter


def read_text(path="threekingdoms.txt", encoding="utf-8"):
    """Return the whole content of *path* decoded with *encoding*.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if read() fails.
    with open(path, "r", encoding=encoding) as source:
        return source.read()


def top_words(words, limit=15):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Args:
        words: Iterable of already segmented tokens.
        limit: Maximum number of pairs to return.

    Returns:
        A list of ``(word, count)`` tuples. Tokens of length one are not
        counted. Words with equal counts keep the order in which they
        first appear. Fewer than *limit* pairs are returned when there are
        fewer distinct words.
    """
    counts = Counter(word for word in words if len(word) != 1)
    return counts.most_common(limit)


def main():
    """Segment ``threekingdoms.txt`` with jieba and print the top words."""
    # Imported here so the helpers above stay importable where jieba is
    # not installed.
    import jieba

    for word, count in top_words(jieba.lcut(read_text())):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
Building prefix dict from the default dictionary ... Loading model from cache /var/folders/mh/krrg51957cqgl0rhgnwyylvc0000gn/T/jieba.cache Loading model cost 1.030 seconds. Prefix dict has been built succesfully. 曹操 953 孔明 836 將軍 772 卻說 656 玄德 585 關公 510 丞相 491 二人 469 不可 440 荊州 425 玄德曰 390 孔明曰 390 不能 384 如此 378 張飛 358
#CalThreeKingdomsV2.py
# Print the ten most frequent names of threekingdoms.txt, after merging
# aliases of the same character and dropping common non-name words.
from collections import Counter

# Frequent words that are removed from the result.
excludes = {"將軍", "卻說", "荊州", "二人", "不可", "不能", "如此"}

# Token -> the name it is counted under.
aliases = {
    "諸葛亮": "孔明",
    "孔明曰": "孔明",
    "關公": "關羽",
    "雲長": "關羽",
    "玄德": "劉備",
    "玄德曰": "劉備",
    "孟德": "曹操",
    "丞相": "曹操",
}


def read_text(path="threekingdoms.txt", encoding="utf-8"):
    """Return the whole content of *path* decoded with *encoding*.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if read() fails.
    with open(path, "r", encoding=encoding) as source:
        return source.read()


def top_words(words, limit=10):
    """Return up to *limit* ``(name, count)`` pairs, most frequent first.

    Args:
        words: Iterable of already segmented tokens.
        limit: Maximum number of pairs to return.

    Returns:
        A list of ``(name, count)`` tuples. Tokens of length one are not
        counted, tokens listed in ``aliases`` are counted under their
        mapped name, and the words in ``excludes`` are removed afterwards.
        Names with equal counts keep the order in which they first
        appear. Fewer than *limit* pairs are returned when there are fewer
        distinct names.
    """
    counts = Counter(
        aliases.get(word, word) for word in words if len(word) != 1
    )
    for word in excludes:
        # pop() with a default tolerates excluded words that never occur;
        # a plain ``del`` would raise KeyError for them.
        counts.pop(word, None)
    return counts.most_common(limit)


def main():
    """Segment ``threekingdoms.txt`` with jieba and print the top names."""
    # Imported here so the helpers above stay importable where jieba is
    # not installed.
    import jieba

    for word, count in top_words(jieba.lcut(read_text())):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
曹操 1451 孔明 1383 劉備 1252 關羽 784 張飛 358 商議 344 如何 338 主公 331 軍士 317 呂布 300