"""Count how often each word occurs in ``dream.txt``.

Steps:
1. Read the text from dream.txt.
2. Replace the special symbols on every line with spaces.
3. Split each line on whitespace and collect the pieces in a word list.
4. Count the occurrences of every word with a dict.

Topics practised: list, set, the re module, dict.
"""
import re
import math
import copy

# Characters that separate words in addition to ordinary whitespace.
#
# \xa0 is the non-breaking space (nbsp).  The space we normally type is \x20,
# which lies inside the standard ASCII printable range 0x20~0x7e, whereas
# \xa0 is an extended character of latin1 (ISO/IEC 8859-1).
# \u3000 is the full-width (ideographic) space.
SEPARATOR_CHARS = "\xa0\u3000\n-.!@#$%\\^&*)(+={}[]/\",'<>~·`?:;|"

# re.escape keeps '-', ']', '^' and '\\' literal inside the character class,
# so the hyphen can never silently turn into a range such as "\n-.".
SEPARATOR_PATTERN = re.compile("[" + re.escape(SEPARATOR_CHARS) + "]")


def split_words(line):
    """Return the words found in one line of text.

    Every separator character is replaced by a space and the result is
    split on runs of whitespace, so no empty strings are produced and
    tabs also act as word boundaries.

    Args:
        line: A single line of text, with or without its trailing newline.

    Returns:
        A list of the words in their original order and spelling.
    """
    return SEPARATOR_PATTERN.sub(" ", line).split()


def read_words(path):
    """Read the UTF-8 text file at ``path`` and return all of its words.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with every word of the file, in reading order.
    """
    words = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of loading all lines first.
        for line in f:
            words.extend(split_words(line))
    return words


def word_count(wrods, set_words):
    """Count how many times each word of ``set_words`` occurs in ``wrods``.

    Args:
        wrods: Iterable of words to count.  The misspelled parameter name
            is kept so that existing callers are not broken.
        set_words: The distinct words to report; each one starts at 0.

    Returns:
        A new dict mapping every word of ``set_words`` to its count.

    Raises:
        KeyError: If ``wrods`` contains a word missing from ``set_words``.
    """
    words_count = dict.fromkeys(set_words, 0)
    # Count the words that were passed in, not a module-level list.
    for w in wrods:
        words_count[w] += 1
    return words_count


def main(path="dream.txt"):
    """Print the total, distinct and per-word counts for the file at ``path``."""
    words = read_words(path)
    set_words = set(words)
    words_count = word_count(words, set_words)

    print("總共單詞個數(含重複):", len(words))
    print("總共單詞個數(不含重複):", len(set_words))
    print("單詞和出現的次數:", words_count)


if __name__ == "__main__":
    main()