Python 簡單的文本分析

時間 2019-12-07

原文鏈接

import collections
import re


# Load the whole of tips.txt (decoded as UTF-8) as a single str, then
# lower-case it so the ASCII-only [a-z] patterns applied to mytips further
# down also match letters that were upper-case in the file.
with open("tips.txt", mode="r", encoding="utf-8") as tip:
    raw_text = tip.read()
mytips = raw_text.lower()

# Delete every run of non-word characters (\W: whitespace, punctuation, ...).
# Letters of any script, digits and underscores count as word characters,
# so they all remain in strip_file.
non_word_runs = re.compile(r"\W+")
strip_file = non_word_runs.sub("", mytips)
# The doubled newline reproduces the blank separator line after the result.
print(f"正則去除非中英文字符:\n{strip_file}", end="\n\n")

# Collect every maximal run of ASCII lower-case letters from mytips, in order
# of appearance; each run is treated as one English word.
ascii_word = re.compile(r'[a-z]+')
only_enlish = ascii_word.findall(mytips)
print(f'篩選出全部英文單詞:\n{only_enlish}')

# Keep what is left of mytips after deleting ASCII letters, digits,
# underscores and non-word characters (whitespace, punctuation). For a mixed
# Chinese/English text that remainder is the Chinese characters.
# The class spells out \d and "_" on purpose: a plain "1-9" range lets the
# digit 0 through, and "_" is a word character that \W does not cover.
# NOTE: letters from other non-ASCII scripts would also survive this filter.
only_chinese = re.sub(r"[a-z\d_\W]+", "", mytips)
# One list entry per remaining character.
only_chinese_split = list(only_chinese)
print('篩選出全部的中文\n{}'.format(only_chinese_split))

# Count how often each English word occurs and keep the five most frequent
# as (word, count) pairs, highest count first.
# Calling most_common() without an argument would return every word instead.
word_counter = collections.Counter(only_enlish)
most_comm_word = word_counter.most_common(5)
print(f"打印頻率最高的五個字符{most_comm_word}")


# Copy the top-five pairs and order the copy by ascending count.
# The sort is stable, so pairs with equal counts keep their relative order
# from most_comm_word.
low_comm_word = list(most_comm_word)
low_comm_word.sort(key=lambda pair: pair[1])
print(f"反序輸出most_comm_word{low_comm_word}")

# Keep only the (word, count) pairs whose count lies in the half-open range
# [3, 5). A comprehension with a chained comparison replaces the
# filter/lambda with its redundant "True if ... else False".
specified_most_comm_word = [item for item in most_comm_word if 3 <= item[1] < 5]
# The label states the range the filter really applies (>= 3 and < 5).
print("打印(大於等於3小於5）指定值的most_comm_word{}".format(specified_most_comm_word))

# Turn the (word, count) pairs into a {word: count} dictionary.
dict_most_comm_word = {term: freq for term, freq in most_comm_word}
print(f'轉化成字典：{dict_most_comm_word}')

# Transpose the (word, count) pairs into one tuple of words and one tuple of
# counts. zip() is lazy in Python 3, but tuple unpacking consumes it directly,
# so no intermediate list is needed.
if most_comm_word:
    word, count = zip(*most_comm_word)
else:
    # With no English words, zip(*[]) yields nothing and unpacking it into
    # two names would raise ValueError; fall back to two empty tuples.
    word, count = (), ()
print('單獨打印word：{}'.format(word))
print("單獨打印count:{}".format(count))

# defaultdict demo: map each English word to the list of 0-based indexes at
# which it occurs in only_enlish. A missing key starts out as an empty list,
# so no existence check is needed before appending.
enlish_dict = collections.defaultdict(list)
for position, token in enumerate(only_enlish):
    positions_of_token = enlish_dict[token]
    positions_of_token.append(position)
print(f'統計每一個單詞出現的位置：{enlish_dict}')


# OrderedDict demo: rebuild the word -> positions mapping with its keys
# inserted in ascending (a-z) order.
order_english_dict = collections.OrderedDict(
    (token, enlish_dict[token]) for token in sorted(enlish_dict)
)
print(f'單詞從a-z進行排序:\n{order_english_dict}')

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。