Sociolinguistics in the Internet Era: SNS-Based Text Data Mining (Python)

# -*- coding: utf-8 -*-
import re
import collections
import math
 
def info_entropy(words):
    # Shannon entropy (natural log) of a Counter of neighbouring characters
    result = 0.0
    total = sum(words.values())
    for word, cnt in words.items():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
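# A quick sanity check (not from the original post): for neighbour counts
# Counter({'的': 2, '了': 1, '是': 1}) the probabilities are 0.5, 0.25 and 0.25,
# so info_entropy returns -(0.5*log(0.5) + 2*0.25*log(0.25)) ≈ 1.04 (natural log).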
 
max_word_len = 5            # maximum candidate length in characters
entropy_threshold = 1       # boundary (neighbour) entropy threshold
num_threshold = 3           # minimum occurrence count for a candidate
con_threshold = 100         # cohesion threshold: the higher, the more likely the candidate is a real word
txtin = 'luxunzawen.txt'
txtout = 'luxunzawen_out.txt'
f = open(txtin, 'r', encoding='utf-8').read()

bef_sentences = re.split(r"\W+|[a-zA-Z0-9]+|\s+|\n+", f)    # punctuation, whitespace, ASCII letters and digits all act as separators
sentences = [one for one in bef_sentences if len(one) > 1]
print("sentences:finish")
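# For illustration (sample text, not from the original corpus):
# re.split(r"\W+|[a-zA-Z0-9]+|\s+|\n+", "魯迅1881年生於紹興。他是作家。")
# returns ['魯迅', '年生於紹興', '他是作家', ''] ; the length filter above then
# drops the empty and one-character fragments.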

freq = collections.Counter()                        # n-gram -> occurrence count
for sentence in sentences:
    if sentence:
        l = len(sentence)
        wl = min(l, max_word_len)
        for i in range(1, wl + 1):
            for j in range(0, l - i + 1):
                freq[sentence[j:j + i]] += 1
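# Every substring of length 1 to max_word_len is counted as a candidate: a sentence
# such as "社會語言學" (5 characters, example not from the corpus) contributes
# 社, 會, ..., 社會, 會語, ..., all the way up to 社會語言學 itself.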
                
total = sum(freq.values())
ps = collections.defaultdict(float)
for word, val in freq.items():
    ps[word] = float(val) / total               # relative frequency of each n-gram
 
words = set()
for word, word_p in ps.items():
    if len(word) > 1:
        p = 0
        for i in range(1, len(word)):
            t = ps[word[0:i]] * ps[word[i:]]    # probability of this two-piece split
            p = max(p, t)
        if freq[word] >= num_threshold and word_p / p > con_threshold:    # frequent enough, and far more cohesive than any split
            words.add(word)
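# In plain terms: a candidate survives only if it is much more frequent than any way of
# assembling it from two shorter pieces. For a three-character candidate abc the score is
# p(abc) / max(p(a)*p(bc), p(ab)*p(c)), and con_threshold = 100 demands a wide margin.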
 
final_words = set()
for word in words:
    lf = rf = True
    left_words = collections.Counter()          # characters seen immediately to the left of word
    right_words = collections.Counter()         # characters seen immediately to the right of word
    pattern = re.compile(re.escape(word).join(['.?', '.?']))
    for sentence in sentences:
        l = pattern.findall(sentence)
        flag_l = flag_r = False
        if l:
            if l[0][0] != word[0]:
                left_words[l[0][0]] += 1        # a character appears to the left
                flag_l = True
            else:
                lf = flag_l
            if l[0][-1] != word[-1]:
                right_words[l[0][-1]] += 1      # a character appears to the right
                flag_r = True
            else:
                rf = flag_r

    # Keep the candidate only if its neighbours vary enough: low neighbour entropy
    # suggests it is really a fragment of a longer expression.
    left_info_entropy = info_entropy(left_words)
    right_info_entropy = info_entropy(right_words)

    if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
        continue
    if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
        continue
    final_words.add(word)
    
words_list = sorted(final_words, key=lambda w: freq[w], reverse=True)
# for word in words_list:
#     print(word, freq[word])

    
with open(txtout, 'w', encoding='utf-8') as fout:
    for word in words_list:
        fout.write(word + "   " + str(freq[word]) + "\n")
    

print('Done')
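The output file holds one candidate per line, word first and count last, already sorted by descending frequency. As a minimal sketch (assuming the script has been run and luxunzawen_out.txt exists), the result can be loaded back and the top candidates inspected like this:

# Minimal sketch: read the result file written above and show the 20 most frequent candidates.
with open('luxunzawen_out.txt', encoding='utf-8') as fin:
    pairs = [line.rsplit(None, 1) for line in fin if line.strip()]
for word, count in pairs[:20]:
    print(word, count)

The same script applies to the SNS texts in the title: point txtin at the exported corpus and, for noisier material, consider raising num_threshold or entropy_threshold.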

For the C# version, see here.
