Implementing FMM and BMM. Both algorithms are conceptually simple. FMM (forward maximum matching) takes, from the front of the sentence, a span as long as the maximum word length and looks it up in the dictionary; if it is not there, the span drops its last character and is looked up again, until the span either shrinks to a single character or is found in the dictionary. The matched piece is then cut off the sentence and the process repeats. BMM (backward maximum matching) takes its span from the end of the sentence instead and, on a miss, drops the span's first character, repeating the analogous steps.
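The two matching strategies can be sketched in a few lines. This is a minimal illustration on a toy dictionary; the names `fmm`, `bmm`, `DICT`, and `MAX_LEN` are made up for this sketch and are not taken from the scripts below.

```python
# Toy dictionary and maximum word length; illustrative only.
DICT = {"香港", "科技", "大學", "學生", "香港科技大學"}
MAX_LEN = 6  # length of the longest dictionary entry

def fmm(sentence):
    """Forward maximum matching: try the longest prefix, shrink on a miss."""
    result = []
    while sentence:
        n = min(MAX_LEN, len(sentence))
        # drop the last character until the span is a dictionary word
        while n > 1 and sentence[:n] not in DICT:
            n -= 1
        result.append(sentence[:n])  # n == 1 keeps the single character
        sentence = sentence[n:]
    return result

def bmm(sentence):
    """Backward maximum matching: same idea, scanning from the right end."""
    result = []
    while sentence:
        n = min(MAX_LEN, len(sentence))
        # drop the span's first character until it is a dictionary word
        while n > 1 and sentence[-n:] not in DICT:
            n -= 1
        result.append(sentence[-n:])
        sentence = sentence[:-n]
    return result[::-1]  # words were collected back to front, so reverse

print(fmm("香港科技大學"))  # ['香港科技大學'] — the longest match wins
print(fmm("大學生"))        # ['大學', '生']
print(bmm("大學生"))        # ['大', '學生'] — the two directions can disagree
```

The last two calls show the classic case where FMM and BMM disagree, which is why comparing both outputs is a common sanity check.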
readCorpus.py
```python
import sys

output = {}
with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None:
            # strip the trailing newline from each line
            t_line = line.strip('\n')
            # split the line into tokens on spaces
            words = t_line.split(' ')
            for word in words:
                # split word and POS tag on '/'
                t_word = word.split('/')
                # drop a leading left bracket
                tf_word = t_word[0].split('[')
                if len(tf_word) == 2:
                    f_word = tf_word[1]
                else:
                    f_word = t_word[0]
                # increment the count if the word is already in the dict
                if f_word in output.keys():
                    output[f_word] = output[f_word] + 1
                # otherwise create a new entry
                else:
                    output[f_word] = 1
            # bracketed groups form large-granularity compound words
            big_word1 = t_line.split('[')
            for i in range(1, len(big_word1)):
                big_word2 = big_word1[i].split(']')[0]
                words = big_word2.split(' ')
                big_word = ""
                for word in words:
                    # split word and POS tag on '/'
                    t_word = word.split('/')
                    big_word = big_word + t_word[0]
                # increment the count if the compound is already in the dict
                if big_word in output.keys():
                    output[big_word] = output[big_word] + 1
                # otherwise create a new entry
                else:
                    output[big_word] = 1

with open('output.txt', mode='w', encoding='UTF-8') as f:
    # repeatedly pull out the minimum-count entry, so the file
    # ends up sorted by ascending frequency
    while output:
        minNum = sys.maxsize
        minName = ""
        for key, values in output.items():
            if values < minNum:
                minNum = values
                minName = key
        f.write(minName + ": " + str(minNum) + "\n")
        del output[minName]
```
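The repeated minimum-extraction loop at the end is O(n²) in the vocabulary size. The same counting and ascending-frequency write-out can be expressed with the standard library; this is only a sketch, and the sample line is invented in the bracketed tagged format the script parses.

```python
from collections import Counter

counts = Counter()
# one made-up corpus line in the bracketed word/TAG format (illustrative)
line = "19980101-01-001-002/m  [香港/ns  科技/n  大學/n]nt"

for token in line.split():
    # word is the part before '/'; drop a leading left bracket
    word = token.split('/')[0].lstrip('[')
    counts[word] += 1

# bracketed groups are also counted as one large-granularity compound
for chunk in line.split('[')[1:]:
    inner = chunk.split(']')[0]
    counts[''.join(t.split('/')[0] for t in inner.split())] += 1

# write entries in ascending frequency, like output.txt
for word, num in sorted(counts.items(), key=lambda kv: kv[1]):
    print(word + ": " + str(num))
```

`Counter` handles the "create or increment" branching, and `sorted` with a key replaces the selection loop in one pass.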
BMM.py
```python
MAX_WORD = 19
word_list = []
ans_word = []
with open('output.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None:
            word = line.split(':')
            word_list.append(word[0])

# num = input("Number of sentences: ")
# for i in range(int(num)):
while True:
    ans_word = []
    try:
        origin_sentence = input("Input:\n")
        while len(origin_sentence) != 0:
            len_word = MAX_WORD
            while len_word > 0:
                # take up to MAX_WORD characters from the end; if that span
                # is in the dictionary, record it and cut it off the sentence
                if origin_sentence[-len_word:] in word_list:
                    ans_word.append(origin_sentence[-len_word:])
                    len_sentence = len(origin_sentence)
                    origin_sentence = origin_sentence[0:len_sentence - len_word]
                    break
                # not in the dictionary: shorten the span by one character
                else:
                    len_word = len_word - 1
            # len_word == 0 means a single character; record it directly
            if len_word == 0:
                if origin_sentence[-1:] != ' ':
                    ans_word.append(origin_sentence[-1:])
                len_sentence = len(origin_sentence)
                origin_sentence = origin_sentence[0:len_sentence - 1]
        # words were collected back to front, so print them in reverse
        for j in range(len(ans_word) - 1, -1, -1):
            print(ans_word[j] + '/', end='')
        print('\n')
    except (KeyboardInterrupt, EOFError):
        break
```
FMM.py
```python
MAX_WORD = 19
word_list = []
with open('output.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None:
            word = line.split(':')
            word_list.append(word[0])

# num = input("Number of sentences: ")
# for i in range(int(num)):
while True:
    try:
        origin_sentence = input("Input:\n")
        while len(origin_sentence) != 0:
            len_word = MAX_WORD
            while len_word > 0:
                # take up to MAX_WORD characters from the front; if the span
                # is in the dictionary, print it and cut it off the sentence
                if origin_sentence[0:len_word] in word_list:
                    print(origin_sentence[0:len_word] + '/', end='')
                    origin_sentence = origin_sentence[len_word:]
                    break
                # not in the dictionary: shorten the span by one character
                else:
                    len_word = len_word - 1
            # len_word == 0 means a single character; print it directly
            if len_word == 0:
                if origin_sentence[0] != ' ':
                    print(origin_sentence[0] + '/', end='')
                origin_sentence = origin_sentence[1:]
        print('\n')
    except (KeyboardInterrupt, EOFError):
        break
```
Screenshots
We can see that with the large-granularity entries included, strongly expressive words such as 香港科技大學 and 北京航空航天大學 are kept together rather than split apart, which better matches the segmentation requirement.