針對結巴分詞Memory Error的兩種解決方式

針對結巴分詞Memory Error的兩種解決方式

1、背景

最近在使用 Gensim Word2vec 根據特定語料訓練近義詞模型,模型訓練要求的輸入語料是分詞後的文件。使用結巴(jieba)對原始語料文件進行分詞時,由於語料文件太大(將近五千萬的數據量),出現了 Memory Error 問題。針對此問題,本文提供以下兩種解決方式。同時,代碼中展示了分詞時對詞語詞性的篩選,以及停用詞和標點符號的過濾。最後,附上根據分詞文件進行模型訓練的代碼。

2、解決方式

解決思路:一是在讀取文件數據時避免一次性加載全部數據,單線程按行加載並處理數據;二是將存儲有大數據量的一個文件拆分爲多個文件,使用 jieba 的並行模式(jieba.enable_parallel,多進程)並行分詞。

2.1 第一種按行加載處理數據的解決方案代碼

# -*- coding: utf-8 -*-
""" Segment the raw text and save the result to a new file """
import jieba
import numpy as np
import jieba.posseg as pseg
import re

# Raw corpus to be segmented, and the file that receives the segmented text
filePath='/data/work/keyword/work_data/work_title_description.csv'
fileSegWordDonePath ='/data/work/keyword/work_cutdata/corpus_line.txt'

# Location of the stop-word list (read line by line by stopwordslist below)
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'
def stopwordslist(filepath):
    """Load the stop-word list stored at ``filepath``.

    Each line of the file is one entry; surrounding whitespace is stripped.

    The file is decoded as text so the entries are ``str`` objects. Reading it
    in binary mode yields ``bytes``, which never compare equal to the ``str``
    tokens produced by the segmenter, so a ``word not in stopwords`` test would
    silently let every stop word through.

    Args:
        filepath: Path of the stop-word file (assumed UTF-8 encoded).

    Returns:
        list[str]: The stripped lines of the file, in file order.
    """
    # 'utf-8-sig' also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise stick to the first stop word and keep it from ever matching.
    with open(filepath, 'r', encoding='utf-8-sig') as stopword_file:
        return [line.strip() for line in stopword_file]
    
# 打印中文列表
def PrintListChinese(list):
    """Print every element of ``list`` on its own line, in order."""
    for entry in list:
        print(entry)
        
# Stream the raw corpus row by row: segment each row with jieba and write the
# surviving tokens straight to the output file. Neither the raw rows nor the
# segmented corpus is ever held in memory as a whole, which is what made the
# buffered version run out of memory on a large corpus.
jieba.enable_paddle()

# A set gives constant-time membership tests; the check runs once per token.
stopwords = set(stopwordslist(stop_word_path))

# Runs of digits, whitespace and ASCII / full-width punctuation are replaced by
# a single space. Compiled once because it is applied to every kept token.
# NOTE(review): inside the first character class ":-【" is parsed as a range
# from ':' to '【', which also covers ASCII letters — confirm this is intended.
punctuation_pattern = re.compile(
    r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+")

# Part-of-speech tags to keep: exact matches and accepted prefixes.
kept_flags = ('n', 'nw', 'nz')
kept_flag_prefixes = ('TIME', 't')

with open(filePath, 'r', encoding='utf-8') as fileTrainRaw, \
        open(fileSegWordDonePath, 'w', encoding='utf-8') as fW:
    for i, line in enumerate(fileTrainRaw):
        for x in pseg.cut(line, use_paddle=True):
            # Keep only tokens whose part-of-speech tag is selected above
            if x.flag in kept_flags or x.flag.startswith(kept_flag_prefixes):
                if x.word not in stopwords:
                    # Strip punctuation, then separate tokens with a space
                    fW.write(punctuation_pattern.sub(" ", x.word))
                    fW.write(" ")
        # Progress indicator: print the row index every 100 rows
        if i % 100 == 0:
            print(i)
    # All tokens are emitted on one output line terminated by a single newline,
    # exactly as the buffered version produced.
    # NOTE(review): write the newline after every input row instead if one
    # output line per input row is wanted.
    fW.write('\n')
      
複製代碼

2.2 第二種將存儲有大數據量的一個文件拆分爲多個的解決方案代碼
複製代碼
# -*-coding:utf-8 -*-
    import jieba.analyse
    import jieba
    import os
    import jieba.posseg as pseg
    
    # Turn on jieba's parallel segmentation mode.
    # NOTE(review): the argument 4 is presumably the number of worker
    # processes — confirm against the jieba documentation.
    jieba.enable_parallel(4)
    # Directory holding the raw input files (the large corpus split into parts)
    raw_data_path = '/data/work/keyword/work_data/'
    # Directory that receives the segmented output files
    cut_data_path = '/data/work/keyword/work_cutdata/'
    # Location of the stop-word list
    stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'
    def stopwordslist(filepath):
        """Load the stop-word list stored at ``filepath``.

        Each line of the file is one entry; surrounding whitespace is stripped.

        The file is decoded as text so the entries are ``str`` objects. Reading
        it in binary mode yields ``bytes``, which never compare equal to ``str``
        tokens, so a ``word not in stopwords`` test would let every stop word
        through.

        Args:
            filepath: Path of the stop-word file (assumed UTF-8 encoded).

        Returns:
            list[str]: The stripped lines of the file, in file order.
        """
        # 'utf-8-sig' also accepts plain UTF-8 and drops a leading BOM, which
        # would otherwise stick to the first stop word.
        with open(filepath, 'r', encoding='utf-8-sig') as stopword_file:
            return [line.strip() for line in stopword_file]
    def cut_word(raw_data_path, cut_data_path):
        """Segment every file in ``raw_data_path`` and write a filtered corpus.

        Two files are produced in ``cut_data_path``:

        * ``corpus.txt``  - the jieba segmentation of every input file, tokens
          separated by spaces, each input file followed by a newline;
        * ``corpus1.txt`` - ``corpus.txt`` with the tokens found in the
          stop-word list removed, one output line per non-empty corpus line.

        Both files are written incrementally, so only one input file (first
        pass) or one corpus line (second pass) is held in memory at a time.

        Args:
            raw_data_path: Directory containing the raw text files.
            cut_data_path: Directory that receives the two output files.
        """
        corpus_path = os.path.join(cut_data_path, 'corpus.txt')
        filtered_path = os.path.join(cut_data_path, 'corpus1.txt')

        # Pass 1: segment each input file and append the result to corpus.txt.
        with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
            # sorted() makes the processing order deterministic
            file_names = sorted(os.listdir(raw_data_path))
            for count, file_name in enumerate(file_names, start=1):
                print(count)  # progress: number of files started so far
                with open(os.path.join(raw_data_path, file_name), 'rb') as f:
                    document = f.read()
                corpus_file.write(' '.join(jieba.cut(document, cut_all=False)))
                # Separate files explicitly; otherwise the last token of one
                # file is fused with the first token of the next.
                corpus_file.write('\n')

        # Pass 2: drop stop words. The corpus is split on whitespace so that
        # whole tokens are compared; iterating over the text itself would
        # yield single characters and break every multi-character word apart.
        stopwords = set(stopwordslist(stop_word_path))
        with open(corpus_path, 'r', encoding='utf-8') as corpus_file, \
                open(filtered_path, 'w', encoding='utf-8') as filtered_file:
            for line in corpus_file:
                kept = [word for word in line.split() if word not in stopwords]
                if kept:
                    filtered_file.write(' '.join(kept))
                    filtered_file.write('\n')
    # Run the segmentation only when this file is executed as a script
    if __name__ == "__main__":
        cut_word(raw_data_path, cut_data_path )
複製代碼

3、使用Gensim Word2vec訓練模型

""" Obtain word vectors with gensim word2vec """
import warnings
import logging
import os.path
import sys
import multiprocessing
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Silence UserWarning messages raised from the gensim module
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0]) # file name of the running script
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # inp is the segmented input corpus, out_model the saved model file and
    # out_vector the word vectors in word2vec text format
    inp = '/data/work/keyword/work_cutdata/corpus_line.txt'
    out_model = '/data/work/keyword/word2vec_model/work_title_description.model'
    out_vector = '/data/work/keyword/word2vec_model/work_title_description.vector'
    # Train the Word2Vec model on the corpus, using one worker per CPU core.
    # NOTE(review): no `sg` argument is passed; per the gensim documentation
    # the default trains CBOW, not skip-gram — pass sg=1 if skip-gram is
    # intended.
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 is understood to
    # name it `vector_size` — confirm against the installed version.
    model = Word2Vec(LineSentence(inp), size=50, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    # Save the full model
    model.save(out_model)
    # Save the word vectors as text (binary=False)
    model.wv.save_word2vec_format(out_vector, binary=False)

複製代碼

4、總結

在開發過程中,最終使用第一種按行讀取文件數據的方式進行分詞,並訓練得到模型。第二種方式讀取的是目錄下的多個文件,測試時將語料拆分爲20個文件分別讀取,Memory Error問題不再出現。

參考:

blog.csdn.net/lilong11719…

blog.csdn.net/qq_35273499…

作者:易企秀工程師 Emma

相關文章
相關標籤/搜索