Recently, while using Gensim Word2vec to train a near-synonym model on a specific corpus, I needed the training input to be a file that had already been word-segmented. When segmenting the raw corpus file with jieba, a MemoryError occurred because the corpus was very large, close to fifty million records. This post presents two ways to solve that problem. The code also shows how to filter segmented words by part of speech and how to remove stop words and punctuation; the code for training the model from the segmented file is attached at the end.

The two solutions are: first, avoid loading all the data at once when reading the file and instead process it line by line in a single thread; second, split the single file holding the large data set into several smaller files and segment them in parallel.

## 2.1 Code for the first solution: read and process the data line by line
```python
# -*- coding: utf-8 -*-
""" Segment the raw text with jieba and save the result to a new file """
import re
import jieba
import jieba.posseg as pseg

filePath = '/data/work/keyword/work_data/work_title_description.csv'
fileSegWordDonePath = '/data/work/keyword/work_cutdata/corpus_line.txt'
# Stop-word list
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'

def stopwordslist(filepath):
    # One stop word per line
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

# Print a list of Chinese strings
def PrintListChinese(list):
    for i in range(len(list)):
        print(list[i])

# Read the file into a list, line by line
fileTrainRead = []
with open(filePath, 'r') as fileTrainRaw:
    for line in fileTrainRaw:  # iterate over the file line by line
        fileTrainRead.append(line)

# Segment each line with jieba and collect the results
fileTrainSeg = []
jieba.enable_paddle()
stopwords = stopwordslist(stop_word_path)  # load the stop words
for i in range(len(fileTrainRead)):
    outstr = ''  # reset for every line, otherwise the output keeps accumulating
    for x in pseg.cut(fileTrainRead[i], use_paddle=True):
        # Keep only words with the selected part-of-speech tags
        if x.flag == 'n' or x.flag == 'nw' or x.flag == 'nz' or x.flag.startswith('TIME') or x.flag.startswith('t'):
            if x.word not in stopwords:
                # Strip digits and punctuation
                y = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", x.word)
                if y != '\t':
                    outstr += y
                    outstr += " "
    if i % 100 == 0:
        print(i)
    fileTrainSeg.append([outstr])

# Write the segmentation result to the output file
with open(fileSegWordDonePath, 'w', encoding='utf-8') as fW:
    for i in range(len(fileTrainSeg)):
        fW.write(fileTrainSeg[i][0])
        fW.write('\n')
```
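If even the `fileTrainRead` and `fileTrainSeg` lists are too large to keep in memory, the same idea can be pushed further and made fully streaming: segment each line as it is read and write it out immediately, so only one line is ever held in memory. Below is a minimal sketch under the same paths, stop-word list and part-of-speech filter as above (paddle mode is omitted for brevity; it can be re-enabled as in the original script):

```python
# -*- coding: utf-8 -*-
""" Fully streaming variant: segment and write line by line, never holding the corpus in memory """
import re
import jieba.posseg as pseg

filePath = '/data/work/keyword/work_data/work_title_description.csv'
fileSegWordDonePath = '/data/work/keyword/work_cutdata/corpus_line.txt'
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'

with open(stop_word_path, 'r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

with open(filePath, 'r') as fin, open(fileSegWordDonePath, 'w', encoding='utf-8') as fout:
    for i, line in enumerate(fin):  # the file object is an iterator: one line in memory at a time
        words = []
        for x in pseg.cut(line):
            # Same part-of-speech filter as the original script
            if (x.flag in ('n', 'nw', 'nz') or x.flag.startswith(('t', 'TIME'))) and x.word not in stopwords:
                y = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", x.word)
                if y.strip():
                    words.append(y)
        fout.write(' '.join(words) + '\n')  # write each line as soon as it is segmented
        if i % 100 == 0:
            print(i)
```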
## 2.2 Code for the second solution: split the single large file into multiple files
```python
# -*- coding: utf-8 -*-
import os
import jieba

# Parallel segmentation with 4 worker processes (not supported on Windows)
jieba.enable_parallel(4)

raw_data_path = '/data/work/keyword/work_data/'
cut_data_path = '/data/work/keyword/work_cutdata/'
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'

def stopwordslist(filepath):
    # One stop word per line
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

def cut_word(raw_data_path, cut_data_path):
    # Read every data file under the input directory
    data_file_list = os.listdir(raw_data_path)
    corpus = ''
    temp = 0
    for file in data_file_list:
        with open(raw_data_path + file, 'rb') as f:
            print(temp + 1)
            temp += 1
            document = f.read()
            document_cut = jieba.cut(document, cut_all=False)
            result = ' '.join(document_cut)
            corpus += result
    # Write the space-separated segmentation result
    with open(cut_data_path + 'corpus.txt', 'w+', encoding='utf-8') as f:
        f.write(corpus)

    stopwords = stopwordslist(stop_word_path)  # load the stop words
    with open(cut_data_path + 'corpus.txt', 'r', encoding='utf-8') as f:
        document_cut = f.read()
    outstr = ''
    for word in document_cut.split():  # iterate over words, not characters
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    # Write the corpus with stop words removed
    with open(cut_data_path + 'corpus1.txt', 'w+', encoding='utf-8') as f:
        f.write(outstr)

if __name__ == "__main__":
    cut_word(raw_data_path, cut_data_path)
```
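The script above assumes the large corpus has already been split into several files under `raw_data_path`; the split step itself is not shown in the original code. One possible way to do it, as a sketch (the `lines_per_file` chunk size and the `split_data_path` output directory are illustrative assumptions, not values from the article):

```python
# -*- coding: utf-8 -*-
""" Split one large text file into several smaller ones so they can be segmented in parallel """
import os

big_file_path = '/data/work/keyword/work_data/work_title_description.csv'  # the single large file
split_data_path = '/data/work/keyword/work_data_split/'                    # output directory (assumed)
lines_per_file = 2500000                                                    # illustrative chunk size

os.makedirs(split_data_path, exist_ok=True)

part, out = 0, None
with open(big_file_path, 'r', encoding='utf-8') as fin:
    for i, line in enumerate(fin):
        if i % lines_per_file == 0:  # start a new chunk every lines_per_file lines
            if out:
                out.close()
            part += 1
            out = open(os.path.join(split_data_path, 'part_%02d.txt' % part), 'w', encoding='utf-8')
        out.write(line)
if out:
    out.close()
```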
# 3. Training the model with Gensim Word2vec
""" gensim word2vec獲取詞向量 """
import warnings
import logging
import os.path
import sys
import multiprocessing
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# 忽略警告
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
if __name__ == '__main__':
program = os.path.basename(sys.argv[0]) # 讀取當前文件的文件名
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
# inp爲輸入語料, outp1爲輸出模型, outp2爲vector格式的模型
inp = '/data/work/keyword/work_cutdata/corpus_line.txt'
out_model = '/data/work/keyword/word2vec_model/work_title_description.model'
out_vector = '/data/work/keyword/word2vec_model/work_title_description.vector'
# 訓練skip-gram模型
model = Word2Vec(LineSentence(inp), size=50, window=5, min_count=5,
workers=multiprocessing.cpu_count())
# 保存模型
model.save(out_model)
# 保存詞向量
model.wv.save_word2vec_format(out_vector, binary=False)
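Once training finishes, the saved model can be reloaded and queried for near-synonyms, which is the goal stated at the beginning. A short usage sketch (the query word is only an illustration; it must occur at least `min_count` times in the corpus to be in the vocabulary):

```python
# -*- coding: utf-8 -*-
""" Query the trained model for near-synonyms """
from gensim.models import Word2Vec, KeyedVectors

out_model = '/data/work/keyword/word2vec_model/work_title_description.model'
out_vector = '/data/work/keyword/word2vec_model/work_title_description.vector'

# Reload the full model (which can be trained further) ...
model = Word2Vec.load(out_model)
# ... or only the word vectors saved in text format
wv = KeyedVectors.load_word2vec_format(out_vector, binary=False)

query = '设计'  # illustrative query word
if query in wv:
    for word, score in wv.most_similar(query, topn=10):
        print(word, score)
```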
During development, the first approach, reading the file line by line, was the one finally used to segment the corpus and train the model. The second approach reads multiple files from a directory; for testing, the data was split into 20 files that were read separately, and the Memory Error no longer appeared.
References:
blog.csdn.net/lilong11719…
blog.csdn.net/qq_35273499…

Author: Emma, engineer at 易企秀