1. 下載wiki中文分詞語料。使用迅雷下載會快很多,大小爲1個多G。
下載 opencc-1.0.1-win64.7z,並解壓放置到自定義的目錄下。
# The separator is a bytes literal here; the original code used the str
# separator: space = ' '
space = b' '
for text in wiki.get_texts():
    # Join the article's tokens, decode once, and terminate the line.
    s = space.join(text).decode('utf8') + "\n"
    output.write(s)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Revised code follows:
# Reads the input given as the first command-line argument through gensim's
# WikiCorpus and writes every article as one line of space-separated tokens
# to the UTF-8 text file named by the second argument.
import logging
import os.path
import sys

from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        # There is no module docstring, so formatting globals()['__doc__']
        # would raise TypeError on None; print an explicit usage line instead.
        print("Usage: python %s <wiki-dump> <output-text-file>" % program)
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    space = b' '
    i = 0
    # The context manager guarantees the output file is flushed and closed
    # even if corpus iteration fails part-way through.
    with open(outp, 'w', encoding='utf-8') as output:
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
        for i, text in enumerate(wiki.get_texts(), 1):
            # Joining with b' ' requires bytes tokens.
            # NOTE(review): confirm get_texts() yields bytes for the
            # installed gensim version.
            s = space.join(text).decode('utf8') + "\n"
            output.write(s)
            # Progress report every 10000 articles.
            if i % 10000 == 0:
                logger.info("Saved " + str(i) + " articles")
    logger.info("Finished Saved " + str(i) + " articles")
import codecs,sys
import opencc
# Print the first line of zh.wiki.txt. The context manager closes the file
# handle as soon as the line has been read, even if reading raises.
with codecs.open('zh.wiki.txt', 'r', encoding="utf8") as f:
    line = f.readline()
print(line)
import jieba
import jieba.analyse
import jieba.posseg as pseg
import codecs,sys
def cut_words(sentence):
    """Segment *sentence* with jieba and return the tokens joined by single
    spaces, encoded as UTF-8 bytes."""
    tokens = jieba.cut(sentence)
    joined = " ".join(tokens)
    return joined.encode('utf-8')
# Segment zh.jian.wiki.txt line by line with jieba and write the
# space-joined tokens to zh.jian.wiki.seg.txt (both files UTF-8).
# Both files are managed by one `with` so they are closed even when
# segmentation raises. The script ends once the input is exhausted.
with codecs.open('zh.jian.wiki.txt', 'r', encoding="utf8") as f, \
        codecs.open("zh.jian.wiki.seg.txt", 'w', encoding="utf8") as target:
    print('open files')
    # line_num is 1-based and used only for the progress message.
    for line_num, line in enumerate(f, 1):
        print('---- processing ', line_num, ' article----------------')
        # `line` keeps its trailing newline, so the output stays one
        # article per line without adding a separator here.
        line_seg = " ".join(jieba.cut(line))
        target.write(line_seg)
python train_word2vec_model.py zh.jian.wiki.seg.txt wiki.zh.text.model wiki.zh.text.vector
# Trains a Word2Vec model on the corpus file given as the first command-line
# argument (read through LineSentence), saves the model to the second
# argument and the text-format vectors to the third.
import logging
import os.path
import sys
import multiprocessing

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        # There is no module docstring, so formatting globals()['__doc__']
        # would raise TypeError on None; print an explicit usage line instead.
        print("Usage: python %s <segmented-corpus> <model-output> <vector-output>"
              % program)
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # One worker thread per CPU core reported by multiprocessing.
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    # NOTE(review): later gensim releases are believed to provide
    # save_word2vec_format on model.wv rather than on the model itself —
    # confirm against the installed version. Fall back to the model when
    # it has no `wv` attribute.
    vectors = getattr(model, 'wv', model)
    vectors.save_word2vec_format(outp2, binary=False)