在Windows下使用gensim.models.word2vec.LineSentence加載中文維基百科語料庫(已分詞)時報以下錯誤:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 0: invalid continuation byte
這種編碼問題真的很讓人頭疼,這種問題都是出現在xxx.decode("utf-8")的時候,因此接下來咱們來看看gensim中的源碼:
class LineSentence(object):
    """Iterate over a file that contains sentences: one line = one sentence.

    Words must be already preprocessed and separated by whitespace.
    """

    def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
        """
        Parameters
        ----------
        source : string or a file-like object
            Path to the file on disk, or an already-open file object (must support `seek(0)`).
        limit : int or None
            Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> sentences = LineSentence(datapath('lee_background.cor'))
            >>> for sentence in sentences:
            ...     pass
        """
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

    def __iter__(self):
        """Iterate through the lines in the source."""
        try:
            # Assume it is a file-like object and try treating it as such
            # Things that don't have seek will trigger an exception
            self.source.seek(0)
            for line in itertools.islice(self.source, self.limit):
                # NOTE(review): to_unicode() defaults to strict utf-8 decoding --
                # this is the call that raises UnicodeDecodeError on non-utf-8 input.
                line = utils.to_unicode(line).split()
                i = 0
                # Chunk long lines into pieces of at most max_sentence_length tokens.
                while i < len(line):
                    yield line[i: i + self.max_sentence_length]
                    i += self.max_sentence_length
        except AttributeError:
            # If it didn't work like a file, use it as a string filename.
            # NOTE(review): smart_open() opens the file in binary mode by default,
            # so the bytes reach to_unicode() undecoded.
            with utils.smart_open(self.source) as fin:
                for line in itertools.islice(fin, self.limit):
                    line = utils.to_unicode(line).split()
                    i = 0
                    while i < len(line):
                        yield line[i: i + self.max_sentence_length]
                        i += self.max_sentence_length
從源碼中能夠看到__iter__方法讓LineSentence成爲了一個可迭代的對象,並且文件讀取的方法也都定義在__iter__方法中。通常咱們輸入的source參數都是一個文件路徑(也就是一個字符串形式),所以在try時,self.source.seek(0)會報「字符串沒有seek方法」的錯,因此真正執行的代碼是在except中。
接下來咱們有兩種方法來解決咱們的問題:
1)from gensim import utils
utils.smart_open(url, mode="rb", **kw)
在源碼中用utils.smart_open()方法打開文件時默認是用二進制的形式打開的,能夠將mode="rb" 改爲mode="r"。
2)from gensim import utils
utils.to_unicode(text, encoding='utf8', errors='strict')
在源碼中在decode("utf8")時,其默認errors="strict", 能夠將其改爲errors="ignore"。即utils.to_unicode(line, errors="ignore")
不過建議你們不要直接在源碼上修改,能夠直接將源碼複製下來,例如:
import logging
import itertools

import gensim
from gensim.models import word2vec
from gensim import utils

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class LineSentence(object):
    """Iterate over a file that contains sentences: one line = one sentence.

    Words must be already preprocessed and separated by whitespace.

    This is a local copy of ``gensim.models.word2vec.LineSentence`` with one
    change: the file is opened with ``mode="r"`` (text mode) instead of the
    default binary mode, which avoids the strict utf-8 decode inside
    ``utils.to_unicode`` failing on non-utf-8 corpora.
    """

    def __init__(self, source, max_sentence_length=10000, limit=None):
        """
        Parameters
        ----------
        source : string or a file-like object
            Path to the file on disk, or an already-open file object (must support `seek(0)`).
        max_sentence_length : int
            Maximum number of tokens yielded per sentence chunk.
        limit : int or None
            Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default).
        """
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

    def _chunks(self, tokens):
        """Yield successive slices of `tokens`, each at most `max_sentence_length` long."""
        for start in range(0, len(tokens), self.max_sentence_length):
            yield tokens[start: start + self.max_sentence_length]

    def __iter__(self):
        """Iterate through the lines in the source, yielding lists of tokens."""
        try:
            # Assume it is a file-like object and try treating it as such.
            # Things that don't have seek will trigger an AttributeError.
            self.source.seek(0)
            for line in itertools.islice(self.source, self.limit):
                for chunk in self._chunks(utils.to_unicode(line).split()):
                    yield chunk
        except AttributeError:
            # If it didn't work like a file, use it as a string filename.
            # mode="r": open in text mode so decoding uses the platform default
            # instead of the binary-read + strict utf-8 decode that raised
            # UnicodeDecodeError in the original implementation.
            with utils.smart_open(self.source, mode="r") as fin:
                for line in itertools.islice(fin, self.limit):
                    for chunk in self._chunks(utils.to_unicode(line).split()):
                        yield chunk


our_sentences = LineSentence("./zhwiki_token.txt")
# Large corpus: CBOW (the default), with an increased number of iterations.
model = gensim.models.Word2Vec(our_sentences, size=200, iter=30)
# Save in this format so training can later be resumed incrementally.
model.save("./mathWord2Vec" + ".model")