TensorFlow 實現 Word2Vec(找到目標英文單詞的相近詞)

根據自己的理解寫的讀書筆記。

import collections
import math
import os
import random
import zipfile
import urllib
import numpy as np
import tensorflow as tf

#定義下載文本數據的函數
# url = 'http://mattmahoney.net/dc/'
#
# def maybe_download(filename,expected_bytes):
# if not os.path.exists(filename):
# filename,_ = urllib.request.urlretrieve(url + filename,filename)
# statinfo = os.stat(filename) #訪問一個文件的詳細信息。
# if statinfo.st_size == expected_bytes: #文件大小(以字節爲單位)
# print('Found and verified(驗證)',filename)
# else:
# print(statinfo.st_size)
# raise Exception('Failed to verify(驗證)' + filename + 'Can you get to it with a browser(瀏覽器)?')
# return filename
#
# filename = maybe_download('text8.zip',31344016)

# Local path of the text8 archive. The download helper above is commented
# out, so this file must already exist next to the script.
filename = './text8.zip'

我有幾張阿里雲幸運券分享給你,用券購買或者升級阿里雲相應產品會有特惠驚喜哦!把想要買的產品的幸運券都領走吧!快下手,立刻就要搶光了。


def read_data(filename):
    """Unzip an archive and return its text as a list of words.

    Only the first member of the archive is read.

    Args:
        filename: Path to the zip archive.

    Returns:
        A list of strings obtained by decoding the first archive member as
        UTF-8 and splitting it on whitespace.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        # Take the first name from the member list and read its raw bytes.
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Decoding bytes as UTF-8 is what tf.compat.as_str does for bytes input,
    # so the plain bytes method is used and TensorFlow is not needed here.
    return raw_bytes.decode('utf-8').split()

# Load the archive's text as a flat list of word strings.
words = read_data(filename)
# print('Data size',len(words))
# print(words)

# Vocabulary size: build_dataset keeps the (vocabulary_size - 1) most frequent
# words plus one 'UNK' placeholder entry, and stores them in a dictionary.
vocabulary_size = 50000

def build_dataset(words, vocab_size=None):
    """Encode a word sequence as integer ids using a frequency-ranked vocabulary.

    The vocabulary holds an 'UNK' placeholder at id 0 followed by the
    ``vocab_size - 1`` most frequent words in descending frequency order.

    Args:
        words: Sequence of word strings. It is iterated twice, so it must be
            re-iterable (for example a list).
        vocab_size: Total number of vocabulary entries, including 'UNK'.
            When omitted, the module-level ``vocabulary_size`` is used.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)`` where

        * ``data`` is a list with one id per input word; words outside the
          vocabulary get id 0.
        * ``count`` is a list starting with ``['UNK', <number of unknown
          words>]`` followed by ``(word, frequency)`` tuples.
        * ``dictionary`` maps word -> id.
        * ``reverse_dictionary`` maps id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # The UNK count starts at -1 as a placeholder; it is filled in below.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # Assign ids in order of appearance in `count`, so 'UNK' gets 0 and more
    # frequent words get smaller ids.
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = []
    unk_count = 0  # number of words that fall outside the vocabulary
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Word is not in the dictionary: map it to the 'UNK' id.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}

    return data, count, dictionary, reverse_dictionary

相關文章
相關標籤/搜索