Problem addressed: the curse of dimensionality in statistical language modeling — classical n-gram models cannot generalize to word sequences that never appear in the training data.
Idea of the proposed method: learn a distributed (real-valued) feature vector for each word and, jointly, a neural network that models the probability of the next word given the feature vectors of the preceding words, so that similar words lead to similar predictions.
Model:
x is the activation vector of the first (input) layer: the concatenation of the input word feature vectors taken from matrix C.
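For reference, the forward computation that the code below implements can be written compactly (this restates the standard NNLM formulation; w_{t-1}, ..., w_{t-n+1} are the n-1 context words, and the Wx term is the optional direct connection from the input features to the output):

$x = \big(C(w_{t-1}),\, C(w_{t-2}),\, \ldots,\, C(w_{t-n+1})\big)$

$y = b + Wx + U\tanh(d + Hx)$

$P(w_t = i \mid w_{t-1}, \ldots, w_{t-n+1}) = \dfrac{e^{y_i}}{\sum_j e^{y_j}}$

In the code, C corresponds to nn.Embedding, H, W, U, d, b are the nn.Parameter tensors, and the softmax is applied implicitly inside nn.CrossEntropyLoss.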
Code: from https://github.com/graykode/nlp-tutorial/tree/master/1-1.NNLM
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

dtype = torch.FloatTensor

sentences = ["i like dog", "i love coffee", "i hate milk"]

word_list = " ".join(sentences).split()               # build the vocabulary
print(word_list)
word_list = list(set(word_list))                      # remove duplicates
print("after set: ", word_list)
word_dict = {w: i for i, w in enumerate(word_list)}   # index of each word
number_dict = {i: w for i, w in enumerate(word_list)} # word of each index
n_class = len(word_dict)  # vocabulary size

# NNLM parameters
n_step = 2    # predict the 3rd word from the previous two words
n_hidden = 2  # number of hidden-layer units h
m = 2         # dimensionality of the word vectors

# PyTorch consumes data in (mini-)batches; this function turns the raw
# sentences into one batch of (input indices, target index) pairs.
def make_batch(sentences):
    input_batch = []
    target_batch = []
    for sen in sentences:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]
        input_batch.append(input)
        target_batch.append(target)
    return input_batch, target_batch

# Model
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))
        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, X):
        X = self.C(X)                     # [batch_size, n_step, m]
        X = X.view(-1, n_step * m)        # [batch_size, n_step * m]
        tanh = torch.tanh(self.d + torch.mm(X, self.H))                 # [batch_size, n_hidden]
        output = self.b + torch.mm(X, self.W) + torch.mm(tanh, self.U)  # [batch_size, n_class]
        return output

model = NNLM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

input_batch, target_batch = make_batch(sentences)
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

# Each batch goes through the steps below; together they implement gradient descent.
# Training
for epoch in range(5000):
    optimizer.zero_grad()        # zero the gradients, i.e. reset d(loss)/d(weight) to 0
    output = model(input_batch)  # forward pass: compute the predictions

    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)  # compute the loss
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))

    loss.backward()   # backward pass: compute the gradients
    optimizer.step()  # update all parameters

# Why zero the gradients before every call to backward()?
# Because otherwise PyTorch accumulates the new gradients onto those from the previous step.
# The upside of this design: when hardware limits rule out a larger batch size, you can
# average the gradients of several smaller batches instead.
# The downside: the gradients must be zeroed on every iteration.

# Predict
predict = model(input_batch).data.max(1, keepdim=True)[1]

# Test
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])
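As a small usage sketch (not part of the original tutorial): after training, the learned word vectors live in model.C, and the model can score an arbitrary two-word context. The context words chosen below are assumed to be in the vocabulary built above.

# Hypothetical follow-up: inspect the learned embeddings and query a new context.
# Assumes the training script above has already run (model, word_dict, number_dict exist).
with torch.no_grad():
    # Each row of C is the m-dimensional feature vector of one vocabulary word.
    for idx, vec in enumerate(model.C.weight):
        print(number_dict[idx], vec.tolist())

    # Score the context "i hate" and print the most probable next word.
    context = ["i", "hate"]                                    # assumed in-vocabulary
    ids = torch.LongTensor([[word_dict[w] for w in context]])  # shape [1, n_step]
    logits = model(ids)                                        # shape [1, n_class]
    print(context, '->', number_dict[logits.argmax(dim=1).item()])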
Results: the script prints the training loss every 1000 epochs and then, for each sentence, the two-word context together with the predicted third word.