import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt

# Training data: three (input, target) pairs with a single feature each.
# Plain tensors are used directly; wrapping them in Variable has been
# unnecessary since tensors and Variables were merged.
x_data = torch.tensor([[1.0], [2.0], [3.0]])
y_data = torch.tensor([[2.0], [4.0], [6.0]])


class Model(torch.nn.Module):
    """Linear regression model with one input feature and one output."""

    def __init__(self):
        """Instantiate the single nn.Linear module used by the model."""
        super(Model, self).__init__()
        self.linear = torch.nn.Linear(1, 1)  # One in and one out

    def forward(self, x):
        """Return the prediction of the linear layer for ``x``.

        Modules defined in the constructor can be combined here with
        arbitrary tensor operations.
        """
        y_pred = self.linear(x)
        return y_pred


# our model
model = Model()
# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the
# nn.Linear module which is a member of the model.
# criterion: the measure used to compute the loss. reduction='sum' is the
# supported spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
第三部分:進行訓練,流程為 forward -> backward -> update parameters(更新網絡參數)
# Training loop
for epoch in range(1000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x_data)

    # Compute and print loss. The loss is a 0-dim tensor, so it cannot be
    # indexed with [0]; .item() extracts the Python number.
    loss = criterion(y_pred, y_data)
    print(epoch, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    # Reset the gradients accumulated by the previous iteration
    optimizer.zero_grad()
    # Backward pass
    loss.backward()
    # Update the weights held by the optimizer, i.e. model.parameters()
    optimizer.step()
# After training
hour_var = torch.tensor([[4.0]])
# Run the forward pass once and reuse the result for printing
y_pred = model(hour_var)
print("predict (after training)", 4, y_pred.item())
graph LR x-->Linear Linear-->y
\hat{y} = x * w + b, \qquad loss = \frac{1}{N}\sum_{n=1}^{N}(\hat{y}_n-y_n)^2
using sigmoid functions:
graph LR x --> Linear Linear --> Sigmoid Sigmoid --> y
Y 介於 [0,1] 之間(sigmoid 把輸出壓縮到這個區間),這樣做能夠用來壓縮計算量,讓計算更加容易
\sigma(z) = \frac{1}{1+e^{-z}}, \qquad \hat{y} = \sigma(x*w+b), \qquad loss=-\frac{1}{N}\sum_{n=1}^{N}\left[y_n\log\hat{y}_n + (1-y_n)\log(1-\hat{y}_n)\right]
import torch
from torch.autograd import Variable
import torch.nn.functional as F

# Five single-feature inputs with binary (0/1) targets.
x_data = torch.tensor([[1.0], [2.0], [3.0], [4.0], [5.0]])
y_data = torch.tensor([[0.], [0.], [1.], [1.], [1.]])


class Model(torch.nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self):
        """Instantiate the single nn.Linear module used by the model."""
        super(Model, self).__init__()
        self.linear = torch.nn.Linear(1, 1)  # One in and one out

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        # torch.sigmoid replaces the deprecated F.sigmoid
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred


# our model
model = Model()

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the
# nn.Linear module which is a member of the model.
# reduction='mean' is the supported spelling of the deprecated
# size_average=True.
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Training loop
for epoch in range(400):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x_data)

    # Compute and print loss (0-dim tensor, so use .item() instead of [0])
    loss = criterion(y_pred, y_data)
    print(epoch, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# After training
hour_var = torch.tensor([[0.0]])
# The label now matches the value that is actually fed to the model (0.0)
print("predict 0 hours", 0.0, model(hour_var).item() > 0.5)
hour_var = torch.tensor([[7.0]])
print("predict 7 hours", 7.0, model(hour_var).item() > 0.5)
# Squash the linear output with a sigmoid. torch.sigmoid replaces the
# deprecated F.sigmoid, and the result is named y_pred as in the model code.
y_pred = torch.sigmoid(self.linear(x))
change loss into:
# Binary cross-entropy averaged over the batch; reduction='mean' is the
# supported spelling of the deprecated size_average=True.
criterion = torch.nn.BCELoss(reduction='mean')
ReLU 是修正線性單元(The Rectified Linear Unit)的簡稱,近年來在深度學習中被廣泛使用,能夠緩解梯度彌散問題,因為它的導數只會等於 1 或 0。相對於 sigmoid 和 tanh 激勵函數,對 ReLU 求梯度非常簡單,計算也很簡單,能夠在很大程度上提高隨機梯度下降的收斂速度(因為 ReLU 在正值區間是線性且不飽和的,而 sigmoid 和 tanh 會飽和)。但 ReLU 的缺點是比較脆弱,隨著訓練的進行,可能會出現神經元死亡的情況:例如有一個很大的梯度流經 ReLU 單元後,權重的更新結果可能使得在此之後任何數據點都沒有辦法再激活它。如果發生這種情況,那麼流經該神經元的梯度從這一點開始將永遠是 0。也就是說,ReLU 神經元在訓練中不可逆地死亡了。
ELU 在正值區間的值爲 x 本身,這樣減輕了梯度彌散問題(x>0 區間導數處處爲 1),這點跟 ReLU、Leaky ReLU 相似。而在負值區間,ELU 在輸入取較小值時具有軟飽和的特性,提升了對噪聲的魯棒性。
Leaky ReLU 主要是爲了避免梯度消失:當神經元處於非激活狀態時,允許一個非 0 的梯度存在,這樣不會出現梯度消失,收斂速度快。它的優缺點跟 ReLU 類似。
graph LR a-->Linear b-->Linear Linear-->Sigmoid Sigmoid-->y
多維度、更多層次的網絡:主要是在 Design your model using class 這一步中進行改變
import torch
from torch.autograd import Variable
import numpy as np

# Load the dataset: every column but the last is a feature, the last column
# is the target.
xy = np.loadtxt('./data/diabetes.csv.gz', delimiter=',', dtype=np.float32)
x_data = torch.from_numpy(xy[:, 0:-1])
y_data = torch.from_numpy(xy[:, [-1]])

print(x_data.shape)
print(y_data.shape)


class Model(torch.nn.Module):
    """Fully connected network 8 -> 6 -> 4 -> 1 with sigmoid activations."""

    def __init__(self):
        """Instantiate three nn.Linear modules and one Sigmoid module."""
        super(Model, self).__init__()
        self.l1 = torch.nn.Linear(8, 6)
        self.l2 = torch.nn.Linear(6, 4)
        self.l3 = torch.nn.Linear(4, 1)

        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Pass ``x`` through the three linear layers, each followed by a sigmoid."""
        out1 = self.sigmoid(self.l1(x))
        out2 = self.sigmoid(self.l2(out1))
        y_pred = self.sigmoid(self.l3(out2))
        return y_pred


# our model
model = Model()

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the three
# nn.Linear modules which are members of the model.
# reduction='mean' replaces the deprecated 'elementwise_mean' spelling.
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Training loop
# NOTE(review): 1,200,000 epochs with a print per epoch is a very long run;
# confirm the count is intentional.
for epoch in range(1200000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x_data)

    # Compute and print loss
    loss = criterion(y_pred, y_data)
    print(epoch, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Wrap the dataset in a DataLoader: batches of one sample (batch_size=1),
# shuffling enabled (shuffle=True), one loader worker (num_workers=1).
train_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True, num_workers=1)
# References
# https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/pytorch_basics/main.py
# http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class
import torch
import numpy as np
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


class DiabetesDataset(Dataset):
    """Diabetes dataset loaded from a gzipped CSV file."""

    def __init__(self):
        """Load the CSV; the last column is the label, the rest are features."""
        xy = np.loadtxt('./data/diabetes.csv.gz',
                        delimiter=',', dtype=np.float32)
        self.len = xy.shape[0]
        self.x_data = torch.from_numpy(xy[:, 0:-1])
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.len


dataset = DiabetesDataset()
# NOTE(review): num_workers > 0 at module top level may need an
# ``if __name__ == '__main__':`` guard on platforms that spawn worker
# processes; confirm for the target platform.
train_loader = DataLoader(dataset=dataset,
                          batch_size=1,
                          shuffle=True,
                          num_workers=1)

for epoch in range(2):
    for i, data in enumerate(train_loader, 0):
        # Each batch is already a pair of tensors; wrapping them in Variable
        # is no longer needed.
        inputs, labels = data

        # Run your training process
        print(epoch, i, "inputs", inputs, "labels", labels)
MNIST softmax
graph LR x{x} --> Linear Linear --> Activation Activation --> ... ... --> Linear2 Linear2-->Activation2 Activation2-->h{y}
graph LR x{x} --> Linear Linear --> Activation Activation --> ... ... --> Linear2 Linear2-->Activation2 Activation2-->P_y=0 Activation2-->P_y=1 Activation2-->.... Activation2-->P_y=10
what is softmax?
\sigma(z)_j = \frac{e^{z_j}}{\sum_{k=1}^{K}e^{z_k}} \quad \text{for } j=1,2,\dots,K
using softmax to get probabilities.
what is cross entropy?
loss = \frac{1}{N}\sum_i D(\mathrm{Softmax}(wx_i+b),Y_i), \qquad D(\hat{Y},Y) = -Y\log\hat{Y}
graph LR x--LinearModel-->Z Z--Softmax-->y' y'--Cross_Entropy-->Y
# Cross-entropy criterion; per the notes in this file it corresponds to
# LogSoftmax followed by NLLLoss, so it is fed logits rather than softmax output.
loss = torch.nn.CrossEntropyLoss()
graph LR X--Softmax-->y' y'--Cross_Entropy-->Y
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Cross entropy example
import numpy as np
# One hot
# 0: 1 0 0
# 1: 0 1 0
# 2: 0 0 1
Y = np.array([1, 0, 0])

Y_pred1 = np.array([0.7, 0.2, 0.1])
Y_pred2 = np.array([0.1, 0.3, 0.6])
print("loss1 = ", np.sum(-Y * np.log(Y_pred1)))
print("loss2 = ", np.sum(-Y * np.log(Y_pred2)))

################################################################################
# Softmax + CrossEntropy (logSoftmax + NLLLoss)
loss = nn.CrossEntropyLoss()

# target is of size nBatch
# each element in target has to have 0 <= value < nClasses (0-2)
# Input is class, not one-hot
Y = torch.tensor([0], dtype=torch.long)

# input is of size nBatch x nClasses = 1 x 3
# Y_pred are logits (not softmax)
Y_pred1 = torch.tensor([[2.0, 1.0, 0.1]])
Y_pred2 = torch.tensor([[0.5, 2.0, 0.3]])

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

print("PyTorch Loss1 = ", l1.item(),
      "\nPyTorch Loss2=", l2.item())

# Index of the largest logit, i.e. the predicted class
print("Y_pred1=", Y_pred1.argmax(dim=1))
print("Y_pred2=", Y_pred2.argmax(dim=1))

################################################################################
# Batch loss
# target is of size nBatch
# each element in target has to have 0 <= value < nClasses (0-2)
# Input is class, not one-hot
Y = torch.tensor([2, 0, 1], dtype=torch.long)

# input is of size nBatch x nClasses = 3 x 3
# Y_pred are logits (not softmax)
Y_pred1 = torch.tensor([[0.1, 0.2, 0.9],
                        [1.1, 0.1, 0.2],
                        [0.2, 2.1, 0.1]])

Y_pred2 = torch.tensor([[0.8, 0.2, 0.3],
                        [0.2, 0.3, 0.5],
                        [0.2, 0.2, 0.5]])

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

print("Batch Loss1 = ", l1.item(), "\nBatch Loss2=", l2.item())
作業:CrossEntropyLoss VS NLLLoss ?
graph LR inputLayer -.-> HiddenLayer HiddenLayer -.-> OutputLayer
# https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
batch_size = 16

# MNIST Dataset
train_dataset = datasets.MNIST(root='./mnist_data/',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)

test_dataset = datasets.MNIST(root='./mnist_data/',
                              train=False,
                              transform=transforms.ToTensor())

# Data Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


class Net(nn.Module):
    """Five-layer fully connected classifier: 784 inputs, 10 output logits."""

    def __init__(self):
        super(Net, self).__init__()
        self.l1 = nn.Linear(784, 520)
        self.l2 = nn.Linear(520, 320)
        self.l3 = nn.Linear(320, 240)
        self.l4 = nn.Linear(240, 120)
        self.l5 = nn.Linear(120, 10)

    def forward(self, x):
        """Flatten ``x`` and return the raw logits of the last layer."""
        x = x.view(-1, 784)  # Flatten the data (n, 1, 28, 28)-> (n, 784)
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = F.relu(self.l4(x))
        # No activation here: CrossEntropyLoss is applied to these outputs
        return self.l5(x)


model = Net()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)


def train(epoch):
    """Train the model for one pass over ``train_loader``.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            # loss is a 0-dim tensor; .item() replaces the failing loss.data[0]
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test():
    """Evaluate the model on ``test_loader`` and print loss and accuracy."""
    model.eval()
    test_loss = 0
    correct = 0
    # torch.no_grad() replaces the removed Variable(..., volatile=True)
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            # Sum the per-sample losses so that dividing by the dataset size
            # below yields a true per-sample average (the criterion itself
            # returns a per-batch mean).
            test_loss += F.cross_entropy(output, target,
                                         reduction='sum').item()
            # get the index of the max
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, 10):
    train(epoch)
    test()
Use DataLoader
Simple convolution layer
for Example:
graph LR 3*3*1_image-->2*2*1_filter_W 3*3*1_image-->1*1_Stride 3*3*1_image-->NoPadding NoPadding-->2*2_featureMap 2*2*1_filter_W-->2*2_featureMap 1*1_Stride-->2*2_featureMap
How to compute multi-dimension pictures ?
w^T x + b
Get: 28 * 28 * 1 feature map * N (how many filters you used)
OutputSize = \frac{(InputSize+PaddingSize*2-FilterSize)}{Stride} + 1
Use the formula above to work out the output size from the input size, padding size, filter size and stride.
activate functions
Max Pooling
還有類似的 avg Pooling(平均池化)
# Max pooling layer with kernel size 2
self.mp = nn.MaxPool2d(2)
# Fully connected layer mapping 320 input features to 10 outputs
self.fc = nn.Linear(320,10)
Fully Connected network中的神經元是跟每一個像素都相連。
graph TB ConvolutionalLayer1 --> PoolingLayer1 PoolingLayer1 --> ConvolutionalLayer2 ConvolutionalLayer2 --> PoolingLayer2 PoolingLayer2 --> Fully-ConnectedLayer
class Net(nn.Module):
    """Two conv + max-pool stages followed by one fully connected layer."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.mp = nn.MaxPool2d(2)
        # For 1x28x28 input: 28 -> conv5 -> 24 -> pool -> 12 -> conv5 -> 8
        # -> pool -> 4, so the flattened size is 20 * 4 * 4 = 320.
        self.fc = nn.Linear(320, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        in_size = x.size(0)
        x = F.relu(self.mp(self.conv1(x)))
        x = F.relu(self.mp(self.conv2(x)))
        x = x.view(in_size, -1)  # flatten the tensor
        x = self.fc(x)
        # Normalise over the class dimension explicitly; the implicit-dim
        # form of log_softmax is deprecated.
        return F.log_softmax(x, dim=1)
Why 1*1 convolution ?
Using 32 1*1 filters turns a 64-channel feature map into a 32-channel feature map.
using 1*1 filters can significantly save our computations.
graph LR Filter_concat_in --> 1*1Conv0_16 Filter_concat_in --> 1*1Conv1_16 Filter_concat_in --> 1*1Conv2_16 Filter_concat_in --> AvgPooling AvgPooling --> 1*1Conv3_16 1*1Conv0_16 --> 3*3Conv0_24 3*3Conv0_24 --> 3*3Conv1_24 3*3Conv1_24 --> Filter_Concat_out 1*1Conv1_16 --> 5*5Conv_24 5*5Conv_24 --> Filter_Concat_out 1*1Conv3_16 --> Filter_Concat_out 1*1Conv2_16 --> Filter_Concat_out
# --- 1x1 branch ---
# (attribute name fixed: it was misspelled "brach1x1", so the call below
# would have raised AttributeError)
self.branch1x1 = nn.Conv2d(in_channels, 16, kernel_size=1)

branch1x1 = self.branch1x1(x)

# --- pooling branch: average pooling followed by a 1x1 convolution ---
self.branch_pool = nn.Conv2d(in_channels, 24, kernel_size=1)

branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)

# --- 5x5 branch: 1x1 convolution followed by a 5x5 convolution ---
# kernel_size=5 with padding=2 keeps the spatial size unchanged; the former
# kernel_size=1 with padding=2 enlarged the output, so it could not be
# concatenated with the other branches.
self.branch5x5_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
self.branch5x5_2 = nn.Conv2d(16, 24, kernel_size=5, padding=2)

branch5x5 = self.branch5x5_1(x)
branch5x5 = self.branch5x5_2(branch5x5)

# --- 3x3 branch: 1x1 convolution followed by two 3x3 convolutions ---
self.branch3x3_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
self.branch3x3_2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
self.branch3x3_3 = nn.Conv2d(24, 24, kernel_size=3, padding=1)

branch3x3 = self.branch3x3_1(x)
branch3x3 = self.branch3x3_2(branch3x3)
branch3x3 = self.branch3x3_3(branch3x3)

# Collect the four branch outputs
outputs = [branch1x1, branch_pool, branch5x5, branch3x3]
# https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
batch_size = 64

# MNIST Dataset
train_dataset = datasets.MNIST(root='./data/',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)

test_dataset = datasets.MNIST(root='./data/',
                              train=False,
                              transform=transforms.ToTensor())

# Data Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


class InceptionA(nn.Module):
    """Inception block with four parallel branches concatenated on channels.

    The branches produce 16 + 24 + 24 + 24 = 88 output channels.
    """

    def __init__(self, in_channels):
        """Create the branch convolutions for ``in_channels`` input channels."""
        super(InceptionA, self).__init__()
        self.branch1x1 = nn.Conv2d(in_channels, 16, kernel_size=1)

        self.branch5x5_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch5x5_2 = nn.Conv2d(16, 24, kernel_size=5, padding=2)

        self.branch3x3dbl_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch3x3dbl_2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = nn.Conv2d(24, 24, kernel_size=3, padding=1)

        self.branch_pool = nn.Conv2d(in_channels, 24, kernel_size=1)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate them along dim 1."""
        branch1x1 = self.branch1x1(x)

        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)

        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)


class Net(nn.Module):
    """Two conv + pool stages, each followed by an InceptionA block."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        # 88 input channels: the channel count produced by InceptionA
        self.conv2 = nn.Conv2d(88, 20, kernel_size=5)

        self.incept1 = InceptionA(in_channels=10)
        self.incept2 = InceptionA(in_channels=20)

        self.mp = nn.MaxPool2d(2)
        # 88 channels * 4 * 4 spatial positions = 1408 for 28x28 inputs
        self.fc = nn.Linear(1408, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        in_size = x.size(0)
        x = F.relu(self.mp(self.conv1(x)))
        x = self.incept1(x)
        x = F.relu(self.mp(self.conv2(x)))
        x = self.incept2(x)
        x = x.view(in_size, -1)  # flatten the tensor
        x = self.fc(x)
        # Explicit class dimension; the implicit-dim form is deprecated
        return F.log_softmax(x, dim=1)


model = Net()

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)


def train(epoch):
    """Train the model for one pass over ``train_loader``.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            # loss is a 0-dim tensor; .item() replaces the failing loss.data[0]
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test():
    """Evaluate the model on ``test_loader`` and print loss and accuracy."""
    model.eval()
    test_loss = 0
    correct = 0
    # torch.no_grad() replaces the removed Variable(..., volatile=True)
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            # sum up batch loss; reduction='sum' is the supported spelling
            # of the deprecated size_average=False
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, 10):
    train(epoch)
    test()
Recurrent NN
graph LR X1 --> A1 A1 --> h1 X2 --> A2 A2 --> h2 X3 --> A3 A3 --> h3 X4 --> A4 A4 --> h4 A1 --> A2 A2 --> A3 A3 --> A4
different RNN implementations
# Three interchangeable recurrent layers built with the same arguments.
# Each assignment rebinds ``cell``, so keep only the line for the variant
# you want; as written, the LSTM is the one left bound.
cell = nn.RNN(input_size=4, hidden_size=2, batch_first=True)
cell = nn.GRU(input_size=4, hidden_size=2, batch_first=True)
cell = nn.LSTM(input_size=4, hidden_size=2, batch_first=True)
How to use RNN?
# Single-layer RNN that consumes batch-first input.
cell = nn.RNN(input_size=4, hidden_size=2, batch_first=True)

# inputs: (batch_size, seq_len, input_size). Concrete example values replace
# the former "..." placeholders so that the snippet runs.
inputs = torch.randn(1, 5, 4)
# hidden: (num_layers, batch_size, hidden_size)
hidden = torch.zeros(1, 1, 2)

# out holds the output for every time step, hidden the final hidden state
out, hidden = cell(inputs, hidden)
有兩個輸出,一個是output, 一個是hidden layer的output
# Lab 12 RNN
import sys
import torch
import torch.nn as nn
from torch.autograd import Variable

torch.manual_seed(777)  # reproducibility
#            0    1    2    3    4
idx2char = ['h', 'i', 'e', 'l', 'o']

# Teach hihell -> ihello
x_data = [0, 1, 0, 2, 3, 3]   # hihell
one_hot_lookup = [[1, 0, 0, 0, 0],  # 0
                  [0, 1, 0, 0, 0],  # 1
                  [0, 0, 1, 0, 0],  # 2
                  [0, 0, 0, 1, 0],  # 3
                  [0, 0, 0, 0, 1]]  # 4

y_data = [1, 0, 2, 3, 3, 4]    # ihello
x_one_hot = [one_hot_lookup[x] for x in x_data]

# As we have one batch of samples, we will build the tensors only once
inputs = torch.Tensor(x_one_hot)
labels = torch.LongTensor(y_data)

num_classes = 5
input_size = 5  # one-hot size
hidden_size = 5  # output from the RNN. 5 to directly predict one-hot
batch_size = 1   # one sentence
sequence_length = 1  # One by one
num_layers = 1  # one-layer rnn


class Model(nn.Module):
    """Single-layer RNN that is fed one one-hot character per call."""

    def __init__(self):
        super(Model, self).__init__()
        self.rnn = nn.RNN(input_size=input_size,
                          hidden_size=hidden_size, batch_first=True)

    def forward(self, hidden, x):
        """Advance the RNN by one step.

        Args:
            hidden: previous hidden state,
                (num_layers * num_directions, batch, hidden_size).
            x: one-hot input, reshaped to (batch, seq_len, input_size).

        Returns:
            Tuple ``(hidden, out)``: the new hidden state and the output
            reshaped to (-1, num_classes).
        """
        # Reshape input (batch first)
        x = x.view(batch_size, sequence_length, input_size)

        # Propagate input through RNN
        # Input: (batch, seq_len, input_size)
        # hidden: (num_layers * num_directions, batch, hidden_size)
        out, hidden = self.rnn(x, hidden)
        return hidden, out.view(-1, num_classes)

    def init_hidden(self):
        """Return an all-zero initial hidden state."""
        # (num_layers * num_directions, batch, hidden_size)
        return torch.zeros(num_layers, batch_size, hidden_size)


# Instantiate RNN model
model = Model()
print(model)

# Set loss and optimizer function
# CrossEntropyLoss = LogSoftmax + NLLLoss
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# Train the model
for epoch in range(100):
    optimizer.zero_grad()
    loss = 0
    hidden = model.init_hidden()

    sys.stdout.write("predicted string: ")
    # ``step_input`` avoids shadowing the builtin ``input``
    for step_input, label in zip(inputs, labels):
        hidden, output = model(hidden, step_input)
        _, idx = output.max(1)
        sys.stdout.write(idx2char[idx.item()])
        # Iterating a 1-D tensor yields 0-dim labels, but the criterion
        # expects a target of shape (batch,), so reshape to one element.
        loss += criterion(output, label.view(1))

    # loss is a 0-dim tensor; .item() replaces the failing loss.data[0]
    print(", epoch: %d, loss: %1.3f" % (epoch + 1, loss.item()))

    loss.backward()
    optimizer.step()

print("Learning finished!")