Introduction
NIN stands for "network in network". It builds a deep network by stacking several small networks in series, each made up of a convolutional layer and "fully connected" layers [1].
1 The NIN block
NIN uses $1 \times 1$ convolutional layers in place of fully connected layers, so that spatial information is passed naturally to the later layers.
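As a minimal sketch (my own addition, not from the original post), the following checks that a $1 \times 1$ convolution is equivalent to a fully connected layer applied independently at every spatial position, assuming the two share the same weights:

import torch
import torch.nn as nn

# A 1x1 convolution applies one linear map across channels at every pixel,
# which is exactly what a fully connected layer would do per position.
conv = nn.Conv2d(3, 5, kernel_size=1)
fc = nn.Linear(3, 5)
fc.weight.data = conv.weight.data.view(5, 3)  # reuse the convolution's weights
fc.bias.data = conv.bias.data

x = torch.rand(1, 3, 4, 4)
out_conv = conv(x)                                      # shape: (1, 5, 4, 4)
out_fc = fc(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)  # same values
print(torch.allclose(out_conv, out_fc, atol=1e-6))      # True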
The figure below compares the main structural differences between NIN and the AlexNet and VGG networks.
The NIN block is the basic building block of the NIN model. Its features are as follows:
1) It consists of one convolutional layer followed by two $1 \times 1$ convolutional layers that act as fully connected layers;
2) The hyperparameters of the first convolutional layer can be set freely, while those of the remaining two are usually fixed.
""" @author: Inki @contact: inki.yinji@qq.com @version: Created in 2020 1221, last modified in 2020 1221. """ import time import torch import torch.nn as nn from torch import optim from torch.nn import functional from util.SimpleTool import load_data_fashion_mnist, train, FlattenLayer def nin_block(in_channels, out_channels, kernel_size, stride, padding): ret_block = nn.Sequential(nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding), nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU()) return ret_block
2 The NIN model
Model features:
1) The convolution window shapes are $11 \times 11$, $5 \times 5$, and $3 \times 3$, with the same numbers of output channels as in AlexNet;
2) Each NIN block is followed by a max pooling layer with a stride of 2 and a $3 \times 3$ window;
3) The last 3 fully connected layers of AlexNet are removed; instead, the model ends with an NIN block whose number of output channels equals the number of label classes, followed by a global average pooling layer that averages all elements of each channel and feeds the result directly into the classification output;
4) A global average pooling layer is simply an average pooling layer whose window shape equals the spatial shape of its input; it significantly reduces the model's parameter size and thus helps alleviate overfitting (a small sketch follows this list);
5) This design may, however, increase training time.
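The sketch below (an illustrative addition, not from the original post) shows what the global average pooling in point 4) does: each channel's feature map is collapsed to a single value, so the 10 output channels of the last NIN block become 10 class scores:

import torch
from torch.nn import functional

x = torch.rand(1, 10, 5, 5)  # e.g. the output of the last NIN block
# Average pooling whose window equals the input's spatial shape
y = functional.avg_pool2d(x, kernel_size=x.size()[2:])
print(y.shape)  # torch.Size([1, 10, 1, 1]) -- one value per class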
class GlobalAvgPool2d(nn.Module):
    """Global average pooling: the pooling window equals the input's spatial shape."""

    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, x):
        """
        The forward function.
        """
        return functional.avg_pool2d(x, kernel_size=x.size()[2:])


def get_net():
    # Output channel numbers follow AlexNet (96, 256, 384); the last NIN block
    # outputs 10 channels, one per Fashion-MNIST class.
    ret_net = nn.Sequential(nin_block(1, 96, kernel_size=11, stride=4, padding=0),
                            nn.MaxPool2d(kernel_size=3, stride=2),
                            nin_block(96, 256, kernel_size=5, stride=1, padding=2),
                            nn.MaxPool2d(kernel_size=3, stride=2),
                            nin_block(256, 384, kernel_size=3, stride=1, padding=1),
                            nn.MaxPool2d(kernel_size=3, stride=2),
                            nn.Dropout(0.5),
                            nin_block(384, 10, kernel_size=3, stride=1, padding=1),
                            GlobalAvgPool2d(),
                            FlattenLayer())
    return ret_net


def test1():
    x = torch.rand(1, 1, 224, 224)
    temp_net = get_net()
    for name, block in temp_net.named_children():
        x = block(x)
        print(name, 'output shape:', x.shape)


if __name__ == '__main__':
    test1()
The output is as follows:
0 output shape: torch.Size([1, 96, 54, 54])
1 output shape: torch.Size([1, 96, 26, 26])
2 output shape: torch.Size([1, 256, 26, 26])
3 output shape: torch.Size([1, 256, 12, 12])
4 output shape: torch.Size([1, 384, 12, 12])
5 output shape: torch.Size([1, 384, 5, 5])
6 output shape: torch.Size([1, 384, 5, 5])
7 output shape: torch.Size([1, 10, 5, 5])
8 output shape: torch.Size([1, 10, 1, 1])
9 output shape: torch.Size([1, 10])
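To make the parameter savings of point 4) concrete, here is a rough check (my own addition, not from the original post) that counts the parameters of this NIN model; for comparison, the three fully connected layers of a d2l-style AlexNet for Fashion-MNIST alone hold roughly 256·5·5·4096 + 4096·4096 + 4096·10 ≈ 43 million weights:

temp_net = get_net()
num_params = sum(p.numel() for p in temp_net.parameters())
# NIN keeps only convolutional parameters, so this total (about 2 million)
# is far below the ~43 million weights of AlexNet's fully connected layers alone.
print('Total NIN parameters:', num_params)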
3 Model training
def test2():
    temp_batch_size = 128
    temp_resize = 224
    temp_le = 0.002
    temp_num_epochs = 5
    temp_net = get_net()
    temp_tr_iter, temp_te_iter = load_data_fashion_mnist(temp_batch_size, resize=temp_resize)
    temp_optimizer = optim.Adam(temp_net.parameters(), lr=temp_le)
    train(temp_net, temp_tr_iter, temp_te_iter, temp_batch_size, temp_optimizer, num_epochs=temp_num_epochs)


if __name__ == '__main__':
    test2()
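If the learned weights should be kept after training, one option (a hedged addition, not in the original post) is to save the state_dict at the end of test2(); the file name here is arbitrary:

# Appended at the end of test2(): save only the parameters.
torch.save(temp_net.state_dict(), 'nin_fashion_mnist.pt')

# Later, rebuild the architecture and load the weights back.
new_net = get_net()
new_net.load_state_dict(torch.load('nin_fashion_mnist.pt'))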
Complete code
""" @author: Inki @contact: inki.yinji@qq.com @version: Created in 2020 1221, last modified in 2020 1221. """ import time import torch import torch.nn as nn from torch import optim from torch.nn import functional from util.SimpleTool import load_data_fashion_mnist, train, FlattenLayer def nin_block(in_channels, out_channels, kernel_size, stride, padding): ret_block = nn.Sequential(nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding), nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU()) return ret_block class GlobalAvgPool2d(nn.Module): def __init__(self): super(GlobalAvgPool2d, self).__init__() def forward(self, x): """ The forward function. """ return functional.avg_pool2d(x, kernel_size=x.size()[2:]) def get_net(): ret_net = nn.Sequential(nin_block(1, 96, kernel_size=11, stride=4, padding=0), nn.MaxPool2d(kernel_size=3, stride=2), nin_block(96, 256, kernel_size=5, stride=1, padding=2), nn.MaxPool2d(kernel_size=3, stride=2), nin_block(256, 384, kernel_size=3, stride=1, padding=1), nn.MaxPool2d(kernel_size=3, stride=2), nn.Dropout(0.5), nin_block(384, 10, kernel_size=3, stride=1, padding=1), GlobalAvgPool2d(), FlattenLayer()) return ret_net def test1(): x = torch.rand(1, 1, 224, 224) temp_net = get_net() for name, block in temp_net.named_children(): x = block(x) print(name, 'output shape:', x.shape) def test2(): temp_batch_size = 128 temp_resize = 224 temp_le = 0.002 temp_num_epochs = 5 temp_net = get_net() temp_tr_iter, temp_te_iter = load_data_fashion_mnist(temp_batch_size, resize=temp_resize) temp_optimizer = optim.Adam(temp_net.parameters(), lr=temp_le) train(temp_net, temp_tr_iter, temp_te_iter, temp_batch_size, temp_optimizer, num_epochs=temp_num_epochs) if __name__ == '__main__': test2()
Referenced library
util.SimpleTool
""" @author: Inki @contact: inki.yinji@qq.com @version: Created in 2020 0903, last modified in 2020 1221. @note: Some common function, and all given vector data's type must be numpy.array. """ import time import numpy as np import sys import scipy.io as scio import torch import torchvision.transforms as transforms import torchvision from torch import nn from multiprocessing import cpu_count def get_iter(tr, tr_lab, te, te_lab): """ Get iterator. :param tr: The training set. tr_lab: The training set's label. te: The test set. te_lab: The test set's label. """ yield tr, tr_lab, te, te_lab def is_print(para_str, para_is_print=True): """ Is print? :param para_str: The print string. para_is_print: True print else not. """ if para_is_print: print(para_str) def load_file(para_path): """ Load file. :param para_file_name: The path of the given file. :return The data. """ temp_type = para_path.split('.')[-1] if temp_type == 'mat': ret_data = scio.loadmat(para_path) return ret_data['data'] else: with open(para_path) as temp_fd: ret_data = temp_fd.readlines() return ret_data def load_data_fashion_mnist(batch_size=10, root='D:/Data/Datasets/FashionMNIST', resize=None): """ Download the fashion mnist dataset and then load into memory. """ trans = [] if resize: trans.append(transforms.Resize(size=resize)) trans.append(transforms.ToTensor()) transform = transforms.Compose(trans) mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform) mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform) if sys.platform.startswith('win'): num_workers = 0 else: num_workers = cpu_count() train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers) test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers) return train_iter, test_iter def owa_weight(para_num, para_type='linear_decrease'): """ The ordered weighted averaging operators (OWA) can replace the maximum or minimum operators. And the purpose of this function is to generate the owa weights. And the more refer is: R. R. Yager, J. Kacprzyk, The ordered weighted averaging operators: Theory and applications, Springer Science & Business Media, 2012. :param para_num: The length of weights list. para_type: 'linear_decrease'; 'inverse_additive', and its default setting is 'linear_decrease'. :return The owa weights. """ if para_num == 1: return np.array([1]) else: if para_type == 'linear_decrease': temp_num = 2 / para_num / (para_num + 1) return np.array([(para_num - i) * temp_num for i in range(para_num)]) elif para_type == 'inverse_additive': temp_num = np.sum([1 / i for i in range(1, para_num + 1)]) return np.array([1 / i / temp_num for i in range(1, para_num + 1)]) else: return owa_weight(para_num) def print_go_round(para_idx, para_str='Program processing'): """ Print the round. :param para_idx: The current index. para_str: The print words. """ round_list = ["\\", "|", "/", "-"] print('\r' + para_str + ': ' + round_list[para_idx % 4], end="") def print_progress_bar(para_idx, para_len): """ Print the progress bar. :param para_idx: The current index. para_len: The loop length. 
""" print('\r' + '▇' * int(para_idx // (para_len / 50)) + str(np.ceil((para_idx + 1) * 100 / para_len)) + '%', end='') def train(net, tr_iter, te_iter, batch_size, optimizer, loss=nn.CrossEntropyLoss(), device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), num_epochs=100): """ The train function. """ net = net.to(device) temp_batch_count = 0 print("Training on", device) for epoch in range(num_epochs): temp_tr_loss_sum, temp_tr_acc_sum, temp_num, temp_start_time = 0., 0., 0, time.time() for x, y in tr_iter: x = x.to(device) y = y.to(device) temp_y_pred = net(x) temp_loss = loss(temp_y_pred, y) optimizer.zero_grad() temp_loss.backward() optimizer.step() temp_tr_loss_sum += temp_loss.cpu().item() temp_tr_acc_sum += (temp_y_pred.argmax(dim=1) == y).sum().cpu().item() temp_num += y.shape[0] temp_batch_count += 1 test_acc = evaluate_accuracy(te_iter, net) print("Epoch %d, loss %.4f, training acc %.3f, test ass %.3f, time %.1f s" % (epoch + 1, temp_tr_loss_sum / temp_batch_count, temp_tr_acc_sum / temp_num, test_acc, time.time() - temp_start_time)) def evaluate_accuracy(data_iter, net, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): """ The evaluate function, and the performance measure is accuracy. """ ret_acc, temp_num = 0., 0 with torch.no_grad(): for x, y in data_iter: net.eval() # The evaluate mode, and the dropout is closed. ret_acc += (net(x.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item() net.train() temp_num += y.shape[0] return ret_acc / temp_num class Count(dict): """ The count class with dict. """ def __missing__(self, __key): return 0 class FlattenLayer(torch.nn.Module): def __init__(self): super(FlattenLayer, self).__init__() def forward(self, x): return x.view(x.shape[0], -1) if __name__ == '__main__': load_data_fashion_mnist()
[1] Mu Li, Aston Zhang, et al., Dive into Deep Learning (《動手學深度學習》).