1. Batch Normalization
Training deep neural networks is difficult. Several challenges motivate introducing batch normalization:
- Data preprocessing usually has a large effect on the results; we typically standardize the features so that each feature has mean 0 and variance 1.
- In a CNN, the variables in intermediate layers can take values of widely varying magnitude, which can hinder convergence; if the values in one layer are 100 times those in another, the learning rate has to be adjusted accordingly.
- Deeper networks are more complex and overfit more easily, so regularization becomes even more important.
During training, batch normalization uses the mean and standard deviation of each minibatch to continually adjust the intermediate outputs of the network, which makes the intermediate outputs of every layer numerically more stable. Batch normalization and the residual networks introduced in the next section provide two important ideas for training and designing deep models.
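Concretely, for an input $\mathbf{x}$ in a minibatch $\mathcal{B}$, the operation can be sketched as

$\mathrm{BN}(\mathbf{x}) = \boldsymbol{\gamma} \odot \dfrac{\mathbf{x} - \hat{\boldsymbol{\mu}}_\mathcal{B}}{\hat{\boldsymbol{\sigma}}_\mathcal{B}} + \boldsymbol{\beta}$

where $\hat{\boldsymbol{\mu}}_\mathcal{B}$ and $\hat{\boldsymbol{\sigma}}_\mathcal{B}$ are the minibatch mean and standard deviation (a small constant $\epsilon$ is added under the square root for numerical stability), and $\boldsymbol{\gamma}$ and $\boldsymbol{\beta}$ are the learnable scale and shift parameters discussed below.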
2. Batch Normalization Layers
The batch normalization implementations for fully connected layers and convolutional layers differ slightly; we explain each case separately.
2.1 Fully Connected Layers
We place the batch normalization layer between the affine transformation and the activation function of a fully connected layer. Denote the input of the fully connected layer by $\mathbf{x}$, its weight and bias parameters by $\mathbf{W}$ and $\mathbf{b}$, the activation function by $\phi$, and the batch normalization operator by $\mathrm{BN}$. The output of a fully connected layer with batch normalization is then
$\mathbf{h} = \phi(\mathrm{BN}(\mathbf{W}\mathbf{x} + \mathbf{b}))$
2.2 Convolutional Layers
For convolutional layers, batch normalization is applied after the convolution and before the activation function. When the convolution produces multiple output channels, we normalize each channel separately, and each channel keeps its own scale and shift parameters, both of which are scalars. Suppose the minibatch contains $m$ examples and that, for a single channel, the convolution output has height $p$ and width $q$. We then perform batch normalization jointly over the $m \times p \times q$ elements of that channel, using the same mean and variance for all of them, namely the mean and variance computed over those $m \times p \times q$ elements.
2.3 Batch Normalization at Prediction Time
At prediction time, we want the model to produce a deterministic output for any input, so the output of a single example should not depend on the mean and variance of whichever minibatch it happens to land in. This is handled by estimating the mean and variance over the entire training dataset (typically via the moving averages accumulated during training) and using those fixed statistics instead. Consequently, the batch normalization layer computes different results in training mode and in prediction mode.
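As a sketch, writing $\hat{\mu}$ and $\hat{\sigma}^2$ for the moving-average estimates accumulated during training, the prediction-time computation per element is simply

$\hat{x} = \dfrac{x - \hat{\mu}}{\sqrt{\hat{\sigma}^2 + \epsilon}}, \qquad y = \gamma\,\hat{x} + \beta$

so no minibatch statistics are involved at all.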
3. Batch Normalization from Scratch
We start by implementing the batch normalization layer from scratch with tensors.
```python
from mxnet import autograd, np, npx, init, gluon
from mxnet.gluon import nn
import plotly.graph_objs as go

npx.set_np()
ctx = npx.gpu() if npx.num_gpus() else npx.cpu()

def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Distinguish between training and prediction
    if not autograd.is_training():
        # In prediction mode, use the moving mean and variance directly
        X_hat = (X - moving_mean) / np.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)  # 2: fully connected, 4: convolution
        if len(X.shape) == 2:
            # Fully connected layer: mean and variance along the feature axis
            mean = X.mean(axis=0)
            var = ((X - mean) ** 2).mean(axis=0)
        else:
            # 2D convolution: per-channel mean and variance, keeping X's shape
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        X_hat = (X - mean) / np.sqrt(var + eps)
        # Update the moving mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
```
The function also returns the updated moving mean and moving variance, because they are needed at test time.
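A minimal sanity check of batch_norm on a toy tensor (not part of the original flow; the shapes are arbitrary): in training mode, each feature of the normalized output should have roughly zero mean and unit variance before gamma and beta are applied.

```python
# Toy check: normalize a random (4, 3) batch in training mode.
X = np.random.normal(size=(4, 3))
gamma, beta = np.ones((1, 3)), np.zeros((1, 3))
moving_mean, moving_var = np.zeros((1, 3)), np.zeros((1, 3))
with autograd.record():  # autograd.is_training() is True inside record()
    Y, moving_mean, moving_var = batch_norm(
        X, gamma, beta, moving_mean, moving_var, eps=1e-12, momentum=0.9)
print(Y.mean(axis=0))                            # roughly 0 for each feature
print(((Y - Y.mean(axis=0)) ** 2).mean(axis=0))  # roughly 1 for each feature
```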
Next we create a BatchNorm block. Its num_features argument is the number of outputs for a fully connected layer and the number of output channels for a convolutional layer; the num_dims argument is 2 for fully connected layers and 4 for convolutional layers.
```python
class BatchNorm(nn.Block):
    def __init__(self, num_features, num_dims, **kwargs):
        super().__init__(**kwargs)
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Scale and shift parameters involved in gradient computation and
        # updates, initialized to 1 and 0 respectively
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        self.moving_mean = np.zeros(shape)
        self.moving_var = np.zeros(shape)

    def forward(self, X):
        # If X is not on the same device, copy moving_mean and moving_var
        # to the device (e.g. GPU memory) where X lives
        if self.moving_mean.ctx != X.ctx:
            self.moving_mean = self.moving_mean.as_in_ctx(X.ctx)
            self.moving_var = self.moving_var.as_in_ctx(X.ctx)
        # Save the updated moving_mean and moving_var
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma.data(), self.beta.data(), self.moving_mean,
            self.moving_var, eps=1e-12, momentum=0.9)
        return Y
```
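A quick shape check of the custom block (again only a sketch with arbitrary sizes): the output has the same shape as the input, and the per-channel moving statistics are updated as a side effect.

```python
# Instantiate the custom BatchNorm for a 4-D (convolutional) input and run it once.
bn = BatchNorm(6, num_dims=4)
bn.initialize(ctx=ctx)
X = np.random.normal(size=(2, 6, 8, 8)).as_in_ctx(ctx)
with autograd.record():      # run in training mode
    Y = bn(X)
print(Y.shape)               # (2, 6, 8, 8): the shape is preserved
print(bn.moving_mean.shape)  # (1, 6, 1, 1): one statistic per channel
```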
3.1 Using Batch Normalization in LeNet
We insert a batch normalization layer after each convolutional or fully connected layer and before the activation layer.
```python
net = nn.Sequential()
net.add(
    nn.Conv2D(6, kernel_size=5), BatchNorm(6, num_dims=4),
    nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2),
    nn.Conv2D(16, kernel_size=5), BatchNorm(16, num_dims=4),
    nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2),
    nn.Dense(120), BatchNorm(120, num_dims=2), nn.Activation('sigmoid'),
    nn.Dense(84), BatchNorm(84, num_dims=2), nn.Activation('sigmoid'),
    nn.Dense(10)
)
```
3.2 Training
We will train the network on the Fashion-MNIST dataset.
```python
def get_workers(num):
    # Multi-process data loading is not available on Windows
    return 0 if __import__('sys').platform.startswith('win') else num

def loader(data, batch_size, shuffle=True, workers=6):
    return gluon.data.DataLoader(data, batch_size, shuffle=shuffle,
                                 num_workers=get_workers(workers))

def load_data(batch_size, resize=None):
    dataset = gluon.data.vision
    trans = [dataset.transforms.Resize(resize)] if resize else []
    trans.append(dataset.transforms.ToTensor())
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans)
    return loader(mnist_train, batch_size), loader(mnist_test, batch_size, False)

def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.astype(y.dtype) == y
    return float(cmp.sum())

def train_epoch(net, train_iter, loss, updater):
    l_sum = acc_rate = total = 0
    if isinstance(updater, gluon.Trainer):
        updater = updater.step
    for X, y in train_iter:
        X = X.as_in_ctx(ctx)
        y = y.as_in_ctx(ctx)
        with autograd.record():
            pre_y = net(X)
            l = loss(pre_y, y)
        l.backward()
        updater(y.size)
        l_sum += float(l.sum())
        acc_rate += accuracy(pre_y, y)
        total += y.size
    return l_sum / total, acc_rate / total

def evaluate_accuracy(net, data_iter):
    match_num = total_num = 0
    for X, y in data_iter:
        X = X.as_in_ctx(ctx)
        y = y.as_in_ctx(ctx)
        match_num += accuracy(net(X), y)
        total_num += y.size
    return match_num / total_num

import time

def train(net, train_iter, test_iter, epochs, lr):
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    l_lst, acc_lst, test_acc_lst = [], [], []
    timer = 0
    print("----------------start------------------")
    for epoch in range(epochs):
        start = time.time()
        l, acc = train_epoch(net, train_iter, loss, trainer)
        timer += time.time() - start
        test_acc = evaluate_accuracy(net, test_iter)
        print(f'[epoch {epoch+1}] loss {l:.3f}, train acc {acc:.3f}, '
              f'test acc {test_acc:.3f}')
        l_lst.append(l)
        acc_lst.append(acc)
        test_acc_lst.append(test_acc)
    print(f'loss {l:.3f}, train acc {acc:.3f}, test acc {test_acc:.3f}')
    print(f'{timer:.1f} sec, on {str(ctx)}')
    draw_graph([l_lst, acc_lst, test_acc_lst])

def draw_graph(result):
    data = []
    colors = ['aquamarine', 'orange', 'hotpink']
    names = ['train loss', 'train acc', 'test acc']
    symbols = ['circle-open', 'cross-open', 'triangle-up-open']
    for i, info in enumerate(result):
        trace = go.Scatter(
            x=list(range(1, num_epochs + 1)),
            y=info,
            mode='lines+markers',
            name=names[i],
            marker={'color': colors[i], 'symbol': symbols[i]},
        )
        data.append(trace)
    fig = go.Figure(data=data)
    fig.update_layout(xaxis_title='epochs', width=800, height=480)
    fig.show()
```
- Run for 10 epochs, as before.
```python
lr, num_epochs, batch_size = 1.0, 10, 256
train_iter, test_iter = load_data(batch_size)
train(net, train_iter, test_iter, num_epochs, lr)
```
- The scale parameter gamma and the shift parameter beta learned by the first batch normalization layer.
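A small sketch for inspecting them (assuming net[1] is the first BatchNorm block, i.e. the one right after the first convolution in the Sequential above):

```python
# Learned scale (gamma) and shift (beta) of the first batch normalization layer.
print(net[1].gamma.data().reshape(-1,))
print(net[1].beta.data().reshape(-1,))
```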
4. Concise Implementation Using the API
Instead of the BatchNorm class we defined ourselves, we can directly use the BatchNorm class provided by the deep learning framework's high-level API. The code is otherwise the same as our implementation above.
```python
NeLet = nn.Sequential()
NeLet.add(nn.Conv2D(6, kernel_size=5), nn.BatchNorm(),
          nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2),
          nn.Conv2D(16, kernel_size=5), nn.BatchNorm(),
          nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2),
          nn.Dense(120), nn.BatchNorm(), nn.Activation('sigmoid'),
          nn.Dense(84), nn.BatchNorm(), nn.Activation('sigmoid'),
          nn.Dense(10))
```
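Before this concise version can be used for prediction in the next section it needs to be trained as well; a sketch that simply reuses the helpers and hyperparameters from section 3.2:

```python
# Train the API-based model with the same data iterators and hyperparameters as above.
train(NeLet, train_iter, test_iter, num_epochs, lr)
```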
5. Prediction
Let's feed some data into the trained model and see how well it predicts.
```python
import plotly.express as px
from plotly.subplots import make_subplots

def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    colorscales = px.colors.named_colorscales()
    fig = make_subplots(num_rows, num_cols, subplot_titles=titles)
    for i, img in enumerate(imgs):
        fig.add_trace(go.Heatmap(z=img.asnumpy()[::-1], showscale=False,
                                 colorscale=colorscales[i + 3]), 1, i + 1)
        fig.update_xaxes(visible=False, row=1, col=i + 1)
        fig.update_yaxes(visible=False, row=1, col=i + 1)
    fig.update_layout(height=270)
    fig.show()

def predict(net, test_iter, stop, n=8):
    # Advance to the stop-th batch of the test set, then predict on it
    for i, (X, y) in enumerate(test_iter):
        if i == stop:
            break
    X, y = X.as_in_ctx(ctx), y.as_in_ctx(ctx)
    trues = get_fashion_mnist_labels(y)
    preds = get_fashion_mnist_labels(net(X).argmax(axis=1))
    titles = [f"true: {t} <br> pre: {p}" for t, p in zip(trues, preds)]
    show_images(X[:n].reshape((-1, 28, 28)), 1, n, titles=titles[:n])

import random
stop = random.choice(range(10))
predict(NeLet, test_iter, stop)
```
6. References
https://d2l.ai/chapter_convolutional-modern/batch-norm.html
https://plotly.com/python/subplots/