1. 批量歸一化
- 數據預處理一般會對結果有巨大的影響,一般須要將咱們的數據進行特徵標準化,讓特徵平均值爲0,方差爲1。
- 在CNN模型中,中間層的變量可能會採用變化很大的值,可能會阻礙網絡的收斂,若是一層的變量值是另外一層的100倍,那麼須要對學習率進行調整。
- 更深的網絡更復雜,很容易過擬合,所以正則化顯得更加關鍵
2.1 全鏈接層
咱們將批量歸一化層置於全鏈接層中的仿射變換和激活函數之間。設全鏈接層的輸入爲 $\mathbf{x}$,權重參數和偏差參數分別爲 $\mathbf{W}$ 和 $\mathbf{b}$,激活函數爲 $\phi$。設批量歸一化的運算符爲 $\mathrm{BN}$。那麼,使用批量歸一化的全鏈接層的輸出爲
$$\mathbf{h} = \phi(\mathrm{BN}(\mathbf{W}\mathbf{x} + \mathbf{b}))$$
2.2 卷積層
對卷積層來講,批量歸一化發生在卷積計算以後、應用激活函數以前。若是卷積計算輸出多個通道,咱們須要對這些通道的輸出分別作批量歸一化,且每一個通道都擁有獨立的拉伸和偏移參數,並均爲標量。設小批量中有 $m$ 個樣本。在單個通道上,假設卷積計算輸出的高和寬分別爲 $p$ 和 $q$。咱們須要對該通道中 $m \times p \times q$ 個元素同時作批量歸一化。對這些元素作標準化計算時,咱們使用相同的均值和方差,即該通道中 $m \times p \times q$ 個元素的均值和方差。
2.3 預測時的批歸一化
3. 從零建立批歸一化
from mxnet import autograd, np, npx, init, gluon
from mxnet.gluon import nn
import plotly.graph_objs as go

npx.set_np()
# Use the GPU when at least one is reported, otherwise fall back to the CPU.
ctx = npx.gpu() if npx.num_gpus() else npx.cpu()


def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Apply batch normalization to ``X`` and update the running statistics.

    In training mode (``autograd.is_training()`` is true) the statistics of
    the current mini-batch are used and the moving averages are refreshed.
    Otherwise the supplied moving averages are used unchanged.

    Args:
        X: Input array; in training mode it must have 2 axes (dense layer
            output) or 4 axes (2-D convolution output).
        gamma: Scale applied to the normalized values.
        beta: Shift added to the scaled values.
        moving_mean: Running mean used in prediction mode.
        moving_var: Running variance used in prediction mode.
        eps: Small constant added to the variance before the square root.
        momentum: Weight kept from the previous moving statistics.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)`` with the normalized output
        and the (possibly updated) moving statistics.
    """
    if autograd.is_training():
        ndim = len(X.shape)
        assert ndim in (2, 4)  # 2: dense layer, 4: convolution
        if ndim == 2:
            # Dense case: statistics per feature, reduced over axis 0.
            reduce_axes, keep = 0, False
        else:
            # Convolution case: statistics per channel (axis 1); keep the
            # reduced axes so the result broadcasts against X.
            reduce_axes, keep = (0, 2, 3), True
        batch_mean = X.mean(axis=reduce_axes, keepdims=keep)
        batch_var = ((X - batch_mean) ** 2).mean(axis=reduce_axes,
                                                 keepdims=keep)
        X_hat = (X - batch_mean) / np.sqrt(batch_var + eps)
        # Exponential moving average of the batch statistics.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * batch_mean
        moving_var = momentum * moving_var + (1.0 - momentum) * batch_var
    else:
        # Prediction mode: normalize with the moving statistics passed in.
        X_hat = (X - moving_mean) / np.sqrt(moving_var + eps)
    return gamma * X_hat + beta, moving_mean, moving_var
class BatchNorm(nn.Block):
    """Batch normalization layer built on the ``batch_norm`` function.

    Args:
        num_features: Number of features (dense) or channels (convolution).
        num_dims: 2 for dense-layer inputs; any other value selects the
            4-axis convolution parameter shape.
        **kwargs: Forwarded to ``nn.Block``.
    """

    def __init__(self, num_features, num_dims, **kwargs):
        super().__init__(**kwargs)
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Scale and shift take part in gradient computation and updates;
        # they are initialized to 1 and 0 respectively.
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        # Running statistics are plain arrays, not trained parameters.
        self.moving_mean = np.zeros(shape)
        # Start the running variance at 1 rather than 0: with a zero
        # variance, prediction before the average has warmed up divides by
        # sqrt(eps) and blows the activations up.
        self.moving_var = np.ones(shape)

    def forward(self, X):
        """Normalize ``X`` and store the updated moving statistics."""
        # Move the running statistics to X's context if they live elsewhere.
        if self.moving_mean.ctx != X.ctx:
            self.moving_mean = self.moving_mean.as_in_ctx(X.ctx)
            self.moving_var = self.moving_var.as_in_ctx(X.ctx)
        # Keep the refreshed moving_mean and moving_var for later calls.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma.data(), self.beta.data(), self.moving_mean,
            self.moving_var, eps=1e-12, momentum=0.9)
        return Y
3.1 在LeNet中使用批歸一化層
# LeNet with a custom BatchNorm layer between every conv/dense layer and its
# sigmoid activation. Layers are added stage by stage, in order.
net = nn.Sequential()
# Convolution stage 1.
net.add(nn.Conv2D(6, kernel_size=5),
        BatchNorm(6, num_dims=4),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Convolution stage 2.
net.add(nn.Conv2D(16, kernel_size=5),
        BatchNorm(16, num_dims=4),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Dense stage 1.
net.add(nn.Dense(120),
        BatchNorm(120, num_dims=2),
        nn.Activation('sigmoid'))
# Dense stage 2.
net.add(nn.Dense(84),
        BatchNorm(84, num_dims=2),
        nn.Activation('sigmoid'))
# Output layer.
net.add(nn.Dense(10))
3.2 訓練
import sys
import time


def get_workers(num):
    """Return the number of data-loading workers to use on this platform.

    Returns 0 on Windows (the original note says multi-worker loading cannot
    be used there) and ``num`` everywhere else.
    """
    return 0 if sys.platform.startswith('win') else num


def loader(data, batch_size, shuffle=True, workers=6):
    """Wrap ``data`` in a ``gluon.data.DataLoader``."""
    return gluon.data.DataLoader(data, batch_size, shuffle=shuffle,
                                 num_workers=get_workers(workers))


def load_data(batch_size, resize=None):
    """Build Fashion-MNIST train and test loaders.

    Args:
        batch_size: Batch size for both loaders.
        resize: Optional size passed to ``transforms.Resize``; skipped when
            falsy.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    dataset = gluon.data.vision
    trans = [dataset.transforms.Resize(resize)] if resize else []
    trans.append(dataset.transforms.ToTensor())
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans)
    return loader(mnist_train, batch_size), loader(mnist_test, batch_size, False)


def accuracy(y_hat, y):
    """Return the number of correct predictions as a float.

    When ``y_hat`` has more than one column it is reduced with ``argmax``
    over axis 1 before being compared with ``y``.
    """
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.astype(y.dtype) == y
    return float(cmp.sum())


def train_epoch(net, train_iter, loss, updater):
    """Train ``net`` for one pass over ``train_iter``.

    Args:
        net: Model called on each batch.
        train_iter: Iterable of ``(X, y)`` batches.
        loss: Loss callable applied to ``(prediction, y)``.
        updater: A ``gluon.Trainer`` (its ``step`` is used) or a callable
            taking the batch size.

    Returns:
        ``(mean loss per sample, training accuracy)``.
    """
    l_sum = acc_rate = total = 0
    if isinstance(updater, gluon.Trainer):
        updater = updater.step
    for X, y in train_iter:
        X = X.as_in_ctx(ctx)
        y = y.as_in_ctx(ctx)
        with autograd.record():
            pre_y = net(X)
            l = loss(pre_y, y)
        l.backward()
        updater(y.size)
        l_sum += float(l.sum())
        acc_rate += accuracy(pre_y, y)
        total += y.size
    return l_sum / total, acc_rate / total


def evaluate_accuracy(net, data_iter):
    """Return the fraction of samples in ``data_iter`` that ``net`` gets right."""
    match_num = total_num = 0
    for X, y in data_iter:
        X = X.as_in_ctx(ctx)
        y = y.as_in_ctx(ctx)
        match_num += accuracy(net(X), y)
        total_num += y.size
    return match_num / total_num


def train(net, train_iter, test_iter, epochs, lr):
    """Initialize ``net``, train it with SGD and plot the learning curves.

    Args:
        net: Model to (re)initialize with Xavier init on ``ctx`` and train.
        train_iter: Training batches.
        test_iter: Batches used to measure test accuracy after each epoch.
        epochs: Number of epochs to run.
        lr: SGD learning rate.
    """
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    l_lst, acc_lst, test_acc_lst = [], [], []
    timer = 0  # accumulated training time only; evaluation is excluded
    print("----------------start------------------")
    for epoch in range(epochs):
        start = time.time()
        l, acc = train_epoch(net, train_iter, loss, trainer)
        timer += time.time() - start
        test_acc = evaluate_accuracy(net, test_iter)
        print(f'[epoch {epoch+1}] loss {l:.3f}, train acc {acc:.3f}, '
              f'test acc {test_acc:.3f}')
        l_lst.append(l)
        acc_lst.append(acc)
        test_acc_lst.append(test_acc)
    # Only summarize when at least one epoch ran; otherwise there are no
    # final metrics to report.
    if l_lst:
        print(f'loss {l_lst[-1]:.3f}, train acc {acc_lst[-1]:.3f}, '
              f'test acc {test_acc_lst[-1]:.3f}')
    print(f'{timer:.1f} sec, on {str(ctx)}')
    draw_graph([l_lst, acc_lst, test_acc_lst])


def draw_graph(result):
    """Plot train loss, train accuracy and test accuracy against epoch.

    Args:
        result: Sequence of per-epoch series, in the order
            train loss, train accuracy, test accuracy.
    """
    data = []
    colors = ['aquamarine', 'orange', 'hotpink']
    names = ['train loss', 'train acc', 'test acc']
    symbols = ['circle-open', 'cross-open', 'triangle-up-open']
    for i, info in enumerate(result):
        trace = go.Scatter(
            # Derive the x axis from the series itself rather than from a
            # global epoch count, so the plot always matches the data.
            x=list(range(1, len(info) + 1)),
            y=info,
            mode='lines+markers',
            name=names[i],
            marker={
                'color': colors[i],
                'symbol': symbols[i],
            },
        )
        data.append(trace)
    fig = go.Figure(data=data)
    fig.update_layout(xaxis_title='epochs', width=800, height=480)
    fig.show()
- 依舊運行10epochs
# Hyperparameters for the run.
lr = 1.0
num_epochs = 10
batch_size = 256

# Build the Fashion-MNIST loaders and train the batch-normalized LeNet.
train_iter, test_iter = load_data(batch_size)
train(net, train_iter, test_iter, num_epochs, lr)
- 第一批歸一化層學到的scale參數gamma和shift參數beta。
4. 使用api簡化
# The same LeNet layout, using Gluon's built-in nn.BatchNorm layer.
# Layers are added stage by stage, in order.
NeLet = nn.Sequential()
# Convolution stage 1.
NeLet.add(nn.Conv2D(6, kernel_size=5),
          nn.BatchNorm(),
          nn.Activation('sigmoid'),
          nn.MaxPool2D(pool_size=2, strides=2))
# Convolution stage 2.
NeLet.add(nn.Conv2D(16, kernel_size=5),
          nn.BatchNorm(),
          nn.Activation('sigmoid'),
          nn.MaxPool2D(pool_size=2, strides=2))
# Dense stage 1.
NeLet.add(nn.Dense(120),
          nn.BatchNorm(),
          nn.Activation('sigmoid'))
# Dense stage 2.
NeLet.add(nn.Dense(84),
          nn.BatchNorm(),
          nn.Activation('sigmoid'))
# Output layer.
NeLet.add(nn.Dense(10))
5. 預測
import random

import plotly.express as px
from plotly.subplots import make_subplots


def get_fashion_mnist_labels(labels):
    """Map numeric Fashion-MNIST class indices to their text labels."""
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]


def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Show ``imgs`` as heatmaps in a ``num_rows`` x ``num_cols`` grid.

    Args:
        imgs: Iterable of images; each must provide ``asnumpy()``.
        num_rows: Number of subplot rows.
        num_cols: Number of subplot columns.
        titles: Optional subplot titles, passed to ``make_subplots``.
        scale: Currently unused; kept so existing callers keep working.
    """
    colorscales = px.colors.named_colorscales()
    fig = make_subplots(num_rows, num_cols, subplot_titles=titles)
    for i, img in enumerate(imgs):
        # Fill the grid row by row instead of always using row 1.
        row, col = divmod(i, num_cols)
        row, col = row + 1, col + 1
        # Wrap around so many images cannot index past the colorscale list.
        colorscale = colorscales[(i + 3) % len(colorscales)]
        # Rows are reversed so the image is drawn upright in the heatmap.
        fig.add_trace(go.Heatmap(z=img.asnumpy()[::-1], showscale=False,
                                 colorscale=colorscale), row, col)
        fig.update_xaxes(visible=False, row=row, col=col)
        fig.update_yaxes(visible=False, row=row, col=col)
    # 270 px per row; identical to the previous fixed height for one row.
    fig.update_layout(height=270 * num_rows)
    fig.show()


def predict(net, test_iter, stop, n=8):
    """Show true and predicted labels for the first ``n`` images of a batch.

    The batch at index ``stop`` is used; if ``test_iter`` has fewer batches,
    the last one is used instead.

    Args:
        net: Model used to classify the batch.
        test_iter: Iterable of ``(X, y)`` batches.
        stop: Index of the batch to display.
        n: Number of images to display.

    Raises:
        ValueError: If ``test_iter`` yields no batches.
    """
    batch = None
    for i, batch in enumerate(test_iter):
        if i == stop:
            break
    if batch is None:
        raise ValueError('test_iter yielded no batches')
    X, y = batch
    X, y = X.as_in_ctx(ctx), y.as_in_ctx(ctx)
    # Only the first n labels are displayed, so convert just those.
    trues = get_fashion_mnist_labels(y[:n])
    preds = get_fashion_mnist_labels(net(X).argmax(axis=1)[:n])
    titles = [f"true: {t} <br> pre: {p}" for t, p in zip(trues, preds)]
    show_images(X[:n].reshape((-1, 28, 28)), 1, n, titles=titles)


# Pick one of the first ten test batches at random and display predictions.
# NOTE(review): no initialize()/train() call for NeLet is visible in this
# file; it presumably has to be trained (e.g. train(NeLet, ...)) before this
# runs — confirm.
stop = random.randrange(10)
predict(NeLet, test_iter, stop)
