1. 轉置卷積
讓我們考慮一個基本情況:輸入和輸出通道數均爲 1,填充爲 0,步幅爲 1。下圖說明了轉置卷積如何使用 $2 \times 2$ 的卷積核,由 $2 \times 2$ 的輸入矩陣計算得到 $3 \times 3$ 的輸出。
def trans_conv(X, K):
    """Compute the 2-D transposed convolution of ``X`` with kernel ``K``.

    Stride is 1 and no padding is applied: each input element scales the
    kernel, and the scaled kernel is accumulated into the output window
    whose top-left corner sits at that element's position.

    Args:
        X: 2-D input array.
        K: 2-D kernel array.

    Returns:
        Array of shape
        ``(X.shape[0] + K.shape[0] - 1, X.shape[1] + K.shape[1] - 1)``.
    """
    k_rows, k_cols = K.shape
    in_rows, in_cols = X.shape[0], X.shape[1]
    Y = np.zeros((in_rows + k_rows - 1, in_cols + k_cols - 1))
    for row in range(in_rows):
        for col in range(in_cols):
            # Stamp the kernel, weighted by the current input element.
            Y[row:row + k_rows, col:col + k_cols] += X[row, col] * K
    return Y


# Worked example: 2x2 input and 2x2 kernel give a 3x3 output.
X = np.array([[0, 1], [2, 3]])
K = np.array([[0, 1], [2, 3]])
trans_conv(X, K)
使用 gluon 的 `nn.Conv2DTranspose` 可以得到相同的結果。與 `nn.Conv2D` 一樣,輸入和卷積核均應爲 4 維張量。
# Reshape the 2x2 input and kernel into 4-D (1, 1, 2, 2) tensors before
# handing them to the Gluon layer.
X = X.reshape(1, 1, 2, 2)
K = K.reshape(1, 1, 2, 2)

# Transposed-convolution layer with one output channel whose weights are
# set to K.
tconv = nn.Conv2DTranspose(1, kernel_size=2)
tconv.initialize(init.Constant(K))
tconv(X)
1.1 填充,步幅和通道設置
在卷積中,填充作用於輸入;而在轉置卷積中,填充作用於輸出。$1 \times 1$ 的 padding 表示我們先按正常方式計算輸出,然後刪除輸出的第一行/列和最後一行/列。
# Same kernel as before, now with padding=1 on the transposed convolution.
tconv = nn.Conv2DTranspose(1, kernel_size=2, padding=1)
tconv.initialize(init.Constant(K))
tconv(X)  # array([[[[4.]]]])
# Same kernel, now applied with a stride of 2.
tconv = nn.Conv2DTranspose(1, kernel_size=2, strides=2)
tconv.initialize(init.Constant(K))
tconv(X)
# A convolution followed by a transposed convolution with matching
# kernel_size / padding / strides (and the channel counts swapped back)
# yields an output with the same shape as the original input.
X = np.random.uniform(size=(1, 10, 16, 16))

conv = nn.Conv2D(20, kernel_size=5, padding=2, strides=3)
conv.initialize()

tconv = nn.Conv2DTranspose(10, kernel_size=5, padding=2, strides=3)
tconv.initialize()

tconv(conv(X)).shape == X.shape  # True
2. 全卷積網絡(FCN)
2.1 建立模型
全卷積網絡首先使用卷積神經網絡來提取圖像特徵,然後通過 $1 \times 1$ 卷積層將通道數轉換爲類別數,最後通過轉置卷積層將特徵圖的高度和寬度轉換爲輸入圖像的大小。模型輸出與輸入圖像具有相同的高度和寬度,並且在空間位置上具有一一對應的關係。最終輸出的通道包含相應空間位置上像素的類別預測。
下面使用在 ImageNet 上預訓練的 ResNet-18 模型進行微調。模型成員變量 `features` 的最後兩層是全局平均池化層 `GlobalAvgPool2D` 和樣本展平層 `Flatten`;`output` 模塊則包含用於輸出的全連接層。全卷積網絡不需要這些層。
# Load ResNet-18 (v2) with pretrained weights from the Gluon model zoo,
# then look at the last four feature layers and the output module.
pretrained_net = gluon.model_zoo.vision.resnet18_v2(pretrained=True)
pretrained_net.features[-4:], pretrained_net.output
# Build the FCN backbone from every pretrained feature layer except the
# final two.
net = nn.HybridSequential()
for layer in pretrained_net.features[:-2]:
    net.add(layer)
# Forward a random 320x480 three-channel input to inspect the backbone's
# output shape.
X = np.random.uniform(size=(1, 3, 320, 480))
net(X).shape  # (1, 512, 10, 15)
接下來需要通過 $1 \times 1$ 卷積層將輸出通道數轉換爲數據集的類別數量,這裏 Pascal VOC2012 的類別數爲 21。然後通過轉置卷積層將特徵圖的寬高放大爲原來的 32 倍:只要將步幅設置爲 32,將 padding 設置爲 $32 / 2 = 16$,並將 kernel 設置爲 $64 \times 64$,即可達到放大 32 倍的效果(輸出尺寸爲 $(n - 1) \times 32 - 2 \times 16 + 64 = 32n$)。
# Classification head: a 1x1 convolution mapping to num_classes channels,
# followed by a transposed convolution (stride 32, 64x64 kernel,
# padding 16).
num_classes = 21
net.add(
    nn.Conv2D(num_classes, kernel_size=1),
    nn.Conv2DTranspose(
        num_classes,
        kernel_size=64,
        padding=16,
        strides=32,
    ),
)
2.2 初始化轉置卷積層
我們已經知道轉置卷積層可以放大特徵圖。在圖像處理中,有時我們需要放大圖像,即上採樣。上採樣的方法很多,一種常見的方法是雙線性插值。簡單來說,爲了得到輸出圖像在座標 $(x, y)$ 處的像素,首先將該座標映射到輸入圖像的座標 $(x', y')$,然後在輸入圖像上找到最接近 $(x', y')$ 的 4 個像素,最後根據這 4 個像素與 $(x', y')$ 的相對距離計算輸出圖像在 $(x, y)$ 處的像素值。下面構建一個函數,生成用於雙線性插值上採樣的卷積核。
def bilinear_kernel(in_channels, out_channels, kernel_size):
    """Build transposed-convolution weights that use a bilinear filter.

    The 2-D filter is the outer product of a 1-D triangular profile with
    itself. It is written to the ``(i, i)`` channel pairs of the weight
    tensor; every other channel pair stays zero.

    Args:
        in_channels: Size of the first weight axis.
        out_channels: Size of the second weight axis.
        kernel_size: Height and width of the square kernel.

    Returns:
        Array of shape
        ``(in_channels, out_channels, kernel_size, kernel_size)``.
    """
    factor = (kernel_size + 1) // 2
    # Odd kernels are centred on a pixel, even kernels between two pixels.
    center = factor - 1 if kernel_size % 2 == 1 else factor - 0.5

    # 1-D triangular profile: 1 at the centre, falling off linearly.
    positions = np.arange(kernel_size)
    profile = 1 - np.abs(positions - center) / factor
    filt = profile.reshape(-1, 1) * profile.reshape(1, -1)

    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size))
    # Paired index ranges select the (i, i) channel pairs.
    weight[range(in_channels), range(out_channels), :, :] = filt
    return np.array(weight)
# Three-channel transposed convolution (4x4 kernel, stride 2, padding 1)
# whose weights come from bilinear_kernel.
conv_trans = nn.Conv2DTranspose(3, kernel_size=4, padding=1, strides=2)
conv_trans.initialize(init.Constant(bilinear_kernel(3, 3, 4)))
# Read the image, move the channel axis first, add a batch axis and
# divide by 255.
img = image.imread('img/catdog.jpg')
X = np.expand_dims(img.astype('float32').transpose(2, 0, 1), axis=0)/255

# Run it through the bilinear transposed convolution and move the channel
# axis back to the end.
Y = conv_trans(X)
out_img = Y[0].transpose(1, 2, 0)

print('輸入圖片形狀:', img.shape)
print('處理過得輸出形狀:', out_img.shape)
px.imshow(
    out_img.asnumpy(),
    width=img.shape[1]/2,
    height=img.shape[0]/2,
)
初始化轉置卷積層和 $1 \times 1$ 卷積層
# The last layer (transposed convolution) takes bilinear weights; the
# layer before it (1x1 convolution) uses Xavier initialisation.
W = bilinear_kernel(num_classes, num_classes, 64)
net[-1].initialize(init.Constant(W))
net[-2].initialize(init=init.Xavier())
3. 訓練
def accuracy(y_hat, y):
    """Count the predictions in ``y_hat`` that equal the labels ``y``.

    When ``y_hat`` has more than one axis and more than one entry along
    axis 1, it is first reduced with ``argmax(axis=1)``.

    Returns:
        Number of matching elements, as a float.
    """
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    matches = y_hat.astype(y.dtype) == y
    return float(matches.sum())


def train_batch(net, features, labels, loss, trainer, devices,
                split_f=d2l.split_batch):
    """Run one optimisation step on a mini-batch split across ``devices``.

    Args:
        net: Model to train.
        features, labels: The mini-batch, sharded with ``split_f``.
        loss: Loss function called as ``loss(pred, label)``.
        trainer: Trainer whose ``step`` is called once for the batch.
        devices: Devices passed through to ``split_f``.
        split_f: Function that splits features and labels into shards.

    Returns:
        Tuple ``(summed loss, number of correct predictions)`` over all
        shards.
    """
    X_shards, y_shards = split_f(features, labels, devices)
    with autograd.record():
        pred_shards = [net(X_shard) for X_shard in X_shards]
        shard_losses = [
            loss(pred_shard, y_shard)
            for pred_shard, y_shard in zip(pred_shards, y_shards)
        ]
    for shard_loss in shard_losses:
        shard_loss.backward()
    # ignore_stale_grad=True lets the step proceed even when some
    # parameters only hold stale (old) gradients.
    trainer.step(labels.shape[0], ignore_stale_grad=True)
    train_loss_sum = sum(float(shard_loss.sum()) for shard_loss in shard_losses)
    train_acc_sum = sum(
        accuracy(pred_shard, y_shard)
        for pred_shard, y_shard in zip(pred_shards, y_shards)
    )
    return train_loss_sum, train_acc_sum


def train(net, train_iter, test_iter, loss, trainer, num_epochs,
          devices=d2l.try_all_gpus(), split_f=d2l.split_batch):
    """Train ``net``, log progress per epoch, and plot the metric curves.

    Training loss and accuracy are recorded roughly five times per epoch.
    Test accuracy is evaluated once at the end of every epoch. A plotly
    figure with the three curves is shown when training finishes.

    Args:
        net: Model to train.
        train_iter: Iterable of ``(features, labels)`` training batches;
            must support ``len``.
        test_iter: Data passed to ``d2l.evaluate_accuracy_gpus``.
        loss: Loss function forwarded to ``train_batch``.
        trainer: Trainer forwarded to ``train_batch``.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices to train on.
        split_f: Function that splits a batch across ``devices``.
    """
    num_batches, timer = len(train_iter), d2l.Timer()
    # Record training metrics about five times per epoch. Clamp to 1 so
    # that fewer than five batches does not cause a modulo by zero.
    log_interval = max(1, num_batches // 5)
    epochs_lst, loss_lst, train_acc_lst, test_acc_lst = [], [], [], []
    for epoch in range(num_epochs):
        # Accumulates: loss sum, correct predictions, labels.shape[0],
        # labels.size.
        metric = d2l.Accumulator(4)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            batch_loss, batch_acc = train_batch(
                net, features, labels, loss, trainer, devices, split_f)
            metric.add(batch_loss, batch_acc, labels.shape[0], labels.size)
            timer.stop()
            if (i + 1) % log_interval == 0:
                epochs_lst.append(epoch + i / num_batches)
                loss_lst.append(metric[0] / metric[2])
                train_acc_lst.append(metric[1] / metric[3])
        # One test evaluation per epoch; plotted below at x = 1..num_epochs.
        test_acc_lst.append(
            d2l.evaluate_accuracy_gpus(net, test_iter, split_f))
        # The value printed last is test accuracy, so label it as such.
        print(f"[epoch {epoch+1}] train loss: {metric[0] / metric[2]:.3f} train acc: {metric[1] / metric[3]:.3f}",
              f" test acc: {test_acc_lst[-1]:.3f}")
    print(f'loss {metric[0] / metric[2]:.3f}, train acc '
          f'{metric[1] / metric[3]:.3f}, test acc {test_acc_lst[-1]:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on '
          f'{str(devices)}')

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=epochs_lst, y=loss_lst, name='train loss'))
    fig.add_trace(go.Scatter(x=epochs_lst, y=train_acc_lst, name='train acc'))
    fig.add_trace(go.Scatter(x=list(range(1, len(test_acc_lst) + 1)),
                             y=test_acc_lst, name='test acc'))
    fig.update_layout(width=800, height=480, xaxis_title='epoch',
                      yaxis_range=[0, 1])
    fig.show()
# Build the training and test iterators with a batch size of 16.
batch_size = 16
train_iter, test_iter = load_data_voc(batch_size, crop_size)
# Hyperparameters and target device.
num_epochs = 5
lr = 0.1
wd = 1e-3
devices = [npx.gpu()]

# axis=1 tells the loss to take the softmax over axis 1 of the prediction.
loss = gluon.loss.SoftmaxCrossEntropyLoss(axis=1)
net.collect_params().reset_ctx(devices)
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': lr, 'wd': wd},
)
train(net, train_iter, test_iter, loss, trainer, num_epochs, devices)
def predict(img):
    """Predict a per-pixel class-index map for a single image.

    The image is normalised with the test dataset's ``normalize_image``,
    moved to channel-first layout with a leading batch axis, and run
    through ``net`` on the first device. The argmax over axis 1 gives the
    class index per pixel.

    Returns:
        2-D array of predicted class indices.
    """
    normalized = test_iter._dataset.normalize_image(img)
    batch = np.expand_dims(normalized.transpose(2, 0, 1), axis=0)
    pred = net(batch.as_in_ctx(devices[0])).argmax(axis=1)
    # Drop the batch axis.
    return pred.reshape(pred.shape[1], pred.shape[2])


def label2image(pred):
    """Convert a class-index map into an image via ``VOC_COLORMAP``.

    Each index in ``pred`` selects the corresponding row of the colormap.
    """
    colormap = VOC_COLORMAP.as_in_ctx(devices[0])
    class_idx = pred.astype('int32')
    return colormap[class_idx, :]
# Show the first n test images with their predictions and their labels.
test_images, test_labels = d2l.read_voc_images(voc_dir, False)
n, imgs = 4, []
for i in range(n):
    # Every image and label is cropped with the same fixed rectangle.
    crop_rect = (0, 0, 480, 320)
    X = image.fixed_crop(test_images[i], *crop_rect)
    pred = label2image(predict(X))
    label = image.fixed_crop(test_labels[i], *crop_rect)
    imgs += [X, pred, label]
# imgs is interleaved as [image, prediction, label, ...]; regroup it into
# all images, then all predictions, then all labels.
Image(show_imgs(imgs[::3] + imgs[1::3] + imgs[2::3], 3, n, scale=1.5))