Network structure (6c-2s-12c-2s):
Network initialization:
\begin{align}\notag W \sim U\left(- \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}} , \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}}\right) \end{align}
\begin{align}\notag Var(W) = \frac{1}{n_j} ;\quad Var(W) = \frac{1}{n_{j+1}} ;\quad Var(W) = \frac{2}{n_j + n_{j+1}} \end{align}

These are the Xavier conditions on the weight variance: the first keeps the variance of the forward-propagated activations, the second keeps the variance of the back-propagated gradients, and the third is the compromise between the two. Since $ U(-a,a) $ has variance $ a^2/3 $, the compromise gives the bound $ a = \sqrt{6}/\sqrt{n_j+n_{j+1}} $ used above.
The biases $ b $ are all initialized to $ 0 $, and the weights $ W $ are set to $ random(-1,1)\cdot\sqrt{\frac{6}{fan_{in} + fan_{out}}} \sim U(- \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}} , \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}}) $, where $ n_j $ is the number of units in layer $ j $, $ fan_{in} = \text{number of input maps} \times \text{kernel size}^2 $, and $ fan_{out} = \text{number of output maps} \times \text{kernel size}^2 $. The setup code below implements this; a small numeric illustration follows it.
for l = 1 : numel(net.layers)   %  layer
    if strcmp(net.layers{l}.type, 's')
        mapsize = mapsize / net.layers{l}.scale;
        assert(all(floor(mapsize)==mapsize), ['Layer ' num2str(l) ' size must be integer. Actual: ' num2str(mapsize)]);
        for j = 1 : inputmaps
            net.layers{l}.b{j} = 0;
        end
    end
    if strcmp(net.layers{l}.type, 'c')
        mapsize = mapsize - net.layers{l}.kernelsize + 1;
        fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2;
        for j = 1 : net.layers{l}.outputmaps   %  output map
            fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
            for i = 1 : inputmaps   %  input map
                net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out));
            end
            net.layers{l}.b{j} = 0;
        end
        inputmaps = net.layers{l}.outputmaps;
    end
end
%  'onum' is the number of labels, that's why it is calculated using size(y, 1). If you have 20 labels so the output of the network will be 20 neurons.
%  'fvnum' is the number of output neurons at the last layer, the layer just before the output layer.
%  'ffb' is the biases of the output neurons.
%  'ffW' is the weights between the last layer and the output neurons. Note that the last layer is fully connected to the output layer, that's why the size of the weights is (onum * fvnum)
fvnum = prod(mapsize) * inputmaps;
onum = size(y, 1);
net.ffb = zeros(onum, 1);
net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));
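As a concrete illustration of the fan-in/fan-out computation, the sketch below evaluates the uniform bound for the first convolution layer of the 6c-2s-12c-2s network, assuming 5x5 kernels and a single-channel input (neither is fixed by the text above, so treat the numbers as an example only):

%  Illustration only: uniform-initialisation bound for the first 'c' layer,
%  assuming 5x5 kernels and a single-channel (grey-scale) input.
kernelsize = 5;
inputmaps  = 1;                              %  input channels (assumed)
outputmaps = 6;                              %  first convolution layer: 6 maps
fan_in  = inputmaps  * kernelsize ^ 2;       %  25
fan_out = outputmaps * kernelsize ^ 2;       %  150
bound   = sqrt(6 / (fan_in + fan_out));      %  ~= 0.1852
k = (rand(kernelsize) - 0.5) * 2 * bound;    %  one 5x5 kernel drawn from U(-bound, bound)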
Forward propagation (convolution layer):
\begin{align}\notag x_j^l = f(\sum_{i\in M_j} x_i^{l-1} * k_{ij}^l + b_j^l) \end{align}
%  !!below can probably be handled by insane matrix operations
for j = 1 : net.layers{l}.outputmaps   %  for each output map
    %  create temp output map
    z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);
    for i = 1 : inputmaps   %  for each input map
        %  convolve with corresponding kernel and add to temp output map
        z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');
    end
    %  add bias, pass through nonlinearity
    net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});
end
%  set number of input maps to this layers number of outputmaps
inputmaps = net.layers{l}.outputmaps;
Forward propagation (subsampling layer):
\begin{align}\notag x_j^l = f(\beta_j^l down(x_j^{l-1}) + b_j^l) \end{align}

In the code below the subsampling step is plain mean pooling: no bias is added and no nonlinearity is applied.
%  downsample
for j = 1 : inputmaps
    z = convn(net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2), 'valid');   %  !! replace with variable
    net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);
end
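A toy example of what these two lines do for scale = 2 (the variable names here are illustrative, not part of the toolbox): the valid convolution with ones(2)/4 produces the mean of every 2x2 window, and the strided indexing keeps only the non-overlapping windows.

%  Toy mean-pooling example with scale = 2 (illustrative variables only)
A = magic(4);                            %  a 4x4 "feature map"
z = convn(A, ones(2) / 4, 'valid');      %  3x3 map: mean of every 2x2 window
pooled = z(1 : 2 : end, 1 : 2 : end)     %  keep every 2nd row/col -> 2x2 result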
Forward propagation (output layer):
%  concatenate all end layer feature maps into vector
net.fv = [];
for j = 1 : numel(net.layers{n}.a)
    sa = size(net.layers{n}.a{j});
    net.fv = [net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3))];
end
%  feedforward into output perceptrons
net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));
Derivative of the sigmoid function:
\begin{align}\notag f(x) = \frac{1}{1+e^{-x}} ;\quad f^\prime(x) = \frac{e^{-x}}{(1+e^{-x})^2} = f(x) \cdot [1 - f(x)] \end{align}
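A quick numerical sanity check of this identity (the local sigm handle below is a stand-in for the toolbox helper of the same name):

%  Verify f'(x) = f(x) * (1 - f(x)) with a central difference
sigm = @(x) 1 ./ (1 + exp(-x));
x = -5 : 0.5 : 5;
analytic = sigm(x) .* (1 - sigm(x));
numeric  = (sigm(x + 1e-6) - sigm(x - 1e-6)) / 2e-6;
max(abs(analytic - numeric))             %  on the order of 1e-10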
For the last layer of the network (the output layer), compute the residual between the network output and the target values:
\begin{align}\notag \delta^n = -(y-a^n)\cdot f^\prime(z^n) \end{align}
%  error
net.e = net.o - y;
%%  backprop deltas
net.od = net.e .* (net.o .* (1 - net.o));   %  output delta
For the hidden layers $ l = n-1, n-2, n-3, \ldots, 2 $, compute the residual at each node:
\begin{align}\notag \delta^l = ({(W^l)}^T \delta^{l+1}) \cdot f^\prime(z^l) \end{align}
net.fvd = (net.ffW' * net.od);             %  feature vector delta
if strcmp(net.layers{n}.type, 'c')         %  only conv layers has sigm function
    net.fvd = net.fvd .* (net.fv .* (1 - net.fv));
end
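The convolutional backprop below reads per-map deltas net.layers{n}.d{j}, so net.fvd first has to be split back into map form. A sketch of that step, simply inverting the concatenation order used to build net.fv in the forward pass:

%  Sketch: undo the feature-vector concatenation for the last layer
sa = size(net.layers{n}.a{1});
fvnum = sa(1) * sa(2);
for j = 1 : numel(net.layers{n}.a)
    net.layers{n}.d{j} = reshape(net.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));
end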
Backpropagation (subsampling layer whose next layer is a convolution layer):
\begin{align}\notag \delta_j^l = f^\prime(u_j^l)\circ conv2(\delta_j^{l+1},rot180(k_j^{l+1}),'full') \end{align}

In this toolbox the subsampling layers have no nonlinearity, so the code below drops the $ f^\prime $ factor and simply accumulates the full convolutions over all output maps of layer $ l+1 $.
for i = 1 : numel(net.layers{l}.a)
    z = zeros(size(net.layers{l}.a{1}));
    for j = 1 : numel(net.layers{l + 1}.a)
        z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');
    end
    net.layers{l}.d{i} = z;
end
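rot180 is not defined in the excerpt; a minimal stand-in consistent with how it is used here flips the kernel in both spatial dimensions:

%  Minimal stand-in for the toolbox's rot180: flip in both spatial dimensions
%  (for the 2-D kernels used here this equals rot90(X, 2))
rot180 = @(X) flip(flip(X, 1), 2);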
Backpropagation (convolution layer whose next layer is a subsampling layer):
\begin{align}\notag \delta_j^l = \beta_j^{l+1}(f^\prime(u_j^l) \circ up(\delta_j^{l+1})) \end{align}

In the code below $ up(\cdot) $ is implemented by expand, $ f^\prime(u_j^l) $ is the $ a \cdot (1-a) $ factor, and the division by $ scale^2 $ plays the role of $ \beta_j^{l+1} $, since the subsampling layer takes the mean of each window.
for j = 1 : numel(net.layers{l}.a)
    net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
end
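expand comes from the toolbox; it replicates each element of the pooled delta into a scale x scale block. For a single 2-D map the same effect can be obtained with kron, as in this toy sketch:

%  Toy upsampling of a pooled delta map for scale = 2 (illustrative only)
d  = [1 2; 3 4];                         %  a 2x2 pooled delta map
up = kron(d, ones(2));                   %  4x4: every entry repeated in a 2x2 block
up = up / 2 ^ 2;                         %  spread the gradient evenly over the window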
Compute the partial derivatives that are ultimately needed:
\begin{align}\notag \nabla_{W^l}J(W,b;x,y) = \delta^{l+1}(a^l)^T \end{align}

\begin{align}\notag \nabla_{b^l}J(W,b;x,y) = \delta^{l+1} \end{align}

\begin{align}\notag \nabla_{W^l}J(W,b) = \left[\frac{1}{m}\sum_{i=1}^m\nabla_{W^l}J(W,b;x^{(i)},y^{(i)})\right]+\lambda W^l \end{align}

\begin{align}\notag \nabla_{b^l}J(W,b) = \frac{1}{m}\sum_{i=1}^m\nabla_{b^l}J(W,b;x^{(i)},y^{(i)}) \end{align}

For the convolution layers, the kernel gradient is obtained by convolving the corresponding input map with the (rotated) delta map, and the bias gradient is the sum over the delta map:

\begin{align}\notag \frac{\partial E}{\partial k_{ij}^l} = rot180(conv2(x_i^{l-1},rot180(\delta_j^l),'valid')) \end{align}

\begin{align}\notag \frac{\partial E}{\partial b_j} = \sum_{u,v}(\delta_j^l)_{uv} \end{align}
for l = 2 : n
    if strcmp(net.layers{l}.type, 'c')
        for j = 1 : numel(net.layers{l}.a)
            for i = 1 : numel(net.layers{l - 1}.a)
                net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);
            end
            net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);
        end
    end
end
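flipall is also not defined in the excerpt; a definition consistent with its use here flips the input array along every dimension, so that the convn call above effectively correlates each input map with the corresponding delta map and averages over the batch:

function X = flipall(X)
    %  Flip X along every dimension (spatial dims and the sample dim)
    for d = 1 : ndims(X)
        X = flip(X, d);
    end
end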
net.dffW = net.od * (net.fv)' / size(net.od, 2);
net.dffb = mean(net.od, 2);