（原）caffe中的conv

時間 2019-11-06

標籤 caffe conv 简体版

原文鏈接

轉載請註明出處：

http://www.javashuo.com/article/p-odautfpi-e.html

conv整體調用流程如下圖所示：

說明：帶o的爲輸出，如Wo代表輸出寬度；帶i的爲輸入，如Hi代表輸入高度。

1. 前向傳播的計算ConvolutionLayer<Dtype>::Forward_cpu

注：不考慮反向傳播的計算過程…

前向傳播時，分別調用base_conv_layer.cpp中的BaseConvolutionLayer<Dtype>::forward_cpu_gemm和base_conv_layer.cpp中的BaseConvolutionLayer<Dtype>::forward_cpu_bias。

 1     template <typename Dtype>  // CPU forward pass: for every input blob and every batch item, run the GEMM convolution, then add the bias if enabled
 2     void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 3         const vector<Blob<Dtype>*>& top) 
 4     {
 5         const Dtype* weight = this->blobs_[0]->cpu_data();     // weight parameters
 6         for (int i = 0; i < bottom.size(); ++i) {              // loop over the input blobs (the author notes a single input is the common case)
 7             const Dtype* bottom_data = bottom[i]->cpu_data();    // i-th input: N*C*Hi*Wi
 8             Dtype* top_data = top[i]->mutable_cpu_data();        // i-th output: N*C*Ho*Wo
 9             for (int n = 0; n < this->num_; ++n) {               // loop over the batch (num_ = batch size)
10                 // forward_cpu_gemm receives the start of the n-th batch item of the input (C*Hi*Wi values), the weights (No*Ni*Kh*Kw), and the start of the n-th batch item of the output (C*Ho*Wo values)
11                 this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight, top_data + n * this->top_dim_);
12                 if (this->bias_term_) {                            // layer has a bias term
13                     const Dtype* bias = this->blobs_[1]->cpu_data(); // bias parameters
14                     this->forward_cpu_bias(top_data + n * this->top_dim_, bias);  // add the bias to the output of this batch item
15                 }
16             }
17         }
18     }

在forward之前，計算輸出特徵尺寸的函數爲compute_output_shape。

 1     template <typename Dtype>  // Fills output_shape_ with the output size of each spatial axis
 2     void ConvolutionLayer<Dtype>::compute_output_shape() {
 3         const int* kernel_shape_data = this->kernel_shape_.cpu_data();
 4         const int* stride_data = this->stride_.cpu_data();
 5         const int* pad_data = this->pad_.cpu_data();
 6         const int* dilation_data = this->dilation_.cpu_data();   // kernel dilation per axis (the author notes a default of 1); dilation inserts zeros between kernel elements
 7         this->output_shape_.clear();
 8         for (int i = 0; i < this->num_spatial_axes_; ++i) {   // one iteration per spatial axis (H and W; the author notes num_spatial_axes_ = 2)
 9             // i + 1 to skip channel axis
10             const int input_dim = this->input_shape(i + 1); // helper shown by the author: inline int input_shape(int i) {return (*bottom_shape_)[channel_axis_ + i];}
11             const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;  // kernel size along this axis after dilation
12             const int output_dim = (input_dim + 2 * pad_data[i] - kernel_extent) / stride_data[i] + 1;  // output size along this axis (integer division)
13             this->output_shape_.push_back(output_dim);   // append the output height / width
14         }
15     }

2. forward_cpu_gemm

該函數首先判斷是否爲1*1的卷積，如果不是，則調用conv_im2col_cpu函數，將輸入CiHiWi變換成(Ci*Kh*Kw)*Ho*Wo的臨時矩陣col_buffer_。

之後調用caffe_cpu_gemm，每次計算一部分輸出，如果group_爲1，則一次計算完：output（Co*（Ho*Wo））=1* weights（Co*（Ci*Kh*Kw））* col_buff（（Ci*Kh*Kw）*（Ho*Wo）） + 0* output

 1     template <typename Dtype>  // Convolution of one batch item as a matrix product: output = weights * col_buff, computed group by group
 2     void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
 3         const Dtype* weights, Dtype* output, bool skip_im2col) {  // per the author, declared with the default: bool skip_im2col = false
 4         const Dtype* col_buff = input;
 5         if (!is_1x1_) {  // not a 1*1 convolution; otherwise the input is used directly as col_buff
 6             if (!skip_im2col) 
 7             {
 8                 // Calls conv_im2col_cpu (from base_conv_layer.hpp, per the author) to expand the Ci*Hi*Wi input into a temporary (Ci*Kh*Kw)*(Ho*Wo) buffer
 9                 // The caller ConvolutionLayer<Dtype>::Forward_cpu invokes this function once per batch item, so no batch dimension is handled here
10                 conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); 
11             }
12             col_buff = col_buffer_.cpu_data();
13         }
14         for (int g = 0; g < group_; ++g) {  // one GEMM per group (the author notes group_ defaults to 1)
15             caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_,  // M: output channels per group (Co when group_ is 1)
16                 conv_out_spatial_dim_, kernel_dim_,    // N: Ho*Wo    //  K: Ci*Kh*Kw of the kernel
17                 (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
18                 (Dtype)0., output + output_offset_ * g);  // beta = 0: the output is overwritten, not accumulated
19         }
20     }

3. conv_im2col_cpu

該函數爲內聯函數，對im2col_cpu進行了封裝，方便調用，如下：

 1         inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {  // Inline wrapper: picks the 2-D im2col_cpu or the N-D im2col_nd_cpu and forwards the layer's parameters
 2             if (!force_nd_im2col_ && num_spatial_axes_ == 2) {  // 2-D case, unless the N-D path is forced
 3                 im2col_cpu(data, conv_in_channels_,
 4                     conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],  // passed as height, width
 5                     kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],  // passed as kernel_h, kernel_w
 6                     pad_.cpu_data()[0], pad_.cpu_data()[1],  // passed as pad_h, pad_w
 7                     stride_.cpu_data()[0], stride_.cpu_data()[1],  // passed as stride_h, stride_w
 8                     dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);  // passed as dilation_h, dilation_w, data_col
 9             }
10             else {  // any other number of spatial axes, or N-D forced
11                 im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
12                     col_buffer_shape_.data(), kernel_shape_.cpu_data(),
13                     pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);
14             }
15         }

4. im2col_cpu

該函數用於將圖像轉換成卷積所需的列格式。a中黑色實線方框中爲特徵（或像素），虛線中爲邊界填充的0，紅色虛線框爲3*3的卷積核大小。如對於a所示的7*9輸入圖像（爲方便b中的顯示，因而a中值爲1—63），四個邊界各填充一個0後，通過該函數，得到的col格式如b所示，其中紅色虛線爲a中的位置對應的列格式的像素。b中…代表依次遞增的5個特徵。可以認爲b中矩陣爲一個kernel_h*kernel_w*output_h*output_w的行向量，也可以認爲是一個（kernel_h*kernel_w）*（output_h*output_w）的2維的矩陣（每一行的長度爲output_h*output_w）。通過這種方式得到的col格式數據，與卷積核可通過矩陣相乘，提升運算速度。

該函數代碼如下。其中output_rows的for循環對應b中的藍色箭頭範圍，output_col的for循環對應b中的橙色半框範圍。

 1     template <typename Dtype>  // Rearranges an image (channels x height x width) into column form so that convolution becomes a matrix product
 2     void im2col_cpu(const Dtype* data_im, const int channels,  // channels: number of input channels
 3         const int height, const int width, const int kernel_h, const int kernel_w,
 4         const int pad_h, const int pad_w,  // zero padding added at the borders (height, width)
 5         const int stride_h, const int stride_w,   // stride (height, width)
 6         const int dilation_h, const int dilation_w, // kernel dilation (height, width); the author notes a default of 1. Dilation inserts zeros between kernel elements // https://blog.csdn.net/wangyuxi__/article/details/83003357
 7         Dtype* data_col) {  //  output buffer: the loops below write channels*kernel_h*kernel_w rows of output_h*output_w values each; one row holds the values seen by one kernel element at every output position
 8         const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;  // output feature height
 9         const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;  // output feature width
10         const int channel_size = height * width;  // number of elements in one input channel
11         for (int channel = channels; channel--; data_im += channel_size)   // after each iteration data_im advances by one channel
12         {  
13             for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) 
14             {
15                 for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) 
16                 {
17                     int input_row = -pad_h + kernel_row * dilation_h;  // input row read by this kernel element for the first output row
18                     for (int output_rows = output_h; output_rows; output_rows--)  // one iteration per output row
19                     {
20                         if (!is_a_ge_zero_and_a_lt_b(input_row, height))   // per the helper's name: input_row < 0 or input_row >= height, i.e. the row lies in the padding
21                         { 
22                             for (int output_cols = output_w; output_cols; output_cols--)  // zero-fill the whole output row
23                             {
24                                 *(data_col++) = 0;
25                             }
26                         }
27                         else {   // the row lies inside the input
28                             int input_col = -pad_w + kernel_col * dilation_w;  // input column read by this kernel element for the first output column
29                             for (int output_col = output_w; output_col; output_col--)  // one iteration per output column
30                             {
31                                 if (is_a_ge_zero_and_a_lt_b(input_col, width))    // the column lies inside the input
32                                 {  
33                                     *(data_col++) = data_im[input_row * width + input_col];  // copy the input value into data_col
34                                 }
35                                 else   // the column lies in the padding
36                                 {  
37                                     *(data_col++) = 0;
38                                 }
39                                 input_col += stride_w;  // move to the input column of the next output column
40                             }
41                         }
42                         input_row += stride_h;  // move to the input row of the next output row
43                     }
44                 }
45             }
46         }
47     }

5. BaseConvolutionLayer<Dtype>::forward_cpu_bias

該函數爲output =1*bias（C*1）* bias_multiplier_（1*（H*W））+ 1*output。其中C爲輸出特徵的通道數No，H爲特徵高Ho，W爲特徵寬Wo，最終得到某個batch中CoHoWo的特徵。

1     template <typename Dtype>  // Adds the bias to one batch item: output = 1 * bias * bias_multiplier_ + 1 * output
2     void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
3         const Dtype* bias) {
4         caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,   // M: number of output channels No
5             out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),  // N: Wo*Ho, K: 1, so bias acts as an No x 1 matrix and bias_multiplier_ as 1 x (Wo*Ho)
6             (Dtype)1., output);  // beta = 1: the product is accumulated onto the existing output
7     }

bias_multiplier_爲1*（Wo*Ho）的向量，在void BaseConvolutionLayer<Dtype>::Reshape中將其所有的值均設置爲1：

1         out_spatial_dim_ = top[0]->count(first_spatial_axis);  // Wo*Ho
2         if (bias_term_) {  // the multiplier is only set up when the layer has a bias
3             vector<int> bias_multiplier_shape(1, out_spatial_dim_);  // one-element shape vector holding out_spatial_dim_
4             bias_multiplier_.Reshape(bias_multiplier_shape);
5             caffe_set(bias_multiplier_.count(), Dtype(1),  // bias_multiplier_ is a 1 x (Wo*Ho) vector with every element set to 1
6                 bias_multiplier_.mutable_cpu_data());
7         }

6. caffe_cpu_gemm

該函數調用cblas_sgemm，實現矩陣相乘：

 1 template<>  // float specialization: forwards to cblas_sgemm using row-major storage
 2 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
 3     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
 4     const float alpha, const float* A, const float* B, const float beta,
 5     float* C) {
 6   int lda = (TransA == CblasNoTrans) ? K : M;  // leading dimension of A: K when A is not transposed, otherwise M
 7   int ldb = (TransB == CblasNoTrans) ? N : K;  // leading dimension of B: N when B is not transposed, otherwise K
 8   cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
 9       ldb, beta, C, N);  // N is passed as the leading dimension of C
10 }

cblas_sgemm具體見：http://www.cnblogs.com/darkknightzh/p/5553336.html

1. Conv in caffe
2. matlab中conv、filter的使用
3. 深度學習中的卷積(conv)和空洞卷積(dilated conv)
4. Intel Caffe 與原生Caffe
5. Deformable Conv
6. deformable conv
7. Deformable CONV
8. matlab中conv和filter的區別
9. mobile net 的 depthwise conv 和 origin conv 的對比
10. conv優化
更多相關文章...
• MyBatis的工作原理 - MyBatis教程
• Spring中Bean的作用域 - Spring教程
• C# 中 foreach 遍歷的用法
• Scala 中文亂碼解決

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。