Caffe源碼-im2col操作

時間 2020-01-13

標籤 caffe 源碼 im2col col 简体版

原文鏈接

目錄

@(Caffe源碼-im2col操作)

im2col簡介

caffe的卷積操作中使用im2col來加速。im2col將卷積核中的每個點在圖像上的對應點全都提取出來按行排列，得到一個矩陣，這樣就將卷積操作轉化爲矩陣運算。


如上圖所示，假設輸入圖像的形狀爲channels=1, height=width=5，並且pad_w=pad_h=1, kernel_h=kernel_w=3, stride_h=stride_w=2, dilation_w=dilation_h=1。左側圖中藍色爲padding補充的邊界，值均爲0，綠色爲實際圖像的數據。其中卷積核中\(k_{00}\)位置在整個卷積操作中共計算了output_h*output_w=9次，每次的位置在左側圖中用黑色實心圓標註出來。而im2col操作即是將卷積核上的每個點的這些對應位置上的值都提取出來，按照右側黃色方格的形式存放起來。這樣卷積操作可簡單地通過將卷積核（中間的紅色方格）展成一個向量，然後與右側的黃色方格矩陣中的每一列點乘來實現。更詳細的說明可查看後面列出來的參考博客。
與im2col對應的是col2im操作，即是將矩陣還原成卷積前的圖像的形狀，不過caffe代碼中的col2im_cpu()函數還稍微有些改動。

im2col.cpp源碼

// Range test: returns true exactly when 0 <= a < b, using a single
// comparison instead of two.
//
// Both operands are reinterpreted as unsigned. A negative `a` becomes a
// value above 0x7FFF..., while `b` (expected by callers to be a non-negative
// signed size) stays below 0x8000..., so a negative `a` can never compare
// less than `b`.
inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
  const unsigned value = static_cast<unsigned>(a);
  const unsigned limit = static_cast<unsigned>(b);
  return value < limit;
}

// Unrolls a single image (num = 1) into the column matrix used to express
// convolution as a matrix product.
//
//   data_im                 input image, indexed as [channel][row][col]
//   channels/height/width   input image dimensions
//   kernel_h/kernel_w       kernel height / width
//   pad_h/pad_w             zero padding added on each side of the image
//   stride_h/stride_w       step of the kernel between output positions
//   dilation_h/dilation_w   spacing between kernel taps
//   data_col                output; written sequentially, output_h * output_w
//                           values for every (channel, kernel_row, kernel_col)
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    Dtype* data_col) {
  // Footprint of the dilated kernel along each axis.
  const int extent_h = dilation_h * (kernel_h - 1) + 1;
  const int extent_w = dilation_w * (kernel_w - 1) + 1;
  // Number of kernel placements along each axis (size of the output map).
  const int output_h = (height + 2 * pad_h - extent_h) / stride_h + 1;
  const int output_w = (width + 2 * pad_w - extent_w) / stride_w + 1;
  const int channel_size = height * width;  // elements in one input channel

  // The counter only counts iterations; data_im itself is advanced to the
  // next channel at the bottom of the loop.
  int channels_left = channels;
  while (channels_left-- != 0) {
    for (int k_row = 0; k_row < kernel_h; ++k_row) {
      // Image row hit by this kernel row at the first output position.
      const int first_row = k_row * dilation_h - pad_h;
      for (int k_col = 0; k_col < kernel_w; ++k_col) {
        // Image column hit by this kernel column at the first output position.
        const int first_col = k_col * dilation_w - pad_w;

        // Emit the output_h * output_w image values that this kernel tap
        // touches, row by row; taps that land in the padding produce 0.
        int row = first_row;
        for (int rows_left = output_h; rows_left != 0; --rows_left) {
          if (is_a_ge_zero_and_a_lt_b(row, height)) {
            const Dtype* src_row = data_im + row * width;
            int col = first_col;
            for (int cols_left = output_w; cols_left != 0; --cols_left) {
              // Columns outside [0, width) are padding.
              *data_col = is_a_ge_zero_and_a_lt_b(col, width) ? src_row[col]
                                                             : Dtype(0);
              ++data_col;
              col += stride_w;
            }
          } else {
            // The whole image row is padding: all output_w entries are 0.
            for (int cols_left = output_w; cols_left != 0; --cols_left) {
              *data_col = 0;
              ++data_col;
            }
          }
          row += stride_h;
        }
      }
    }
    data_im += channel_size;
  }
}

// Explicit instantiation: emit the float and double versions of im2col_cpu
// in this translation unit.
template void im2col_cpu<float>(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    float* data_col);
template void im2col_cpu<double>(const double* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    double* data_col);

// Shared implementation of the N-dimensional im2col and col2im transforms.
//
//   data_input        source buffer (image for im2col, column buffer for col2im)
//   im2col            true: image -> columns; false: columns -> image
//   num_spatial_axes  number of axes the kernel slides over
//   im_shape          im_shape[0] is the channel count, im_shape[1 + i] is the
//                     image extent along spatial axis i
//   col_shape         col_shape[0] is treated as channels * kernel size,
//                     col_shape[1 + i] is the output extent along spatial axis i
//   kernel_shape, pad, stride, dilation
//                     per-spatial-axis arrays, indexed 0..num_spatial_axes-1
//   data_output       destination buffer (column buffer for im2col, image for
//                     col2im)
template <typename Dtype>
inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
    const int num_spatial_axes, const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, Dtype* data_output) {
  // col2im accumulates with += below, so the image buffer is cleared first.
  if (!im2col) {
    int im_size = im_shape[0];
    for (int i = 0; i < num_spatial_axes; ++i) {
      im_size *= im_shape[1 + i];   // total elements: channels * dim0 * dim1 * ...
    }
    // caffe_set is defined elsewhere; used here to fill the image with Dtype(0).
    caffe_set(im_size, Dtype(0), data_output);
  }
  // kernel_shape holds only the spatial extents; e.g. for a 2D convolution
  // num_spatial_axes is 2 and kernel_shape is {kernel_h, kernel_w}.
  int kernel_size = 1;
  for (int i = 0; i < num_spatial_axes; ++i) {
    kernel_size *= kernel_shape[i]; // taps in one channel of the kernel
  }
  // Number of rows in the column buffer; each row c_col corresponds to one
  // (image channel, kernel tap) pair.
  const int channels_col = col_shape[0];
  // d_offset: position of the current kernel tap along each spatial axis.
  vector<int> d_offset(num_spatial_axes, 0);
  // d_iter: current output position along each spatial axis.
  vector<int> d_iter(num_spatial_axes, 0);
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    // Loop over spatial axes in reverse order to compute a per-axis offset.
    // This decomposes c_col, starting from the last (fastest-varying) axis,
    // into the tap coordinates d_offset[...]; the channel part of c_col is
    // recovered separately below as c_col / kernel_size.
    int offset = c_col;
    for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
      if (d_i < num_spatial_axes - 1) {
        offset /= kernel_shape[d_i + 1];  // drop the axis already decoded
      }
      d_offset[d_i] = offset % kernel_shape[d_i]; // tap coordinate on axis d_i
    }
    // Visit every output position for this row of the column buffer. d_iter
    // is advanced like an odometer at the bottom of the loop body.
    for (bool incremented = true; incremented; ) {
      // Loop over spatial axes in forward order to compute the indices in the
      // image and column, and whether the index lies in the padding.
      // index_col ends up as
      //   (...((c_col * col_dim1 + d0) * col_dim2 + d1) * ... + ...)
      // i.e. the result of:
      // for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
      //   const int d = d_iter[d_i];
      //   index_col *= col_shape[d_i + 1];
      //   index_col += d;
      // }
      int index_col = c_col;

      int index_im = c_col / kernel_size;   // image channel encoded in c_col
      bool is_padding = false;
      for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
        const int d = d_iter[d_i];          // output position on axis d_i
        // Image coordinate on axis d_i touched by this tap at this position.
        const int d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
        // Out of range on any axis means the tap lies in the padding.
        is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
        index_col *= col_shape[d_i + 1];  // flatten the column-buffer index
        index_col += d;
        index_im *= im_shape[d_i + 1];    // flatten the image index
        index_im += d_im;   // only read below when is_padding is false
      }
      if (im2col) {         // image -> columns
        if (is_padding) {
          data_output[index_col] = 0;   // padding contributes zero
        } else {
          data_output[index_col] = data_input[index_im];
        }
      } else if (!is_padding) {  // col2im
        // Accumulate: an image element used by several taps/positions receives
        // the sum of all corresponding column entries.
        data_output[index_im] += data_input[index_col];
      }
      // Loop over spatial axes in reverse order to choose an index, like counting.
      // An axis that is at its last position wraps to 0 and the carry moves to
      // the previous axis; the first axis that can still advance is incremented.
      // If every axis wraps, incremented stays false, d_iter is all zeros
      // again, and the loop for this c_col ends.
      incremented = false;
      for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
        const int d_max = col_shape[d_i + 1];   // output extent on axis d_i
        // DCHECK_LT is defined elsewhere (presumably a debug-only check);
        // it asserts that the position is within the output extent.
        DCHECK_LT(d_iter[d_i], d_max);
        if (d_iter[d_i] == d_max - 1) {
          d_iter[d_i] = 0;                // last position on this axis: wrap
        } else {  // d_iter[d_i] < d_max - 1
          ++d_iter[d_i];                  // advance this axis
          incremented = true;             // another output position remains
          break;                          // no carry needed
        }
      }
    }  // while(incremented) {
  }  // for (int c = 0; c < channels_col; ++c) {
}

// N-dimensional image -> column transform. Forwards every argument to
// im2col_nd_core_cpu with the direction flag set to "im2col".
template <typename Dtype>
void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, Dtype* data_col) {
  im2col_nd_core_cpu(data_im, /*im2col=*/true, num_spatial_axes,
                     im_shape, col_shape, kernel_shape,
                     pad, stride, dilation, data_col);
}

// Explicit instantiation: emit the float and double versions of
// im2col_nd_cpu in this translation unit.
template void im2col_nd_cpu<float>(const float* data_im,
    const int num_spatial_axes,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, float* data_col);
template void im2col_nd_cpu<double>(const double* data_im,
    const int num_spatial_axes,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, double* data_col);

// Column matrix -> image. Scatters the column buffer back onto an image of
// the pre-convolution shape, summing every contribution.
//
//   data_col   column buffer, read sequentially; holds output_h * output_w
//              values for every (channel, kernel_row, kernel_col)
//   data_im    output image, indexed as [channel][row][col]; cleared first
//   remaining parameters: same geometry as for im2col_cpu
//
// Values are accumulated with +=, so an image element that took part in
// several kernel placements receives the sum of all its column entries.
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    Dtype* data_im) {
  // Start from an all-zero image because the loop below only accumulates.
  caffe_set(height * width * channels, Dtype(0), data_im);

  // Footprint of the dilated kernel along each axis.
  const int extent_h = dilation_h * (kernel_h - 1) + 1;
  const int extent_w = dilation_w * (kernel_w - 1) + 1;
  // Size of the convolution output map.
  const int output_h = (height + 2 * pad_h - extent_h) / stride_h + 1;
  const int output_w = (width + 2 * pad_w - extent_w) / stride_w + 1;
  const int channel_size = height * width;  // elements in one image channel

  // The three outer loops walk the first dimension of the column buffer
  // (channels * kernel_h * kernel_w entries) in order. The counter only
  // counts iterations; data_im is advanced to the next channel at the bottom.
  int channels_left = channels;
  while (channels_left-- != 0) {
    for (int k_row = 0; k_row < kernel_h; ++k_row) {
      // Image row hit by this kernel row at the first output position.
      const int first_row = k_row * dilation_h - pad_h;
      for (int k_col = 0; k_col < kernel_w; ++k_col) {
        // Image column hit by this kernel column at the first output position.
        const int first_col = k_col * dilation_w - pad_w;

        int row = first_row;
        for (int rows_left = output_h; rows_left != 0; --rows_left) {
          if (is_a_ge_zero_and_a_lt_b(row, height)) {
            Dtype* dst_row = data_im + row * width;
            int col = first_col;
            for (int cols_left = output_w; cols_left != 0; --cols_left) {
              // Entries that map into the padding are dropped.
              if (is_a_ge_zero_and_a_lt_b(col, width)) {
                dst_row[col] += *data_col;
              }
              ++data_col;
              col += stride_w;
            }
          } else {
            // The whole image row is padding: skip its output_w entries.
            data_col += output_w;
          }
          row += stride_h;
        }
      }
    }
    data_im += channel_size;
  }
}

// Explicit instantiation: emit the float and double versions of col2im_cpu
// in this translation unit.
template void col2im_cpu<float>(const float* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    float* data_im);
template void col2im_cpu<double>(const double* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    double* data_im);

// N-dimensional column -> image transform. Forwards every argument to
// im2col_nd_core_cpu with the direction flag set to "col2im".
template <typename Dtype>
void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, Dtype* data_im) {
  im2col_nd_core_cpu(data_col, /*im2col=*/false, num_spatial_axes,
                     im_shape, col_shape, kernel_shape,
                     pad, stride, dilation, data_im);
}

// Explicit instantiation: emit the float and double versions of
// col2im_nd_cpu in this translation unit.
template void col2im_nd_cpu<float>(const float* data_col,
    const int num_spatial_axes,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, float* data_im);
template void col2im_nd_cpu<double>(const double* data_col,
    const int num_spatial_axes,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    const int* dilation, double* data_im);

小結

注意代碼中col2im_cpu()函數與im2col_cpu()函數不是嚴格的逆操作。如果圖像的某個點在卷積時被多次使用過，那麼在矩陣轉爲圖像時該位置的圖像值同樣會被多次累加（應該是爲了方便計算卷積層反傳時的梯度，不過筆者還未看這部分），所以還原的圖像並不是真實的卷積前的圖像。im2col_nd_core_cpu()函數中也是如此。
im2col_nd_core_cpu()函數實現了高維卷積的數據轉矩陣操作。高維卷積中除了用於計算卷積值的那幾個維度（卷積核也在這些維度上移動），還有一個更高維的維度用於累加卷積核，類似於2維卷積中的channel維度。
im2col.cpp文件中的這幾個函數與caffe關聯較少，可自己寫個demo測試各個函數的功能以及單步調試，方便理解。