# faster-rcnn 筆記

# 2019-02-18,15點00
'''
下面是別人寫的原始筆記,我自己在上面補充了一些說明.

'''
#https://www.cnblogs.com/the-home-of-123/p/9747963.html




# 以voc數據集爲例,按照imdb的命名規則(voc_<year>_<split>),利用pascal_voc()函數生成不同的imdb

'''


# Register one lazy factory per (year, split) pair under the key 'voc_<year>_<split>'.
for year in ['2007', '2012']:
  for split in ['train', 'val', 'trainval', 'test']:
    name = 'voc_{}_{}'.format(year, split)  # e.g. year='2007', split='trainval' -> 'voc_2007_trainval'
    # The default arguments bind the current split/year to each lambda, so every
    # factory builds its own dataset instead of the last values of the loop.
    __sets[name] = (lambda split=split, year=year: pascal_voc(split, year))


def get_imdb(name):
  """Build and return the imdb (image database) registered under `name`.

  Raises:
    KeyError: if no factory has been registered for `name`.
  """
  if name in __sets:
    # Entries are zero-argument factories; call the one stored for this name.
    return __sets[name]()
  raise KeyError('Unknown dataset: {}'.format(name))

'''


# self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)  #數據庫路徑
# self._classes = ('__background__',  # always index 0, 訓練類別標籤,包含背景類
#                   'person')
#  # Default to roidb handler
# self._roidb_handler = self.gt_roidb #感興趣區域(ROI)數據庫
# self._salt = str(uuid.uuid4()) #??
# self._comp_id = 'comp4' # ??




















def _build_network(self, is_training=True):
    """Wire the whole detector together and return its three main outputs.

    The graph is built in this order: backbone features, anchors, region
    proposals, RoI pooling, the head-to-tail layers and finally the
    classification / box-regression outputs.

    Args:
      is_training: passed through unchanged to every sub-builder called here.

    Returns:
      The tuple (rois, cls_prob, bbox_pred) as produced by `_region_proposal`
      and `_region_classification`.

    Raises:
      NotImplementedError: if cfg.POOLING_MODE is anything other than 'crop'.
    """
    # Choose the initializer family once. The box-regression weights get a
    # smaller stddev (0.001) than all the other weights (0.01).
    make_initializer = (tf.truncated_normal_initializer if cfg.TRAIN.TRUNCATED
                        else tf.random_normal_initializer)
    initializer = make_initializer(mean=0.0, stddev=0.01)
    initializer_bbox = make_initializer(mean=0.0, stddev=0.001)

    # Run the feature-extraction backbone to get the shared feature map.
    net_conv = self._image_to_head(is_training)

    with tf.variable_scope(self._scope, self._scope):
      # Build the anchors for the image.
      self._anchor_component()
      # Region proposal network: produces the proposal boxes. The original
      # notes call this line the core of the whole network.
      rois = self._region_proposal(net_conv, is_training, initializer)
      # Region-of-interest pooling; only the 'crop' mode is implemented.
      if cfg.POOLING_MODE != 'crop':
        raise NotImplementedError
      # Pool every proposal out of the feature map into a uniform format.
      pool5 = self._crop_pool_layer(net_conv, rois, "pool5")

    fc7 = self._head_to_tail(pool5, is_training)

    with tf.variable_scope(self._scope, self._scope):
      # Region classification: the Fast R-CNN stage that classifies every
      # pooled region and regresses its box; this is the final prediction.
      cls_prob, bbox_pred = self._region_classification(
          fc7, is_training, initializer, initializer_bbox)

    self._score_summaries.update(self._predictions)

    return rois, cls_prob, bbox_pred



'''
下面是上面說的核心代碼的分析
'''

def _region_proposal(self, net_conv, is_training, initializer):
    """Build the region proposal network (RPN) head and return its proposals.

    A shared 3x3 convolution feeds two 1x1 convolution branches: one emits
    `self._num_anchors * 2` score channels, the other `self._num_anchors * 4`
    box-regression channels. The proposals are then produced by the proposal
    layer selected by `is_training` / `cfg.TEST.MODE`.

    Side effects: appends the 3x3 activation to `self._act_summaries` and
    stores the intermediate tensors in `self._predictions`.

    Args:
      net_conv: feature map coming from the backbone.
      is_training: toggles `trainable` on the layers and selects the
        training-time proposal path.
      initializer: weights initializer used by all three convolutions.

    Returns:
      The rois tensor.

    Raises:
      NotImplementedError: at test time, if cfg.TEST.MODE is neither 'nms'
        nor 'top'.
    """
    # One 3x3 convolution first; afterwards the head splits into two branches.
    rpn_feat = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training,
                           weights_initializer=initializer, scope="rpn_conv/3x3")
    self._act_summaries.append(rpn_feat)

    # Branch 1: scores. The [1, 1] kernel means every spatial position gets its
    # own group of self._num_anchors * 2 outputs, i.e. two scores per anchor.
    # Nothing in this layer spells out which channel belongs to which anchor;
    # per the original notes that association is learned end to end.
    # NOTE(review): the original notes read the two scores as background vs.
    # object and give the shape as (1, height, width, 18) for 9 anchors. These
    # are therefore not per-class scores, and the later NMS only ranks boxes
    # by how likely they are to be non-background. Confirm against the layers.
    score_map = slim.conv2d(rpn_feat, self._num_anchors * 2, [1, 1], trainable=is_training,
                            weights_initializer=initializer,
                            padding='VALID', activation_fn=None, scope='rpn_cls_score')

    # Change it so that the score has 2 as its channel size, then take the
    # softmax and the arg-max over each pair of scores.
    score_pairs = self._reshape_layer(score_map, 2, 'rpn_cls_score_reshape')
    prob_pairs = self._softmax_layer(score_pairs, "rpn_cls_prob_reshape")
    predicted_label = tf.argmax(tf.reshape(score_pairs, [-1, 2]), axis=1, name="rpn_cls_pred")

    # Bring the probabilities back to the self._num_anchors * 2 channel layout.
    prob_map = self._reshape_layer(prob_pairs, self._num_anchors * 2, "rpn_cls_prob")

    # Branch 2: box regression, four values per anchor.
    box_deltas = slim.conv2d(rpn_feat, self._num_anchors * 4, [1, 1], trainable=is_training,
                             weights_initializer=initializer,
                             padding='VALID', activation_fn=None, scope='rpn_bbox_pred')

    if is_training:
      # NOTE(review): per the original notes the proposal layer filters the
      # proposals by score and box, running NMS over the top N so that boxes
      # most likely to be background are dropped. Confirm in _proposal_layer.
      rois, roi_scores = self._proposal_layer(prob_map, box_deltas, "rois")
      rpn_labels = self._anchor_target_layer(score_map, "anchor")
      # Try to have a deterministic order for the computing graph, for reproducibility
      with tf.control_dependencies([rpn_labels]):
        rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
    elif cfg.TEST.MODE == 'nms':
      rois, _ = self._proposal_layer(prob_map, box_deltas, "rois")
    elif cfg.TEST.MODE == 'top':
      rois, _ = self._proposal_top_layer(prob_map, box_deltas, "rois")
    else:
      raise NotImplementedError

    # Publish the intermediate tensors under their usual keys.
    for key, tensor in (("rpn_cls_score", score_map),
                        ("rpn_cls_score_reshape", score_pairs),
                        ("rpn_cls_prob", prob_map),
                        ("rpn_cls_pred", predicted_label),
                        ("rpn_bbox_pred", box_deltas),
                        ("rois", rois)):
      self._predictions[key] = tensor

    return rois






























def _crop_pool_layer(self, bottom, rois, name):
    """Cut every RoI out of `bottom` with crop_and_resize, then 2x2 max-pool it.

    This plays the role of RoI pooling, built from TensorFlow's own
    `tf.image.crop_and_resize`.

    Args:
      bottom: feature map to crop from; dimensions 1 and 2 of its shape are
        taken as height and width.
      rois: 2-D tensor whose column 0 is used as the batch index of each box
        and whose columns 1-4 are used as x1, y1, x2, y2.
      name: variable scope under which the cropping ops are created.

    Returns:
      The max-pooled crops.

    NOTE(review): the box coordinates are divided by
    (feature size - 1) * self._feat_stride[0], which suggests the rois are
    expressed in input-image pixels rather than feature-map cells. Confirm
    against the proposal layers.
    """
    def roi_column(index, label):
      # One column of `rois`, kept 2-D so the columns can be concatenated.
      return tf.slice(rois, [0, index], [-1, 1], name=label)

    with tf.variable_scope(name):
      batch_ids = tf.squeeze(roi_column(0, "batch_id"), [1])

      # Get the normalized coordinates of bounding boxes: scale the
      # feature-map extent by the stride and divide the coordinates by it.
      feat_shape = tf.shape(bottom)
      height = (tf.to_float(feat_shape[1]) - 1.) * np.float32(self._feat_stride[0])
      width = (tf.to_float(feat_shape[2]) - 1.) * np.float32(self._feat_stride[0])

      x1 = roi_column(1, "x1") / width
      y1 = roi_column(2, "y1") / height
      x2 = roi_column(3, "x2") / width
      y2 = roi_column(4, "y2") / height

      # crop_and_resize takes relative boxes in (y1, x1, y2, x2) order.
      # Won't be back-propagated to rois anyway, but to save time the
      # gradient is stopped explicitly.
      bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))

      # Crop at twice the pooling size; the max-pool below shrinks it again
      # (presumably back to cfg.POOLING_SIZE; confirm slim's default stride).
      pre_pool_size = cfg.POOLING_SIZE * 2
      crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids),
                                       [pre_pool_size, pre_pool_size], name="crops")

    return slim.max_pool2d(crops, [2, 2], padding='SAME')

import tensorflow as tf
# help(tf.image.crop_and_resize)




def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
    """Add the final classification and box-regression layers on top of `fc7`.

    Two fully connected layers without activation are created: 'cls_score'
    with `self._num_classes` outputs and 'bbox_pred' with
    `self._num_classes * 4` outputs. The scores, their arg-max, their softmax
    and the box predictions are all stored in `self._predictions`.

    Args:
      fc7: input features for both fully connected layers.
      is_training: sets `trainable` on both layers.
      initializer: weights initializer of the classification layer.
      initializer_bbox: weights initializer of the box-regression layer.

    Returns:
      The tuple (cls_prob, bbox_pred).
    """
    # Options shared by both output layers.
    fc_options = dict(trainable=is_training, activation_fn=None)

    class_logits = slim.fully_connected(fc7, self._num_classes,
                                        weights_initializer=initializer,
                                        scope='cls_score', **fc_options)
    class_probs = self._softmax_layer(class_logits, "cls_prob")
    class_ids = tf.argmax(class_logits, axis=1, name="cls_pred")

    # Four box values per class.
    box_deltas = slim.fully_connected(fc7, self._num_classes * 4,
                                      weights_initializer=initializer_bbox,
                                      scope='bbox_pred', **fc_options)

    for key, tensor in (("cls_score", class_logits),
                        ("cls_pred", class_ids),
                        ("cls_prob", class_probs),
                        ("bbox_pred", box_deltas)):
      self._predictions[key] = tensor

    return class_probs, box_deltas
# View Code
# 相關文章
# 相關標籤/搜索