再次梳理Faster RCNN的流程

为了与《Faster R-CNN 入坑之源码阅读》结合起来，今天又再次查看了源码。这里的代码用的是旧版的 object detection API。

Faster RCNN是从基模型提取特征，之后再提取box location 和classes scores。

今天就要好好理一理这个流程了。代码超长。

def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              spatial_squeeze=True,
              store_non_strided_activations=False,
              reuse=None,
              scope=None):
  """Builds a ResNet v1 network from a list of block descriptions.

  The network is: optional root block (conv1 + pool1) -> stacked blocks ->
  optional global average pooling -> optional 1x1 conv logits layer with
  softmax predictions.

  Args:
    inputs: Input tensor fed to the first layer. Presumably 4-D
      [batch, height, width, channels], since axes [1, 2] are averaged /
      squeezed below -- confirm against callers.
    blocks: Block descriptions forwarded to resnet_utils.stack_blocks_dense.
    num_classes: If truthy, a 1x1 conv 'logits' layer with this many outputs
      and a softmax 'predictions' end point are added.
    is_training: Forwarded to slim.batch_norm through an arg_scope. If None,
      no batch_norm arg_scope is installed (NoOpScope is used instead).
    global_pool: If true, the block output is averaged over axes [1, 2]
      (dimensions kept).
    output_stride: If not None and include_root_block is true, must be a
      multiple of 4; it is divided by 4 before being handed to
      stack_blocks_dense.
    include_root_block: If true, applies the 7x7/64 stride-2 'conv1' and the
      3x3 stride-2 'pool1' before the blocks.
    spatial_squeeze: If true (and num_classes is truthy), axes [1, 2] of the
      logits are squeezed away.
    store_non_strided_activations: Forwarded to stack_blocks_dense.
    reuse: Forwarded to tf.variable_scope.
    scope: Variable scope name; 'resnet_v1' is the default name.

  Returns:
    A tuple (net, end_points): the final tensor, and a dict of the outputs
    gathered in the end-points collection plus the entries added below
    ('global_pool', '<scope>/logits', '<scope>/spatial_squeeze',
    'predictions'), depending on the flags.

  Raises:
    ValueError: If include_root_block is true and output_stride is not None
      and not a multiple of 4.
  """
  with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Every conv2d / bottleneck / stack_blocks_dense output created inside
    # this scope is recorded in the end-points collection.
    with slim.arg_scope([slim.conv2d, bottleneck,
                         resnet_utils.stack_blocks_dense],
                        outputs_collections=end_points_collection):
      # NoOpScope is not defined in this excerpt; it is used as a context
      # manager when is_training is None.
      with (slim.arg_scope([slim.batch_norm], is_training=is_training)
            if is_training is not None else NoOpScope()):
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            # conv1 (stride 2) and pool1 (stride 2) below already contribute
            # a factor of 4, so only the remainder is passed to the blocks.
            # NOTE(review): under Python 3 true division this yields a float
            # (e.g. 16 -> 4.0); confirm stack_blocks_dense tolerates that.
            output_stride /= 4
          net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
          net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride,
                                              store_non_strided_activations)
        # Convert end_points_collection into a dictionary of end_points.
        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)

        if global_pool:
          # Global average pooling.
          net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
          end_points['global_pool'] = net
        if num_classes:
          # Logits are a plain 1x1 convolution: no activation, no normalizer.
          net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                            normalizer_fn=None, scope='logits')
          end_points[sc.name + '/logits'] = net
          if spatial_squeeze:
            net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
            end_points[sc.name + '/spatial_squeeze'] = net
          end_points['predictions'] = slim.softmax(net, scope='predictions')
        return net, end_points
# Attribute attached to the function object after its definition.
resnet_v1.default_image_size = 224


def resnet_v1_50(inputs,
                 num_classes=None,
                 is_training=True,
                 global_pool=True,
                 output_stride=None,
                 spatial_squeeze=True,
                 store_non_strided_activations=False,
                 reuse=None,
                 scope='resnet_v1_50'):
  """ResNet-50 model of [1]. See resnet_v1() for arg and return description.

  Defines four blocks ('block1'..'block4') with base depths 64/128/256/512,
  3/4/6/3 units and strides 2/2/2/1, then delegates to resnet_v1() with
  include_root_block=True. Returns whatever resnet_v1() returns, i.e. the
  (net, end_points) tuple.
  """
  # resnet_v1_block is defined outside this excerpt.
  blocks = [
      resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
      resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
      resnet_v1_block('block3', base_depth=256, num_units=6, stride=2),
      resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
  ]
  return resnet_v1(inputs, blocks, num_classes, is_training,
                   global_pool=global_pool, output_stride=output_stride,
                   include_root_block=True, spatial_squeeze=spatial_squeeze,
                   store_non_strided_activations=store_non_strided_activations,
                   reuse=reuse, scope=scope)

所以说在resnet_v1_50返回的就是net 和 end_points

在特征提取器里面

def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Runs the backbone and returns its 'block3' end point.

    Args:
      preprocessed_inputs: A rank-4 tensor; dimensions 1 and 2 (called height
        and width in the assert message) must each be at least 33 at run
        time.
      scope: String prefix used to build the end-point key
        `scope + '/<architecture>/block3'`.

    Returns:
      The entry of the backbone's end-points dict stored under that key.

    Raises:
      ValueError: If the static rank of `preprocessed_inputs` is not 4.
    """
    if len(preprocessed_inputs.get_shape().as_list()) != 4:
      raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a '
                       'tensor of shape %s' % preprocessed_inputs.get_shape())
    # Run-time (dynamic shape) check; enforced through the control dependency
    # below.
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    with tf.control_dependencies([shape_assert]):
      # Disables batchnorm for fine-tuning with smaller batch sizes.
      # TODO: Figure out if it is needed when image batch size is bigger.
      with slim.arg_scope(
          resnet_utils.resnet_arg_scope(
              batch_norm_epsilon=1e-5,
              batch_norm_scale=True,
              weight_decay=self._weight_decay)):
        with tf.variable_scope(
            self._architecture, reuse=self._reuse_weights) as var_scope:
          # The first return value (the final net) is discarded; only the
          # end-points dict is used. No classifier head, no global pooling,
          # and is_training is hard-coded to False.
          _, activations = self._resnet_model(
              preprocessed_inputs,
              num_classes=None,
              is_training=False,
              global_pool=False,
              output_stride=self._first_stage_features_stride,
              spatial_squeeze=False,
              scope=var_scope)

    # '%' binds tighter than '+': key is scope + '/<architecture>/block3'.
    handle = scope + '/%s/block3' % self._architecture
    return activations[handle]

这里的activations就是resnet模型返回的end_points字典（不是net）。以resnet_v1_50为例，activations[handle]取出的是键为 scope + '/resnet_v1_50/block3' 的那一项，也就是block3的输出特征图。

在特征提取之后

def _predict_rpn_proposals(self, rpn_box_predictor_features):
    """Runs the first-stage box predictor on the RPN features.

    Args:
      rpn_box_predictor_features: Feature tensor passed unchanged to
        `self._first_stage_box_predictor.predict`.

    Returns:
      A tuple (box_encodings, objectness_predictions_with_background), where
      box_encodings is the predictor's BOX_ENCODINGS output with axis 2
      squeezed away and the second element is its
      CLASS_PREDICTIONS_WITH_BACKGROUND output.

    Raises:
      RuntimeError: If the anchor generator reports anchors-per-location for
        anything other than exactly one feature map.
    """
    num_anchors_per_location = (
        self._first_stage_anchor_generator.num_anchors_per_location())
    if len(num_anchors_per_location) != 1:
      raise RuntimeError('anchor_generator is expected to generate anchors '
                         'corresponding to a single feature map.')
    box_predictions = self._first_stage_box_predictor.predict(
        rpn_box_predictor_features,
        num_anchors_per_location[0],
        scope=self.first_stage_box_predictor_scope)

    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    objectness_predictions_with_background = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    # Only the box encodings are squeezed (axis 2); the objectness scores are
    # returned as produced by the predictor.
    return (tf.squeeze(box_encodings, axis=2),
            objectness_predictions_with_background)

这里的_first_stage_box_predictor调用的是box_predictor.py的ConvolutionalBoxPredictor

def _predict(self, image_features, num_predictions_per_location):
    """Predicts box encodings and class scores with convolutional heads.

    Args:
      image_features: Feature map tensor; its static shape is queried for
        depth, batch size, height and width.
      num_predictions_per_location: Number of predictions made at each
        spatial location of the feature map.

    Returns:
      A dict with
        BOX_ENCODINGS: tensor reshaped to
          [batch, -1, 1, self._box_code_size], and
        CLASS_PREDICTIONS_WITH_BACKGROUND: tensor reshaped to
          [batch, -1, self.num_classes + 1].
    """
    features_depth = static_shape.get_depth(image_features.get_shape())
    # Clamp the intermediate depth to [self._min_depth, self._max_depth].
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor.
      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net = slim.conv2d(
              net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
      # The two prediction heads are plain convolutions: no activation and no
      # normalizer.
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')
        # Dropout is applied after the box head has consumed `net`, so only
        # the class head sees the dropped-out features.
        if self._use_dropout:
          net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
        class_predictions_with_background = slim.conv2d(
            net, num_predictions_per_location * num_class_slots,
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')
        if self._apply_sigmoid_to_scores:
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

    batch_size = static_shape.get_batch_size(image_features.get_shape())
    if batch_size is None:
      # Batch size unknown statically: leave it as -1 and compute the number
      # of predictions from the static height and width instead.
      # NOTE(review): this presumably requires height and width to be known
      # statically -- confirm.
      features_height = static_shape.get_height(image_features.get_shape())
      features_width = static_shape.get_width(image_features.get_shape())
      flattened_predictions_size = (features_height * features_width *
                                    num_predictions_per_location)
      box_encodings = tf.reshape(
          box_encodings,
          [-1, flattened_predictions_size, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [-1, flattened_predictions_size, num_class_slots])
    else:
      # Batch size known statically: let reshape infer the prediction count.
      box_encodings = tf.reshape(
          box_encodings, [batch_size, -1, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background, [batch_size, -1, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}

这里返回了box_encodings和class_predictions_with_background。

在对box进行预测之后，又对flattened_proposal_feature_maps进行了一次特征提取。

box_classifier_features = (
        self._feature_extractor.extract_box_classifier_features(
            flattened_proposal_feature_maps,
            scope=self.second_stage_feature_extractor_scope))

看这里，其实这里没改变特征图的尺寸。

def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Applies the 'block4' bottleneck stack to the proposal feature maps.

    Args:
      proposal_feature_maps: Tensor passed to resnet_utils.stack_blocks_dense.
      scope: Not referenced in this body; the variable scope used is
        `self._architecture`.

    Returns:
      The output of stack_blocks_dense for a single block named 'block4' made
      of three bottleneck units (depth 2048, depth_bottleneck 512, stride 1).
    """
    with tf.variable_scope(self._architecture, reuse=self._reuse_weights):
      with slim.arg_scope(
          resnet_utils.resnet_arg_scope(
              batch_norm_epsilon=1e-5,
              batch_norm_scale=True,
              weight_decay=self._weight_decay)):
        # batch_norm is always built with is_training=False here.
        with slim.arg_scope([slim.batch_norm], is_training=False):
          # Every unit uses stride 1, which is consistent with the author's
          # observation that the spatial size is left unchanged.
          blocks = [
              resnet_utils.Block('block4', resnet_v1.bottleneck, [{
                  'depth': 2048,
                  'depth_bottleneck': 512,
                  'stride': 1
              }] * 3)
          ]
          proposal_classifier_features = resnet_utils.stack_blocks_dense(
              proposal_feature_maps, blocks)
    return proposal_classifier_features

在测试文件里面。

 def test_extract_box_classifier_features_returns_expected_size(self):
    """Checks the second-stage feature shape for a [3, 7, 7, 1024] input.

    The expected output shape is [3, 7, 7, 2048]: same batch and spatial
    size, 2048 channels.
    """
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    proposal_feature_maps = tf.random_uniform(
        [3, 7, 7, 1024], maxval=255, dtype=tf.float32)
    proposal_classifier_features = (
        feature_extractor.extract_box_classifier_features(
            proposal_feature_maps, scope='TestScope'))
    # Dynamic shape, evaluated in the session below.
    features_shape = tf.shape(proposal_classifier_features)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [3, 7, 7, 2048])

之后就是fast rcnn的精确坐标预测，在_mask_rcnn_box_predictor里面。

box_predictions = self._mask_rcnn_box_predictor.predict(
        box_classifier_features,
        num_predictions_per_location=1,
        scope=self.second_stage_box_predictor_scope)
    refined_box_encodings = tf.squeeze(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    class_predictions_with_background = tf.squeeze(box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1)

    absolute_proposal_boxes = ops.normalized_to_image_coordinates(
        proposal_boxes_normalized, image_shape, self._parallel_iterations)

    prediction_dict = {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
        class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
    }

还在box_predictor里面。

def _predict(self, image_features, num_predictions_per_location):
    """Predicts per-class box encodings, class scores and optional masks.

    The features are first averaged over axes [1, 2] and flattened, so the
    two fully connected heads do not depend on the spatial size of
    `image_features`.

    Args:
      image_features: Feature tensor; axes 1 and 2 are averaged out before
        the fully connected heads. The un-averaged tensor is used for the
        optional mask branch.
      num_predictions_per_location: Must be 1.

    Returns:
      A dict with
        BOX_ENCODINGS: tensor reshaped to
          [-1, 1, self._num_classes, self._box_code_size],
        CLASS_PREDICTIONS_WITH_BACKGROUND: tensor reshaped to
          [-1, 1, self._num_classes + 1],
      and, when self._predict_instance_masks is set,
        MASK_PREDICTIONS: the mask conv output transposed with
          perm=[0, 3, 1, 2] and expanded with a new axis 1.

    Raises:
      ValueError: If num_predictions_per_location is not 1.
    """
    if num_predictions_per_location != 1:
      raise ValueError('Currently FullyConnectedBoxPredictor only supports '
                       'predicting a single box per class per location.')
    # Spatial average over axes [1, 2]; this is what makes the heads
    # independent of the incoming feature map size.
    spatial_averaged_image_features = tf.reduce_mean(image_features, [1, 2],
                                                     keep_dims=True,
                                                     name='AvgPool')
    flattened_image_features = slim.flatten(spatial_averaged_image_features)
    if self._use_dropout:
      flattened_image_features = slim.dropout(flattened_image_features,
                                              keep_prob=self._dropout_keep_prob,
                                              is_training=self._is_training)
    with slim.arg_scope(self._fc_hyperparams):
      # One box encoding per class (no background slot) ...
      box_encodings = slim.fully_connected(
          flattened_image_features,
          self._num_classes * self._box_code_size,
          activation_fn=None,
          scope='BoxEncodingPredictor')
      # ... and one score per class plus one extra slot.
      class_predictions_with_background = slim.fully_connected(
          flattened_image_features,
          self._num_classes + 1,
          activation_fn=None,
          scope='ClassPredictor')
    box_encodings = tf.reshape(
        box_encodings, [-1, 1, self._num_classes, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background, [-1, 1, self._num_classes + 1])

    predictions_dict = {
        BOX_ENCODINGS: box_encodings,
        CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background
    }

    if self._predict_instance_masks:
      with slim.arg_scope(self._conv_hyperparams):
        # Transposed conv with 2x2 kernel and stride 2 on the original
        # (non-averaged) features.
        upsampled_features = slim.conv2d_transpose(
            image_features,
            num_outputs=self._mask_prediction_conv_depth,
            kernel_size=[2, 2],
            stride=2)
        # NOTE(review): this uses `self.num_classes` while the heads above
        # use `self._num_classes`; presumably a property over the same
        # value -- confirm.
        mask_predictions = slim.conv2d(upsampled_features,
                                       num_outputs=self.num_classes,
                                       activation_fn=None,
                                       kernel_size=[1, 1])
        # Move the last axis to position 1, then insert a new axis 1.
        instance_masks = tf.expand_dims(tf.transpose(mask_predictions,
                                                     perm=[0, 3, 1, 2]),
                                        axis=1,
                                        name='MaskPredictor')
      predictions_dict[MASK_PREDICTIONS] = instance_masks
    return predictions_dict

这里的box_encodings和class_predictions_with_background，就是最后的预测结果。

更新

在nas_feature_extractor中

  def test_extract_box_classifier_features_returns_expected_size(self):
    """Checks the second-stage feature shape for a [2, 17, 17, 1088] input.

    The expected output shape is [2, 9, 9, 4032], i.e. the spatial size
    shrinks from 17 to 9 for this extractor.
    """
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    proposal_feature_maps = tf.random_uniform(
        [2, 17, 17, 1088], maxval=255, dtype=tf.float32)
    proposal_classifier_features = (
        feature_extractor.extract_box_classifier_features(
            proposal_feature_maps, scope='TestScope'))
    # Dynamic shape, evaluated in the session below.
    features_shape = tf.shape(proposal_classifier_features)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [2, 9, 9, 4032])

在resnet_v1_feature_extractor中

  def test_extract_box_classifier_features_returns_expected_size(self):
    """Checks the second-stage feature shape for a [3, 7, 7, 1024] input.

    The expected output shape is [3, 7, 7, 2048]: the spatial size is
    unchanged for this extractor.
    """
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    proposal_feature_maps = tf.random_uniform(
        [3, 7, 7, 1024], maxval=255, dtype=tf.float32)
    proposal_classifier_features = (
        feature_extractor.extract_box_classifier_features(
            proposal_feature_maps, scope='TestScope'))
    # Dynamic shape, evaluated in the session below.
    features_shape = tf.shape(proposal_classifier_features)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [3, 7, 7, 2048])

可以看到，在两个test_extract_box_classifier_features_returns_expected_size测试中，两个特征提取模型的输出尺寸不一样：nas被最大池化（stride=2），空间尺寸从17变成了9；而resnet没有进行池化，空间尺寸保持7不变。本人在这里纠结了好一阵子。。。。。哎。。。。。

好几次错过了最重要的点。。。。

关于第二阶段中_extract_box_classifier_features输出的尺寸问题：在查看源码的过程中，由于忽略了最后一层的全连接，所以一直纠结_extract_box_classifier_features到底是stride=2还是stride=1。输入到_extract_box_classifier_features的，是用box在feature map上剪裁出来的feature map，它们的尺寸大小都一样；_extract_box_classifier_features只是对剪裁过的feature map进行编码。记住，剪裁过的feature map最后有两个输出：一个是box location，一个是classes score。box_predictor的作用就是接住_extract_box_classifier_features的输出，并产生box location和classes score这两个输出。而box_predictor在全连接之前会先对空间维度求平均（tf.reduce_mean(image_features, [1, 2])）再拉平，所以输入特征图的空间尺寸是7x7还是9x9并不影响全连接层。

假如要自定义一个feature extractor，在_extract_box_classifier_features里改变特征图的尺寸，并不会影响最后全连接层的输出。也就是说，可以大胆修改feature extractor的输出尺寸，其他的地方不需要动，也可以构建一个新的模型，这估计也是这个API比较好的地方。

更新10.30

_extract_box_classifier_features和_extract_proposal_features的输出尺寸，对faster rcnn其实没多大影响。

只需要记住的是_extract_proposal_features的尺寸是原图的1/8 或者 1/16.
_extract_box_classifier_features的输出尺寸可以任意。这里的“任意”是指构建新模型的时候可以任意选择，送入的原始图片尺寸还是要固定的。_extract_box_classifier_features的输出可以是[None, 7, 7, 1024]，也可以是[None, 28, 28, 128]等等，因为它的输出在box_predictor中会先在空间维度上求平均、再被拉平，然后才送入全连接层。

现在大胆使用非Google官方提供的feature_extractor，虽然效果没有官方提供的那么好。

object_detectionAPI源码阅读笔记（14-重新整理faster rcnn流程）

object_detectionAPI源码阅读笔记（14-重新整理faster rcnn流程）

再次梳理Faster RCNN的流程

更新

更新10.30

推荐阅读更多精彩内容