再次梳理Faster RCNN的流程
为与Faster R-CNN 入坑之源码阅读结合起来,今天又再次查看源码。这的代码是用的旧版的object detection api。
Faster RCNN是从基模型提取特征,之后再提取box location 和classes scores。
今天就要好好理一理这个流程了。代码超长。
def resnet_v1(inputs,
blocks,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
store_non_strided_activations=False,
reuse=None,
scope=None):
with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
end_points_collection = sc.original_name_scope + '_end_points'
with slim.arg_scope([slim.conv2d, bottleneck,
resnet_utils.stack_blocks_dense],
outputs_collections=end_points_collection):
with (slim.arg_scope([slim.batch_norm], is_training=is_training)
if is_training is not None else NoOpScope()):
net = inputs
if include_root_block:
if output_stride is not None:
if output_stride % 4 != 0:
raise ValueError('The output_stride needs to be a multiple of 4.')
output_stride /= 4
net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
net = resnet_utils.stack_blocks_dense(net, blocks, output_stride,
store_non_strided_activations)
# Convert end_points_collection into a dictionary of end_points.
end_points = slim.utils.convert_collection_to_dict(
end_points_collection)
if global_pool:
# Global average pooling.
net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
end_points['global_pool'] = net
if num_classes:
net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, scope='logits')
end_points[sc.name + '/logits'] = net
if spatial_squeeze:
net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
end_points[sc.name + '/spatial_squeeze'] = net
end_points['predictions'] = slim.softmax(net, scope='predictions')
return net, end_points
resnet_v1.default_image_size = 224
def resnet_v1_50(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
store_non_strided_activations=False,
reuse=None,
scope='resnet_v1_50'):
"""ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
resnet_v1_block('block3', base_depth=256, num_units=6, stride=2),
resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
]
return resnet_v1(inputs, blocks, num_classes, is_training,
global_pool=global_pool, output_stride=output_stride,
include_root_block=True, spatial_squeeze=spatial_squeeze,
store_non_strided_activations=store_non_strided_activations,
reuse=reuse, scope=scope)
所以说在resnet_v1_50返回的就是net 和 end_points
在特征提取器里面
def _extract_proposal_features(self, preprocessed_inputs, scope):
if len(preprocessed_inputs.get_shape().as_list()) != 4:
raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a '
'tensor of shape %s' % preprocessed_inputs.get_shape())
shape_assert = tf.Assert(
tf.logical_and(
tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
['image size must at least be 33 in both height and width.'])
with tf.control_dependencies([shape_assert]):
# Disables batchnorm for fine-tuning with smaller batch sizes.
# TODO: Figure out if it is needed when image batch size is bigger.
with slim.arg_scope(
resnet_utils.resnet_arg_scope(
batch_norm_epsilon=1e-5,
batch_norm_scale=True,
weight_decay=self._weight_decay)):
with tf.variable_scope(
self._architecture, reuse=self._reuse_weights) as var_scope:
_, activations = self._resnet_model(
preprocessed_inputs,
num_classes=None,
is_training=False,
global_pool=False,
output_stride=self._first_stage_features_stride,
spatial_squeeze=False,
scope=var_scope)
handle = scope + '/%s/block3' % self._architecture
return activations[handle]
这里的activations[handle]返回就是net['resnet_v1_50/block3']
在特征提取之后
def _predict_rpn_proposals(self, rpn_box_predictor_features):
num_anchors_per_location = (
self._first_stage_anchor_generator.num_anchors_per_location())
if len(num_anchors_per_location) != 1:
raise RuntimeError('anchor_generator is expected to generate anchors '
'corresponding to a single feature map.')
box_predictions = self._first_stage_box_predictor.predict(
rpn_box_predictor_features,
num_anchors_per_location[0],
scope=self.first_stage_box_predictor_scope)
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (tf.squeeze(box_encodings, axis=2),
objectness_predictions_with_background)
这里的_first_stage_box_predictor调用的是box_predictor.py的ConvolutionalBoxPredictor
def _predict(self, image_features, num_predictions_per_location):
features_depth = static_shape.get_depth(image_features.get_shape())
depth = max(min(features_depth, self._max_depth), self._min_depth)
# Add a slot for the background class.
num_class_slots = self.num_classes + 1
net = image_features
with slim.arg_scope(self._conv_hyperparams), \
slim.arg_scope([slim.dropout], is_training=self._is_training):
# Add additional conv layers before the predictor.
if depth > 0 and self._num_layers_before_predictor > 0:
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
with slim.arg_scope([slim.conv2d], activation_fn=None,
normalizer_fn=None, normalizer_params=None):
box_encodings = slim.conv2d(
net, num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
scope='BoxEncodingPredictor')
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d(
net, num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size], scope='ClassPredictor')
if self._apply_sigmoid_to_scores:
class_predictions_with_background = tf.sigmoid(
class_predictions_with_background)
batch_size = static_shape.get_batch_size(image_features.get_shape())
if batch_size is None:
features_height = static_shape.get_height(image_features.get_shape())
features_width = static_shape.get_width(image_features.get_shape())
flattened_predictions_size = (features_height * features_width *
num_predictions_per_location)
box_encodings = tf.reshape(
box_encodings,
[-1, flattened_predictions_size, 1, self._box_code_size])
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
[-1, flattened_predictions_size, num_class_slots])
else:
box_encodings = tf.reshape(
box_encodings, [batch_size, -1, 1, self._box_code_size])
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
return {BOX_ENCODINGS: box_encodings,
CLASS_PREDICTIONS_WITH_BACKGROUND:
class_predictions_with_background}
这里返回了box_encodings和class_predictions_with_background。
在特征box进行提取之后,由进行了flattened_proposal_feature_maps的特征提取。
box_classifier_features = (
self._feature_extractor.extract_box_classifier_features(
flattened_proposal_feature_maps,
scope=self.second_stage_feature_extractor_scope))
看这里,其实这里没改变特征图的尺寸。
def _extract_box_classifier_features(self, proposal_feature_maps, scope):
with tf.variable_scope(self._architecture, reuse=self._reuse_weights):
with slim.arg_scope(
resnet_utils.resnet_arg_scope(
batch_norm_epsilon=1e-5,
batch_norm_scale=True,
weight_decay=self._weight_decay)):
with slim.arg_scope([slim.batch_norm], is_training=False):
blocks = [
resnet_utils.Block('block4', resnet_v1.bottleneck, [{
'depth': 2048,
'depth_bottleneck': 512,
'stride': 1
}] * 3)
]
proposal_classifier_features = resnet_utils.stack_blocks_dense(
proposal_feature_maps, blocks)
return proposal_classifier_features
在测试文件里面。
def test_extract_box_classifier_features_returns_expected_size(self):
feature_extractor = self._build_feature_extractor(
first_stage_features_stride=16)
proposal_feature_maps = tf.random_uniform(
[3, 7, 7, 1024], maxval=255, dtype=tf.float32)
proposal_classifier_features = (
feature_extractor.extract_box_classifier_features(
proposal_feature_maps, scope='TestScope'))
features_shape = tf.shape(proposal_classifier_features)
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
features_shape_out = sess.run(features_shape)
self.assertAllEqual(features_shape_out, [3, 7, 7, 2048])
之后就是fast rcnn的精确坐标预测,在_mask_rcnn_box_predictor里面。
box_predictions = self._mask_rcnn_box_predictor.predict(
box_classifier_features,
num_predictions_per_location=1,
scope=self.second_stage_box_predictor_scope)
refined_box_encodings = tf.squeeze(
box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
class_predictions_with_background = tf.squeeze(box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1)
absolute_proposal_boxes = ops.normalized_to_image_coordinates(
proposal_boxes_normalized, image_shape, self._parallel_iterations)
prediction_dict = {
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background':
class_predictions_with_background,
'num_proposals': num_proposals,
'proposal_boxes': absolute_proposal_boxes,
}
还在box_predictor里面。
def _predict(self, image_features, num_predictions_per_location):
if num_predictions_per_location != 1:
raise ValueError('Currently FullyConnectedBoxPredictor only supports '
'predicting a single box per class per location.')
spatial_averaged_image_features = tf.reduce_mean(image_features, [1, 2],
keep_dims=True,
name='AvgPool')
flattened_image_features = slim.flatten(spatial_averaged_image_features)
if self._use_dropout:
flattened_image_features = slim.dropout(flattened_image_features,
keep_prob=self._dropout_keep_prob,
is_training=self._is_training)
with slim.arg_scope(self._fc_hyperparams):
box_encodings = slim.fully_connected(
flattened_image_features,
self._num_classes * self._box_code_size,
activation_fn=None,
scope='BoxEncodingPredictor')
class_predictions_with_background = slim.fully_connected(
flattened_image_features,
self._num_classes + 1,
activation_fn=None,
scope='ClassPredictor')
box_encodings = tf.reshape(
box_encodings, [-1, 1, self._num_classes, self._box_code_size])
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [-1, 1, self._num_classes + 1])
predictions_dict = {
BOX_ENCODINGS: box_encodings,
CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background
}
if self._predict_instance_masks:
with slim.arg_scope(self._conv_hyperparams):
upsampled_features = slim.conv2d_transpose(
image_features,
num_outputs=self._mask_prediction_conv_depth,
kernel_size=[2, 2],
stride=2)
mask_predictions = slim.conv2d(upsampled_features,
num_outputs=self.num_classes,
activation_fn=None,
kernel_size=[1, 1])
instance_masks = tf.expand_dims(tf.transpose(mask_predictions,
perm=[0, 3, 1, 2]),
axis=1,
name='MaskPredictor')
predictions_dict[MASK_PREDICTIONS] = instance_masks
return predictions_dict
这里的box_encodings和class_predictions_with_background,就是最后的预测结果。
更新
在nas_feature_extractor中
def test_extract_box_classifier_features_returns_expected_size(self):
feature_extractor = self._build_feature_extractor(
first_stage_features_stride=16)
proposal_feature_maps = tf.random_uniform(
[2, 17, 17, 1088], maxval=255, dtype=tf.float32)
proposal_classifier_features = (
feature_extractor.extract_box_classifier_features(
proposal_feature_maps, scope='TestScope'))
features_shape = tf.shape(proposal_classifier_features)
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
features_shape_out = sess.run(features_shape)
self.assertAllEqual(features_shape_out, [2, 9, 9, 4032])
在resent_v1_feature_extractor中
def test_extract_box_classifier_features_returns_expected_size(self):
feature_extractor = self._build_feature_extractor(
first_stage_features_stride=16)
proposal_feature_maps = tf.random_uniform(
[3, 7, 7, 1024], maxval=255, dtype=tf.float32)
proposal_classifier_features = (
feature_extractor.extract_box_classifier_features(
proposal_feature_maps, scope='TestScope'))
features_shape = tf.shape(proposal_classifier_features)
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
features_shape_out = sess.run(features_shape)
self.assertAllEqual(features_shape_out, [3, 7, 7, 2048])
可以看到在test_extract_box_classifier_features_returns_size的测试文件中,两个特征提取的模型的输出尺寸不一样。nas被最大池化stride=2,而resnet没有进行池化。本人在这里纠结了好一阵子。。。。。哎。。。。。
好几次错过了最重要的点。。。。
关于在第二阶段中_extract_box_classifier_features出来的尺寸问题。在查看源码的过程中,由于忽略了最后一层的全连接,所以一直纠结_extract_box_classifier_features的stride=2或stride=1。输入到_extract_box_classifier_features是通过box在feature map上剪裁出来的feature map尺度是大小一样的,_extract_box_classifier_features只是对剪裁过的feature map进行编码,记住剪裁过的feature map最后有两个输出一个是box location 和classes score 。box_predictor就是作用接住_extract_box_classifier_features的输出并产生box location 和classes score输出。
假如在自定义一个feature extractor时在这里_extract_box_classifier_features对输入的尺寸的改变不会影响最后的全连接层的输出。也就是说,大胆修改feature extractor的尺寸,其他的地方不需要动也可以构建一个新的模型,这估计也是这API比较好的地方。
更新10.30
_extract_box_classifier_features和_extract_proposal_features的输出尺寸在对faster rcnn 中其实没多大影响。
- 只需要记住的是_extract_proposal_features的尺寸是原图的1/8 或者 1/16.
- _extract_box_classifier_features的输出尺寸你可以任意,这里的任意是构建新的模型时候任意,送入的原始图片还是要固定的,_extract_box_classifier_features输出可以是[None, 7,7,1024] 也可以是[None, 28,28,128]等等。因为_extract_box_classifier_features的输出会被拉平,送到
box_predictor中。
现在大胆使用非Google官方提供的feature_extractor,虽然效果没有官方提供的那么好。