A simple version
Baidu Cloud link for the source code and weights:
Link: https://pan.baidu.com/s/1SkQDp9_5WAe59KJGRVJVwQ
Extraction code: zwsc
GitHub repo:
https://github.com/xiaohu2015/DeepLearning_tutorials/tree/master/ObjectDetections/SSD
Weights:
https://pan.baidu.com/s/1snhuTsT
This is another SSD detection codebase, with inference only. If you want to get a quick feel for SSD, it is a good place to start.
Let's start from the entry point:
SSD_demo.py
I really like this demo: it is nicely encapsulated, and the main function stays simple.
"""
SSD demo
"""
import cv2
import numpy as np
import tensorflow as tf
import matplotlib.image as mpimg
from ssd_300_vgg import SSD
from utils import preprocess_image, process_bboxes
from visualization import plt_bboxes
def main():
# 【1】搭建网络-->解码网络输出-->设置图片的占位节点
ssd_net = SSD() # 搭建网络:ssd300_vgg
classes, scores, bboxes = ssd_net.detections() # 设置分数阈值,解码网络输出得到bbox的类别、得分(概率)、边界框位置和大小
images = ssd_net.images() # 设置图片的占位节点:images是一个tf.placeholder
# 【2】导入SSD模型
sess = tf.Session()
ckpt_filename = './ssd_checkpoints/ssd_vgg_300_weights.ckpt'
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(sess, ckpt_filename)
# 【3】预处理图片-->处理预测边界框bboxes
img = cv2.imread('./demo/dog.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# 预处理图片:
# 1、白化;
# 2、resize300*300;
# 3、增加batchsize这个维度.
img_prepocessed = preprocess_image(img)
# 将预处理好的图片赋给图片的占位节点
rclasses, rscores, rbboxes = sess.run([classes, scores, bboxes], feed_dict={images: img_prepocessed})
# 处理预测边界框:
# 1、cut the box:将边界框超出整张图片(0,0)—(300,300)的部分cut掉;
# 2、按类别置信度scores降序,对边界框进行排序并仅保留top_k=400;
# 3、计算IOU-->NMS;
# 4、根据先验框anchor调整预测边界框的大小.
rclasses, rscores, rbboxes = process_bboxes(rclasses, rscores, rbboxes)
# 【4】可视化最终的检测结果
plt_bboxes(img, rclasses, rscores, rbboxes)
print('SSD detection has done!')
if __name__ == '__main__':
main()
Reading and preprocessing the image (a sketch of the preprocessing follows the snippet):
1. whitening;
2. resize to 300x300;
3. add the batch dimension.
img = cv2.imread('./demo/dog.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img_prepocessed = preprocess_image(img)
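preprocess_image lives in utils.py and is not reproduced here, but a minimal sketch of the three steps might look like the following; the function body and the VGG-style mean values are my assumptions, not the repo's code.

# A hypothetical sketch of preprocess_image, assuming VGG-style whitening;
# the mean values are an assumption -- check utils.py for the real ones.
import cv2
import numpy as np

def preprocess_image_sketch(img, size=(300, 300),
                            means=(123.0, 117.0, 104.0)):  # assumed RGB means
    img = img.astype(np.float32) - np.array(means, dtype=np.float32)  # 1. whitening
    img = cv2.resize(img, size)                                       # 2. resize to 300x300
    return np.expand_dims(img, axis=0)                                # 3. add batch dim -> [1, 300, 300, 3]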
First the SSD network is built. Because TensorFlow builds a static graph, classes, scores, and bboxes are just graph tensors at this point; no values flow until sess.run. They also hold only coarsely filtered detections: boxes whose class score exceeds a threshold.
ssd_net = SSD()  # build the network: ssd300_vgg
classes, scores, bboxes = ssd_net.detections()  # set the score threshold; decode the network output into class, score (probability), and box position/size
images = ssd_net.images()  # image placeholder: images is a tf.placeholder
Run the graph to get concrete values:
rclasses, rscores, rbboxes = sess.run([classes, scores, bboxes], feed_dict={images: img_prepocessed})
Post-process the model output:
1. clip the boxes: cut off the parts that fall outside the image (0,0)-(300,300);
2. sort the boxes by class score in descending order and keep only top_k=400;
3. compute IOU --> NMS;
4. rescale the boxes relative to the reference box.
rclasses, rscores, rbboxes = process_bboxes(rclasses, rscores, rbboxes)
ssd_300_vgg.py
Now let's step into ssd_300_vgg.py and look at the SSD module.
At the top of the file, the SSDParams data structure is defined with a namedtuple:
SSDParams = namedtuple('SSDParameters', ['img_shape',           # the input image size: 300x300
                                         'num_classes',         # number of classes: 20+1
                                         'no_annotation_label',
                                         'feature_layers',      # list of names of layers used for detection
                                         'feature_shapes',      # list of feature map sizes of layers used for detection
                                         'anchor_size_bounds',  # the lower and upper bounds of anchor sizes
                                         'anchor_sizes',        # list of anchor sizes per detection layer
                                         'anchor_ratios',       # list of ratios used per detection layer
                                         'anchor_steps',        # list of cell sizes (in pixels) per detection layer
                                         'anchor_offset',       # the center point offset
                                         'normalizations',      # list of normalizations per detection layer
                                         'prior_scaling'        # scaling factors used when decoding box offsets
                                         ])
The SSD constructor:
class SSD(object):
    # constructor
    def __init__(self, is_training=True):
        self.is_training = is_training
        self.threshold = 0.5  # class score threshold
        self.ssd_params = SSDParams(img_shape=(300, 300),
                                    num_classes=21,
                                    no_annotation_label=21,
                                    feature_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
                                    feature_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                                    anchor_size_bounds=[0.15, 0.90],  # diff from the original paper
                                    anchor_sizes=[(21., 45.), (45., 99.), (99., 153.),
                                                  (153., 207.), (207., 261.), (261., 315.)],
                                    anchor_ratios=[[2, .5], [2, .5, 3, 1. / 3], [2, .5, 3, 1. / 3],
                                                   [2, .5, 3, 1. / 3], [2, .5], [2, .5]],
                                    anchor_steps=[8, 16, 32, 64, 100, 300],
                                    anchor_offset=0.5,
                                    normalizations=[20, -1, -1, -1, -1, -1],
                                    prior_scaling=[0.1, 0.1, 0.2, 0.2]
                                    )
        predictions, locations = self._built_net()  # [1] the SSD300 network (300x300 input)
        # self._update_feature_shapes_from_net()
        classes, scores, bboxes = self._bboxes_select(predictions, locations)  # [2][3] decode the network output and filter the boxes
        self._classes = classes  # class labels
        self._scores = scores    # scores (probabilities)
        self._bboxes = bboxes    # predicted box positions and sizes
This is where all the network parameters are configured.
_built_net() builds the network graph:
predictions, locations = self._built_net()  # [1] the SSD300 network (300x300 input)
_bboxes_select() then keeps only the bboxes whose class score exceeds the threshold. This is a coarse filter; the fine filtering (NMS) happens later in the demo.
classes, scores, bboxes = self._bboxes_select(predictions, locations)  # [2][3] decode the network output and filter the boxes
Now into _built_net():
def _built_net(self):
    self.end_points = {}  # record the outputs of the detection layers
    # image placeholder (fixed size)
    self._images = tf.placeholder(tf.float32,
                                  shape=[None, self.ssd_params.img_shape[0], self.ssd_params.img_shape[1], 3])
    with tf.variable_scope('ssd_300_vgg'):  # note: 'ssd_300_vgg' must not be changed, or the restored checkpoint variables will not be found
        # (1) the classic VGG layers
        # block 1
        net = conv2d(self._images, filters=64, kernel_size=3, scope='conv1_1')
        net = conv2d(net, 64, 3, scope='conv1_2')
        self.end_points['block1'] = net
        net = max_pool2d(net, pool_size=2, scope='pool1')
        # block 2
        net = conv2d(net, 128, 3, scope='conv2_1')
        net = conv2d(net, 128, 3, scope='conv2_2')
        self.end_points['block2'] = net
        net = max_pool2d(net, 2, scope='pool2')
        # block 3
        net = conv2d(net, 256, 3, scope="conv3_1")
        net = conv2d(net, 256, 3, scope="conv3_2")
        net = conv2d(net, 256, 3, scope="conv3_3")
        self.end_points["block3"] = net
        net = max_pool2d(net, 2, scope="pool3")
        # block 4
        net = conv2d(net, 512, 3, scope="conv4_1")
        net = conv2d(net, 512, 3, scope="conv4_2")
        net = conv2d(net, 512, 3, scope="conv4_3")
        self.end_points["block4"] = net
        net = max_pool2d(net, 2, scope="pool4")
        # block 5
        net = conv2d(net, 512, 3, scope="conv5_1")
        net = conv2d(net, 512, 3, scope="conv5_2")
        net = conv2d(net, 512, 3, scope="conv5_3")
        self.end_points["block5"] = net
        print(net)
        net = max_pool2d(net, pool_size=3, stride=1, scope="pool5")  # 3x3 pooling kernel with stride 1
        print(net)
        # (2) the extra SSD layers
        # block 6: dilated (atrous) convolution
        net = conv2d(net, filters=1024, kernel_size=3, dilation_rate=6, scope='conv6')
        self.end_points['block6'] = net
        # net = dropout(net, is_training=self.is_training)
        # block 7
        net = conv2d(net, 1024, 1, scope='conv7')
        self.end_points['block7'] = net
        # block 8
        net = conv2d(net, 256, 1, scope='conv8_1x1')
        net = conv2d(pad2d(net, 1), 512, 3, stride=2, scope='conv8_3x3', padding='valid')
        self.end_points['block8'] = net
        # block 9
        net = conv2d(net, 128, 1, scope="conv9_1x1")
        net = conv2d(pad2d(net, 1), 256, 3, stride=2, scope="conv9_3x3", padding="valid")
        self.end_points["block9"] = net
        # block 10
        net = conv2d(net, 128, 1, scope="conv10_1x1")
        net = conv2d(net, 256, 3, scope="conv10_3x3", padding="valid")
        self.end_points["block10"] = net
        # block 11
        net = conv2d(net, 128, 1, scope="conv11_1x1")
        net = conv2d(net, 256, 3, scope="conv11_3x3", padding="valid")
        self.end_points["block11"] = net
        # class and location predictions
        predictions = []
        locations = []
        for i, layer in enumerate(self.ssd_params.feature_layers):
            # feature_layers = ['block4', 'block7', 'block8', 'block9', 'block10', 'block11']
            cls, loc = ssd_multibox_layer(self.end_points[layer], self.ssd_params.num_classes,
                                          self.ssd_params.anchor_sizes[i],
                                          self.ssd_params.anchor_ratios[i],
                                          self.ssd_params.normalizations[i],
                                          scope=layer + '_box')
            predictions.append(tf.nn.softmax(cls))  # class scores via softmax
            locations.append(loc)                   # raw box offsets (decoded later)
    return predictions, locations
Here predictions holds the class scores of the six feature maps, each of shape [None, w, h, n_anchors, num_classes], and locations holds the corresponding box coordinate predictions, each of shape [None, w, h, n_anchors, 4].
- From conv1_1 to conv11_3x3 the model is nothing but convolutions. At the end of the function, ssd_multibox_layer() turns the collected feature maps into box coordinates (x, y, w, h) and class scores (21 classes).
- The detection layers are obtained by convolving the Conv4_3, Conv7, Conv8_2, Conv9_2, Conv10_2, and Conv11_2 feature maps; they yield the class and location predictions for the boxes.
def ssd_multibox_layer(x, num_classes, sizes, ratios, normalization=-1, scope='multibox'):
    pre_shape = x.get_shape().as_list()[1:-1]  # drop the batch and channel dims to keep the spatial shape
    pre_shape = [-1] + pre_shape
    with tf.variable_scope(scope):
        # l2 norm
        if normalization > 0:
            x = l2norm(x, normalization)
            print(x)
        # number of anchors
        n_anchors = len(sizes) + len(ratios)
        # location predictions
        loc_pred = conv2d(x, filters=n_anchors * 4, kernel_size=3, activation=None, scope='conv_loc')  # each anchor is described by 4 position/size values
        loc_pred = tf.reshape(loc_pred, pre_shape + [n_anchors, 4])  # [..., n_anchors, location info per anchor]
        # class predictions
        cls_pred = conv2d(x, filters=n_anchors * num_classes, kernel_size=3, activation=None, scope='conv_cls')
        cls_pred = tf.reshape(cls_pred, pre_shape + [n_anchors, num_classes])  # [..., n_anchors, class info per anchor]
        return cls_pred, loc_pred
Here loc_pred.shape = [None, w, h, n_anchors, 4] and
cls_pred.shape = [None, w, h, n_anchors, num_classes], where None is the batch size. For block4 (38x38 with 4 anchors per location), for example, loc_pred is [None, 38, 38, 4, 4] and cls_pred is [None, 38, 38, 4, 21].
Next comes the _bboxes_select() function:
classes, scores, bboxes = self._bboxes_select(predictions, locations)  # [2][3] decode the network output and filter the boxes
_bboxes_select()
def _bboxes_select(self, predictions, locations):
    anchor_bboxes_list = self.anchors()
    classes_list = []
    scores_list = []
    bboxes_list = []
    # select bboxes for each feature layer: apply the filtering rule layer by layer
    for n in range(len(predictions)):
        anchor_bboxes = list(map(tf.convert_to_tensor, anchor_bboxes_list[n]))
        classes, scores, bboxes = self._bboxes_select_layer(predictions[n], locations[n],
                                                            anchor_bboxes, self.ssd_params.prior_scaling)
        classes_list.append(classes)
        scores_list.append(scores)
        bboxes_list.append(bboxes)
    # merge the filtered boxes from all feature layers
    classes = tf.concat(classes_list, axis=0)
    scores = tf.concat(scores_list, axis=0)
    bboxes = tf.concat(bboxes_list, axis=0)
    return classes, scores, bboxes
First, all the anchor boxes are collected:
anchor_bboxes_list = self.anchors()
So let's jump to anchors():
# get the SSD anchors
def anchors(self):
    return ssd_anchors_all_layers(self.ssd_params.img_shape,
                                  self.ssd_params.feature_shapes,
                                  self.ssd_params.anchor_sizes,
                                  self.ssd_params.anchor_ratios,
                                  self.ssd_params.anchor_steps,
                                  self.ssd_params.anchor_offset,
                                  np.float32)
Then on to ssd_anchors_all_layers():
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Compute anchor boxes for all feature layers.
    """
    layers_anchors = []
    for i, s in enumerate(layers_shape):
        anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
                                             anchor_sizes[i],
                                             anchor_ratios[i],
                                             anchor_steps[i],
                                             offset=offset, dtype=dtype)
        layers_anchors.append(anchor_bboxes)
    return layers_anchors
And finally into ssd_anchor_one_layer():
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    # Compute the position grid: simple way.
    # y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    # y = (y.astype(dtype) + offset) / feat_shape[0]
    # x = (x.astype(dtype) + offset) / feat_shape[1]
    # Weird SSD-Caffe computation using steps values...
    # (y+offset)/feat_shape
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = (y.astype(dtype) + offset) * step / img_shape[0]
    x = (x.astype(dtype) + offset) * step / img_shape[1]
    # this yields the center point of every feature map cell, relative to the whole image;
    # the + 0.5 offset moves each center into the middle of its cell (each cell is one feature map point)
    # Expand dims to support easy broadcasting.
    y = np.expand_dims(y, axis=-1)  # [size, size, 1]
    x = np.expand_dims(x, axis=-1)  # [size, size, 1]
    # Compute relative height and width.
    # Tries to follow the original implementation of SSD for the order.
    num_anchors = len(sizes) + len(ratios)
    # there are num_anchors boxes per cell; start with the smallest box size
    h = np.zeros((num_anchors, ), dtype=dtype)  # [n_anchors]
    w = np.zeros((num_anchors, ), dtype=dtype)  # [n_anchors]
    # Add first anchor boxes with ratio=1.
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    # per-cell box sizes, normalized
    di = 1
    if len(sizes) > 1:
        # the largest of the num_anchors box sizes: sqrt(min * max)
        h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
        w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
        di += 1
    for i, r in enumerate(ratios):
        # the remaining box sizes, one per aspect ratio
        h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
    # return all boxes as (y, x, h, w); all values are normalized
    return y, x, h, w
The input arguments line up with the parameters set earlier:
- img_shape=(300,300)
- feat_shape=[(38,38),(19,19),(10,10),(5,5),(3,3),(1,1)]
- sizes=[(21.,45.),(45.,99.),(99.,153.),(153.,207.),(207.,261.),(261.,315.)]; each pair is the [min, max] box size for a feature map.
- ratios=[[2, .5],[2, .5, 3, 1. / 3],[2, .5, 3, 1. / 3],[2, .5, 3, 1. / 3],[2, .5],[2, .5]]
- step=[8, 16, 32, 64, 100, 300]; the stride, in input pixels, of one cell of each feature map (8 for block4).
ssd_anchor_one_layer generates the fixed anchors (y, x, h, w) for one layer; every value is divided by image_w or image_h, so none of (y, x, h, w) exceeds 1. A worked example for block4 follows.
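As a sanity check, here is the block4 arithmetic written out by hand; it uses only the parameters listed above.

import math

# block4: sizes=(21., 45.), ratios=[2, .5], step=8, img_shape=(300, 300)
# num_anchors = len(sizes) + len(ratios) = 4
h0 = 21. / 300                    # 0.0700  ratio 1, min size
h1 = math.sqrt(21. * 45.) / 300   # 0.1025  ratio 1, sqrt(min * max)
h2 = 21. / 300 / math.sqrt(2.)    # 0.0495  ratio 2   (w2 = 0.0990)
h3 = 21. / 300 / math.sqrt(.5)    # 0.0990  ratio 0.5 (w3 = 0.0495)
# centers: x = (i + 0.5) * 8 / 300 for i in 0..37 -> 0.0133, 0.04, ..., 1.0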
Returning to ssd_anchors_all_layers(), we find
layers_anchors.append(anchor_bboxes)
so layers_anchors collects the boxes of every feature map; for this VGG-based SSD300 there are 8732 boxes in total.
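That 8732 figure is easy to verify from feature_shapes and the per-layer anchor counts (len(sizes) + len(ratios) = 4, 6, 6, 6, 4, 4):

shapes     = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
n_per_cell = [4, 6, 6, 6, 4, 4]
total = sum(h * w * n for (h, w), n in zip(shapes, n_per_cell))
print(total)  # 5776 + 2166 + 600 + 150 + 36 + 4 = 8732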
Back up in _bboxes_select():
anchor_bboxes_list = self.anchors()
anchor_bboxes_list now holds the anchors of all six feature maps.
_bboxes_select() then coarsely filters the boxes by whether their prediction score exceeds the threshold:
for n in range(len(predictions)):
    anchor_bboxes = list(map(tf.convert_to_tensor, anchor_bboxes_list[n]))
    classes, scores, bboxes = self._bboxes_select_layer(predictions[n], locations[n],
                                                        anchor_bboxes, self.ssd_params.prior_scaling)
    classes_list.append(classes)
    scores_list.append(scores)
    bboxes_list.append(bboxes)
# merge the filtered boxes from all feature layers
classes = tf.concat(classes_list, axis=0)
scores = tf.concat(scores_list, axis=0)
bboxes = tf.concat(bboxes_list, axis=0)
Here _bboxes_select_layer() filters the boxes of one detection layer (this assumes batch size = 1), using a single rule: keep a box if its maximum class score is above the threshold.
def _bboxes_select_layer(self, feature_predictions, feature_locations, anchor_bboxes, prior_scaling):
    # number of bboxes = product of the spatial dims of the network output
    n_bboxes = np.product(feature_predictions.get_shape().as_list()[1:-1])
    # decode the box locations
    bboxes = self._bboxes_decode_layer(feature_locations, anchor_bboxes, prior_scaling)
    bboxes = tf.reshape(bboxes, [n_bboxes, 4])  # [number of bboxes, position and size of each bbox]
    predictions = tf.reshape(feature_predictions, [n_bboxes, self.ssd_params.num_classes])  # [number of bboxes, class scores of each bbox]
    # drop the background score
    sub_predictions = predictions[:, 1:]
    # pick the best class per box
    classes = tf.argmax(sub_predictions, axis=1) + 1  # class labels: index of the max class score (+1 because background sits at index 0)
    scores = tf.reduce_max(sub_predictions, axis=1)   # max class score
    # *** filter the boxes: max class score > threshold (only this one rule is applied here) ***
    filter_mask = scores > self.threshold  # boolean vector: True keeps, False drops
    classes = tf.boolean_mask(classes, filter_mask)
    scores = tf.boolean_mask(scores, filter_mask)
    bboxes = tf.boolean_mask(bboxes, filter_mask)
    return classes, scores, bboxes
The arguments, spelled out:
feature_predictions = the class scores of every box, shape [None, w, h, n_anchors, num_classes]
feature_locations = the coordinate offsets (x, y, w, h) of every box, shape [None, w, h, n_anchors, 4]
anchor_bboxes = the fixed anchors of this layer, i.e. the (y, x, h, w) tuple returned by ssd_anchor_one_layer
prior_scaling = [0.1, 0.1, 0.2, 0.2]
This introduces the _bboxes_decode_layer() function:
it decodes the predicted offsets feature_locations against the fixed anchor_bboxes of each feature map to obtain the box positions and sizes.
def _bboxes_decode_layer(self, feature_locations, anchor_bboxes, prior_scaling):  # prior_scaling: prior variances
    y_a, x_a, h_a, w_a = anchor_bboxes
    print(y_a)
    # decode: recover the real cx/cy/w/h from the anchors
    cx = feature_locations[:, :, :, :, 0] * w_a * prior_scaling[0] + x_a
    cy = feature_locations[:, :, :, :, 1] * h_a * prior_scaling[1] + y_a
    w = w_a * tf.exp(feature_locations[:, :, :, :, 2] * prior_scaling[2])
    h = h_a * tf.exp(feature_locations[:, :, :, :, 3] * prior_scaling[3])
    # cx/cy/w/h --> ymin/xmin/ymax/xmax
    bboxes = tf.stack([cy - h / 2.0, cx - w / 2.0, cy + h / 2.0, cx + w / 2.0], axis=-1)
    # shape is [batch_size, size, size, n_anchors, 4]
    return bboxes
Note that here the boxes are decoded into the order
[ymin, xmin, ymax, xmax].
The decoded values are normalized with respect to the original image (ymin is a fraction of the image height); this is the coordinate transform from the SSD notes, written out below.
The output bboxes.shape = [batch_size, size, size, n_anchors, 4].
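For reference, here is the decode step in formula form; it matches the code above, with prior_scaling $(0.1, 0.1, 0.2, 0.2)$ and anchor $(x_a, y_a, w_a, h_a)$:

$$c_x = x_a + 0.1\,\hat{l}_x\, w_a \qquad c_y = y_a + 0.1\,\hat{l}_y\, h_a$$
$$w = w_a \exp(0.2\,\hat{l}_w) \qquad h = h_a \exp(0.2\,\hat{l}_h)$$

where $(\hat{l}_x, \hat{l}_y, \hat{l}_w, \hat{l}_h)$ are the four channels of feature_locations; stacking $[c_y - h/2,\; c_x - w/2,\; c_y + h/2,\; c_x + w/2]$ then gives exactly [ymin, xmin, ymax, xmax].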
That wraps up the SSD module.
Back to ssd_demo.py.
Post-processing the predicted boxes:
1. clip the boxes: cut off the parts that fall outside the image (0,0)-(300,300);
2. sort the boxes by class score in descending order and keep only top_k=400;
3. compute IOU --> NMS;
4. rescale the boxes relative to the reference box.
rclasses, rscores, rbboxes = process_bboxes(rclasses, rscores, rbboxes)
Jump to process_bboxes():
def process_bboxes(rclasses, rscores, rbboxes, rbbox_img=(0.0, 0.0, 1.0, 1.0),
                   top_k=400, nms_threshold=0.5):
    # [1] clip the boxes: cut off the parts that fall outside the image (0,0)-(300,300)
    rbboxes = bboxes_clip(rbbox_img, rbboxes)
    # [2] sort the boxes by class score in descending order and keep only top_k=400
    rclasses, rscores, rbboxes = bboxes_sort(rclasses, rscores, rbboxes, top_k)
    # [3] compute IOU --> NMS
    rclasses, rscores, rbboxes = bboxes_nms(rclasses, rscores, rbboxes, nms_threshold)
    # [4] rescale the boxes relative to the reference box
    rbboxes = bboxes_resize(rbbox_img, rbboxes)
    return rclasses, rscores, rbboxes
The bboxes_clip() function only trims each box to the image extent: parts that stick out are clipped off, but no box is removed.
def bboxes_clip(bbox_ref, bboxes):
    """Clip bounding boxes with respect to reference bbox."""
    bboxes = np.copy(bboxes)
    bboxes = np.transpose(bboxes)
    bbox_ref = np.transpose(bbox_ref)
    bboxes[0] = np.maximum(bboxes[0], bbox_ref[0])  # ymin
    bboxes[1] = np.maximum(bboxes[1], bbox_ref[1])  # xmin
    bboxes[2] = np.minimum(bboxes[2], bbox_ref[2])  # ymax
    bboxes[3] = np.minimum(bboxes[3], bbox_ref[3])  # xmax
    bboxes = np.transpose(bboxes)
    return bboxes
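A tiny illustrative check (the numbers are made up): a box sticking out past the top and bottom edges gets trimmed to the unit square.

import numpy as np

boxes = np.array([[-0.10, 0.20, 1.20, 0.90]])    # [ymin, xmin, ymax, xmax]
print(bboxes_clip((0.0, 0.0, 1.0, 1.0), boxes))  # -> [[0.0, 0.2, 1.0, 0.9]]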
bboxes_sort() sorts the boxes by class score in descending order and keeps only the top_k=400:
rclasses, rscores, rbboxes = bboxes_sort(rclasses, rscores, rbboxes, top_k)
Here is bboxes_sort():
def bboxes_sort(classes, scores, bboxes, top_k=400):
    """Sort bounding boxes by decreasing order and keep only the top_k."""
    idxes = np.argsort(-scores)  # indices that sort scores in descending order
    classes = classes[idxes][:top_k]
    scores = scores[idxes][:top_k]
    bboxes = bboxes[idxes][:top_k]
    return classes, scores, bboxes
A dead-simple function: take the top_k highest-scoring entries.
Next, non-maximum suppression.
Compute IOU --> NMS:
rclasses, rscores, rbboxes = bboxes_nms(rclasses, rscores, rbboxes, nms_threshold)
Before looking at bboxes_nms itself, see how the IOU is computed: the ratio of the intersection area of two boxes to their union area.
Broadcasting is used here: bboxes1 is a single box while bboxes2 is a whole set of boxes; numpy handles this quite elegantly.
# compute the IOU
def bboxes_iou(bboxes1, bboxes2):
    bboxes1 = np.transpose(bboxes1)
    bboxes2 = np.transpose(bboxes2)
    # intersection of the two boxes: take the max of the two top-left corners and the min of the two bottom-right corners
    int_ymin = np.maximum(bboxes1[0], bboxes2[0])
    int_xmin = np.maximum(bboxes1[1], bboxes2[1])
    int_ymax = np.minimum(bboxes1[2], bboxes2[2])
    int_xmax = np.minimum(bboxes1[3], bboxes2[3])
    # width/height of the intersection: if the boxes do not intersect, w and h come out negative, so clamp to 0
    int_h = np.maximum(int_ymax - int_ymin, 0.)
    int_w = np.maximum(int_xmax - int_xmin, 0.)
    # compute the IOU
    int_vol = int_h * int_w  # intersection area
    vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1])  # area of bboxes1
    vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1])  # area of bboxes2
    iou = int_vol / (vol1 + vol2 - int_vol)  # IOU = intersection / union
    return iou
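A quick illustration of the broadcasting: one box against a set of two (the numbers are made up).

import numpy as np

box  = np.array([0.0, 0.0, 1.0, 1.0])      # [ymin, xmin, ymax, xmax]
many = np.array([[0.0, 0.0, 0.5, 0.5],
                 [0.5, 0.5, 1.0, 1.0]])
print(bboxes_iou(box, many))               # -> [0.25 0.25]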
Now into bboxes_nms:
def bboxes_nms(classes, scores, bboxes, nms_threshold=0.5):
    """Apply non-maximum selection to bounding boxes."""
    keep_bboxes = np.ones(scores.shape, dtype=bool)  # np.bool is deprecated in recent numpy
    for i in range(scores.size - 1):
        if keep_bboxes[i]:
            # Compute overlap with bboxes which are following.
            overlap = bboxes_iou(bboxes[i], bboxes[(i+1):])
            # Overlap threshold for keeping + checking part of the same class
            keep_overlap = np.logical_or(overlap < nms_threshold, classes[(i+1):] != classes[i])
            keep_bboxes[(i+1):] = np.logical_and(keep_bboxes[(i+1):], keep_overlap)
    idxes = np.where(keep_bboxes)
    return classes[idxes], scores[idxes], bboxes[idxes]
A note so this isn't forgotten:
keep_overlap = np.logical_or(overlap < nms_threshold, classes[(i+1):] != classes[i])
keep_bboxes[(i+1):] = np.logical_and(keep_bboxes[(i+1):], keep_overlap)
- 1. If box i is still kept, go to steps 2 and 3.
- 2. Among the boxes after i, keep those whose class differs from box i's, or whose IOU with box i is below the threshold; then go to 3.
- 3. Update the keep status of all boxes; continue the loop, skipping boxes that are no longer kept, until the last box.
- Finally, extract all the boxes that are still kept.
Note: at the start all scores.size boxes are kept. The loop runs to scores.size-1 because each iteration i only compares against the boxes after it, keep_bboxes[(i+1):]; the last box has nothing after it, and this also avoids indexing past the end. A tiny trace follows.
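Here is the promised trace (inputs already sorted by score, as bboxes_sort guarantees; the numbers are made up): box 1 overlaps box 0 heavily and shares its class, so it is suppressed; box 2 overlaps box 0 completely but belongs to a different class, so it survives.

import numpy as np

classes = np.array([1, 1, 2])
scores  = np.array([0.9, 0.8, 0.7])
bboxes  = np.array([[0.0, 0.0, 1.0, 1.0],
                    [0.0, 0.0, 0.9, 0.9],    # IOU with box 0 = 0.81 > 0.5, same class -> dropped
                    [0.0, 0.0, 1.0, 1.0]])   # IOU with box 0 = 1.0 but class differs -> kept
print(bboxes_nms(classes, scores, bboxes))   # -> (array([1, 2]), array([0.9, 0.7]), ...)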
Rescaling the boxes relative to the reference box
Back in process_bboxes, the last step adjusts the box coordinates:
rbboxes = bboxes_resize(rbbox_img, rbboxes)
bboxes_resize is as follows.
It maps the boxes back into the frame of the reference box, i.e. back towards the original image; but since bbox_ref here is (0, 0, 1, 1) and the coordinates are already normalized, it is effectively an identity transform.
def bboxes_resize(bbox_ref, bboxes):
    """Resize bounding boxes based on a reference bounding box,
    assuming that the latter is [0, 0, 1, 1] after transform.
    """
    bboxes = np.copy(bboxes)
    # Translate.
    bboxes[:, 0] -= bbox_ref[0]
    bboxes[:, 1] -= bbox_ref[1]
    bboxes[:, 2] -= bbox_ref[0]
    bboxes[:, 3] -= bbox_ref[1]
    # Resize.
    resize = [bbox_ref[2] - bbox_ref[0], bbox_ref[3] - bbox_ref[1]]
    bboxes[:, 0] /= resize[0]
    bboxes[:, 1] /= resize[1]
    bboxes[:, 2] /= resize[0]
    bboxes[:, 3] /= resize[1]
    return bboxes
Back to ssd_demo.py.
I think plt_bboxes is also worth a look,
because it shows how [ymin, xmin, ymax, xmax] is mapped back onto the original image.
plt_bboxes(img, rclasses, rscores, rbboxes)
It looks like this:
def plt_bboxes(img, classes, scores, bboxes, figsize=(10,10), linewidth=1.5, show_class_name=True):
    """Visualize bounding boxes. Largely inspired by SSD-MXNET!"""
    fig = plt.figure(figsize=figsize)
    plt.imshow(img)
    height = img.shape[0]
    width = img.shape[1]
    colors = dict()
    for i in range(classes.shape[0]):
        cls_id = int(classes[i])
        if cls_id >= 0:
            score = scores[i]
            if cls_id not in colors:
                colors[cls_id] = (random.random(), random.random(), random.random())
            ymin = int(bboxes[i, 0] * height)
            xmin = int(bboxes[i, 1] * width)
            ymax = int(bboxes[i, 2] * height)
            xmax = int(bboxes[i, 3] * width)
            rect = plt.Rectangle((xmin, ymin), xmax - xmin,
                                 ymax - ymin, fill=False,
                                 edgecolor=colors[cls_id],
                                 linewidth=linewidth)
            plt.gca().add_patch(rect)
            class_name = CLASSES[cls_id-1] if show_class_name else str(cls_id)
            plt.gca().text(xmin, ymin - 2,
                           '{:s} | {:.3f}'.format(class_name, score),
                           bbox=dict(facecolor=colors[cls_id], alpha=0.5),
                           fontsize=12, color='white')
    # plt.savefig('./SSD_data/detection.jpg')  # save the annotated image
    plt.show()
This shows that [ymin, xmin, ymax, xmax] are fractions of the original image: with the image width and height treated as 1, each value lies between 0 and 1. For a 480x640 image, for example, ymin = 0.25 maps to pixel row int(0.25 * 480) = 120.
Two more things, both from the ssd_layers.py file.
ssd_layers.py
ssd_layers.py contains an l2norm function, which is applied on top of conv2d.
def l2norm(x, scale, trainable=True, scope='L2Normalization'):
    n_channels = x.get_shape().as_list()[-1]  # number of channels
    l2_norm = tf.nn.l2_normalize(x, dim=[3], epsilon=1e-12)  # normalize each pixel only along the channel dimension
    with tf.variable_scope(scope):
        gamma = tf.get_variable("gamma", shape=[n_channels, ], dtype=tf.float32,
                                initializer=tf.constant_initializer(scale),
                                trainable=trainable)
        return l2_norm * gamma
l2norm: Conv4_3 serves as the first feature map used for detection. Because it sits early in the network, its activation norm is relatively large, so an L2 Normalization layer is added after it to keep its scale comparable to the later detection layers. This is not the same as Batch Normalization: it only normalizes each pixel along the channel dimension, and after normalization a trainable scale gamma is applied; Batch Normalization instead normalizes over the [batch_size, width, height] dimensions. A plain-numpy sketch of the operation follows.
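In plain numpy the whole operation amounts to roughly the following (my own illustration, not the repo's code; the epsilon handling only approximates tf.nn.l2_normalize):

import numpy as np

def l2norm_sketch(x, gamma):  # x: [N, H, W, C], gamma: [C]
    # per-pixel L2 norm over the channel dimension only
    norm = np.sqrt(np.sum(x ** 2, axis=-1, keepdims=True)) + 1e-12
    return x / norm * gamma   # rescale with the (trainable) per-channel gamma

x = np.random.rand(1, 38, 38, 512).astype(np.float32)
y = l2norm_sketch(x, gamma=np.full(512, 20.0, dtype=np.float32))  # scale initialized to 20, as for Conv4_3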
The conv2d function also supports dilated (atrous) convolution, which enlarges the convolution's field of view exponentially without adding parameters or model complexity; the dilation rate parameter controls how far apart the kernel taps are spread.
def conv2d(x, filters, kernel_size, stride=1, padding='same',
           dilation_rate=1, activation=tf.nn.relu, scope='conv2d'):
    kernel_sizes = [kernel_size] * 2     # --> [kernel_size, kernel_size]
    strides = [stride] * 2               # --> [stride, stride]
    dilation_rate = [dilation_rate] * 2  # dilation rate --> [dilation_rate, dilation_rate]
    return tf.layers.conv2d(inputs=x, filters=filters, kernel_size=kernel_sizes,
                            strides=strides, dilation_rate=dilation_rate, padding=padding,
                            name=scope, activation=activation)
Here dilation_rate is the dilation rate of the atrous convolution; with dilation_rate=1 it reduces to an ordinary convolution.
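One way to see the "larger view without more parameters" claim: a k x k kernel with dilation d covers k + (k-1)(d-1) pixels per side, so the 3x3, rate-6 kernel of conv6 above looks at a 13x13 window while still having only 9 taps.

def effective_kernel(k, d):
    # a k x k kernel with dilation d spans k + (k - 1) * (d - 1) pixels per side
    return k + (k - 1) * (d - 1)

print(effective_kernel(3, 1))  # 3  -> ordinary convolution
print(effective_kernel(3, 6))  # 13 -> conv6 in block 6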
That is about it for this walkthrough.
There are surely things I have not considered; I will keep coming back to this.