proposal_target_layer.py (FPN)
bbox_weights_all不知道后面怎么用。。。
layer {
name: 'roi-data'type: 'Python'
bottom: 'rpn_rois'
bottom: 'gt_boxes'
bottom: 'data'
top: 'rois/h2'
top: 'rois/h3'
top: 'rois/h4'
top: 'rois/h5'
top: 'labels'
top: 'bbox_targets'
top: 'bbox_inside_weights'
top: 'bbox_outside_weights'
python_param {
module: 'rpn.proposal_target_layer'
layer: 'ProposalTargetLayer'
param_str: "'num_classes': 21"
}
}
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
import caffe
import yaml
import numpy as np
import numpy.random as npr
from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps
from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv
import matplotlib
matplotlib.use('Agg')
DEBUG = False
def vis_all_detection(im_array, detections, class_names, scale):
"""
visualize all detections in one image
:param im_array: [b=1 c h w] in rgb
:param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ]
:param class_names: list of names in imdb
:param scale: visualize the scaled image
:return:
"""
# print im_array.shape
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
import random
a = [103.06 ,115.9 ,123.15]
a = np.array(a)
im = transform_inverse(im_array,a)
plt.imshow(im)
for j in range(detections.shape[0]):
# if class_names[j] == 0:
# continue
color = (random.random(), random.random(), random.random()) # generate a random color
dets = detections[j]
det =dets
bbox = det[0:]
score = det[0]
rect = plt.Rectangle((bbox[0], bbox[1]),
bbox[2] - bbox[0],
bbox[3] - bbox[1], fill=False,
edgecolor=color, linewidth=3.5)
plt.gca().add_patch(rect)
# plt.gca().text(bbox[0], bbox[1] - 2,
# '{:s} {:.3f}'.format(str(class_names[j]), score),
# bbox=dict(facecolor=color, alpha=0.5), fontsize=12, color='white')
plt.show()
name = np.mean(im)
savefig ('vis/'+str(name)+'.png')
plt.clf()
plt.cla()
plt. close(0)
def transform_inverse(im_tensor, pixel_means):
"""
transform from mxnet im_tensor to ordinary RGB image
im_tensor is limited to one image
:param im_tensor: [batch, channel, height, width]
:param pixel_means: [B, G, R pixel means]
:return: im [height, width, channel(RGB)]
"""
assert im_tensor.shape[0] == 1
im_tensor = im_tensor.copy()
# put channel back
channel_swap = (0, 2, 3, 1)
im_tensor = im_tensor.transpose(channel_swap)
im = im_tensor[0]
assert im.shape[2] == 3
im += pixel_means[[2, 1, 0]]
im = im.astype(np.uint8)
return im
class ProposalTargetLayer(caffe.Layer):
"""
Assign object detection proposals to ground-truth targets. Produces proposal
classification labels and bounding-box regression targets.
"""
def setup(self, bottom, top):
layer_params = yaml.load(self.param_str_)
self._num_classes = layer_params['num_classes']
self._batch_rois = cfg.TRAIN.BATCH_SIZE
# sampled rois (0, x1, y1, x2, y2)
top[0].reshape(1, 5)
top[1].reshape(1, 5)
top[2].reshape(1, 5)
top[3].reshape(1, 5)
# labels
top[4].reshape(1, 1)
# bbox_targets
top[5].reshape(1, self._num_classes * 4)
# bbox_inside_weights
top[6].reshape(1, self._num_classes * 4)
# bbox_outside_weights
top[7].reshape(1, self._num_classes * 4)
def forward(self, bottom, top):
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
# (i.e., rpn.proposal_layer.ProposalLayer), or any other source
proposal layer生成的所有proposal
all_rois = bottom[0].data
aaa = all_rois[:]# GT boxes (x1, y1, x2, y2, label)
# TODO(rbg): it's annoying that sometimes I have extra info before
# and other times after box coordinates -- normalize to one format
gt_boxes = bottom[1].data 来自roi-data-layer
im = bottom[2].data 来自roi-data-layer
# Include ground-truth boxes in the set of candidate rois
zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
all_rois = np.vstack(
(all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) proposal + gt
)
num_images = 1
rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images BATCH_SIZE 默认256
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
rois, labels, bbox_targets, bbox_weights ,layer_indexs = _sample_rois(
all_rois, gt_boxes, fg_rois_per_image,
rois_per_image, self._num_classes,sample_type='fpn', k0 = 4) 返回保留的前景/背景,他们对应的gt分类,他们与对应gt的偏移,bbox_weights(通过_get_bbox_regression_labels),如果是fpn还要返回一个根据图片尺寸得到的layer_index
vis =False
if vis:
ind = np.where(labels!=0)[0]
im_shape = im.shape
means = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (21, 1)).ravel()
stds = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (21, 1)).ravel()
bbox_targets = bbox_targets*stds +means
pred_boxes = bbox_transform_inv(rois[:,1:], bbox_targets)
pred_boxes = clip_boxes(pred_boxes, im_shape[-2:])
l =labels[ind]
ro = rois[ind,1:]
b = bbox_targets[ind,:]
p = pred_boxes[ind,:]*bbox_weights[ind,:]
r = []
for i in range(p.shape[0]):
r.append(p[i,l[i]*4:l[i]*4+4])
r_ = np.vstack(r)
# Optionally normalize targets by a precomputed mean and stdev
vis_all_detection(im, aaa[:,1:], l, 1)
rois_ = np.zeros((self._batch_rois*4, 5), dtype=rois.dtype)
labels_all = np.ones((self._batch_rois*4, ), dtype=labels.dtype)*-1
bbox_targets_all = np.zeros((self._batch_rois*4, self._num_classes * 4), dtype=bbox_targets.dtype)
bbox_weights_all = np.zeros((self._batch_rois*4, self._num_classes * 4), dtype=bbox_weights.dtype)
rois_all =[]
for i in range(4): 对应2,3,4,5层
index = (layer_indexs == (i + 2))
num_index = sum(index) 计数?
start = self._batch_rois*i 每层预留_batch_rois(256)个位置
end = start+num_index
index_range = range(start, end)
rois_[index_range, :] = rois[index, :] 填入对应层的proposal(从0开始顺序填入,之间不留空)
rois_all.append(rois_[range(start,start + self._batch_rois), :])
labels_all[index_range] = labels[index] 填入对应层的label
bbox_targets_all[index_range,:] = bbox_targets[index, :] 填入对应层的target
bbox_weights_all[index_range,:] = bbox_weights[index, :] 填入对应层的weight
不同scale的proposal输出到不同的roi-pooling层,因为尺寸不一样所以要分别做pooling;然后将每层结果concat到一起,与label,target,weight求loss
rois_p2 = rois_all[0]rois_p3 = rois_all[1]
rois_p4 = rois_all[2]
rois_p5 = rois_all[3]
label, target, weight是所有层放一起
top[0].reshape(*rois_p2.shape)
top[0].data[...] = rois_p2
top[1].reshape(*rois_p3.shape)
top[1].data[...] = rois_p3
top[2].reshape(*rois_p4.shape)
top[2].data[...] = rois_p4
top[3].reshape(*rois_p5.shape)
top[3].data[...] = rois_p5
# classification labels
top[4].reshape(*labels_all.shape)
top[4].data[...] = labels_all
# bbox_targets
top[5].reshape(*bbox_targets_all.shape)
top[5].data[...] = bbox_targets_all
# bbox_inside_weights
top[6].reshape(*bbox_weights_all.shape)
top[6].data[...] = bbox_weights_all
# bbox_outside_weights
top[7].reshape(*bbox_weights_all.shape)
top[7].data[...] = np.array(bbox_weights_all > 0).astype(np.float32)
def backward(self, top, propagate_down, bottom):
"""This layer does not propagate gradients."""
pass
def reshape(self, bottom, top):
"""Reshaping happens during the call to forward."""
pass
def _get_bbox_regression_labels(bbox_target_data, num_classes):
"""Bounding-box regression targets (bbox_target_data) are stored in a
compact form N x (class, tx, ty, tw, th)
This function expands those targets into the 4-of-4*K representation used
by the network (i.e. only one class has non-zero targets).
Returns:
bbox_target (ndarray): N x 4K blob of regression targets
bbox_inside_weights (ndarray): N x 4K blob of loss weights
"""
clss = bbox_target_data[:, 0]
bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
inds = np.where(clss > 0)[0]
for ind in inds:
cls = clss[ind]
start = 4 * cls
end = start + 4
bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 默认(1.0, 1.0, 1.0, 1.0)
return bbox_targets, bbox_inside_weights
def _compute_targets(ex_rois, gt_rois, labels):
"""Compute bounding-box regression targets for an image."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
取gt和proposal的偏移
targets = bbox_transform(ex_rois, gt_rois)
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
# Optionally normalize targets by a precomputed mean and stdev
targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
/ np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
return np.hstack(
(labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes,sample_type='fpn', k0 = 4):
"""Generate a random sample of RoIs comprising foreground and background
examples.
"""
# overlaps: (rois x gt_boxes)
overlaps = bbox_overlaps(
np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
每个proposal最大iou对应的gt的index
gt_assignment = overlaps.argmax(axis=1)max_overlaps = overlaps.max(axis=1)
每个proposal最大iou对应的gt的分类
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 取前景
# Guard against the case when an image has fewer than fg_rois_per_image
# foreground RoIs
fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) 筛前景
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 取背景
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# Compute number of background RoIs to take from this image (guarding
# against there being fewer than desired)
bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
# Sample background regions without replacement
if bg_inds.size > 0:
bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
只保留前景+背景
# The indices that we're selecting (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays:
labels = labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
rois = all_rois[keep_inds]
"""Compute bounding-box regression targets for an image."""计算保留下来proposal与他们各自对应的gt的偏移,结果的第一列是label 后4列是偏移
bbox_target_data = _compute_targets(
rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
通过上面的bbox_target_data生成新格式的矩阵 列扩展为包含所有类别,但只有每个proposal各自对应的gt有值
bbox_targets, bbox_inside_weights = \
_get_bbox_regression_labels(bbox_target_data, num_classes)
if sample_type == 'fpn':
#print 0
w = (rois[:,3]-rois[:,1])
h = (rois[:,4]-rois[:,2])
s = w * h
s[s<=0]=1e-6
layer_index = np.floor(k0+np.log2(np.sqrt(s)/224))
layer_index[layer_index<2]=2
layer_index[layer_index>5]=5
#print 1
return rois, labels, bbox_targets, bbox_inside_weights, layer_index #rois:[512,5] labels:[512,]
else:
return rois, labels, bbox_targets, bbox_inside_weights