Encoded and decoded versions of bounding box regression offsets are different
I am trying to reproduce the bounding box regression technique used in faster-rcnn, described here. I wrote a decode function and an encode function. Ideally, when a bounding box is passed to the encoder and then decoded, I should get back the same bounding box.
Here are my input bounding boxes:
import numpy as np
import tensorflow as tf
from itertools import product  # needed by make_anchors
from math import sqrt          # needed by make_anchors
import cv2                     # used later to draw the decoded boxes
import matplotlib.pyplot as plt

def make_anchors(img_size, conv_h, conv_w, scale, aspect_ratios):
    prior_data = []
    # Iteration order is important (it has to sync up with the convout)
    for j, i in product(range(conv_h), range(conv_w)):
        # + 0.5 because priors are in center
        x = (i + 0.5) / conv_w
        y = (j + 0.5) / conv_h
        for ar in aspect_ratios:
            ar = sqrt(ar)
            w = scale * ar / img_size
            h = scale / ar / img_size
            prior_data += [x, y, w, h]
    return prior_data
test_bbox = tf.convert_to_tensor((np.array([[204.044, 253.8351, 487.8226, 427.06363],
[0, 140.01741, 550, 290.21936],
[40.005028, 117.37102, 255.7913, 205.13097],
[263.31314, 67.0434, 514.04736, 124.48139],
[0, 503.79834, 487.0279, 550]])), dtype=tf.float32)
test_labels = tf.convert_to_tensor((np.array([[1],
[2],
[3],
[4],
[5]])), dtype=tf.float32)
feature_map_size=[[69,69], [35,35], [18,18], [9,9], [5,5]]
aspect_ratios=[1, 0.5, 2]
scales=[24, 48, 96, 192, 384]
anchors = []
for i, shape in enumerate(feature_map_size):
    anchors += make_anchors(550, shape[0], shape[1], scales[i], aspect_ratios)
anchors = tf.reshape(tf.convert_to_tensor(anchors), [-1, 4])
I use a 550x550 image as input and compute the feature map sizes accordingly.
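For reference, these feature-map sizes are consistent with output strides of 8, 16, 32, 64 and 128 on a 550x550 input. The sketch below is my own assumption about how they could be derived (it is not part of the original code); it reproduces the list above and the expected total anchor count:
# Hypothetical derivation: each feature map is ceil(img_size / stride) per side
import math
fm = [[math.ceil(550 / s)] * 2 for s in (8, 16, 32, 64, 128)]
print(fm)  # -> [[69, 69], [35, 35], [18, 18], [9, 9], [5, 5]]
# With 3 aspect ratios per location, the anchors tensor above should have shape
# [3 * (69*69 + 35*35 + 18*18 + 9*9 + 5*5), 4] = [19248, 4]
print(sum(h * w for h, w in fm) * 3)  # -> 19248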
The encoding is done as follows:
def encode(map_loc, center_anchors, include_variances=False):
    # map_loc: ground-truth boxes as [y_min, x_min, y_max, x_max] (normalized);
    # center_anchors: anchors as [c_x, c_y, w, h] (normalized).
    # center_gt = tf.map_fn(lambda x: map_to_center_form(x), map_loc)
    h = map_loc[:, 2] - map_loc[:, 0]
    w = map_loc[:, 3] - map_loc[:, 1]
    center_gt = tf.cast(tf.stack([map_loc[:, 1] + (w / 2), map_loc[:, 0] + (h / 2), w, h], axis=-1), tf.float32)
    variances = [0.1, 0.2]
    # calculate offset
    if include_variances:
        g_hat_cx = (center_gt[:, 0] - center_anchors[:, 0]) / center_anchors[:, 2] / variances[0]
        g_hat_cy = (center_gt[:, 1] - center_anchors[:, 1]) / center_anchors[:, 3] / variances[0]
    else:
        g_hat_cx = (center_gt[:, 0] - center_anchors[:, 0]) / center_anchors[:, 2]
        g_hat_cy = (center_gt[:, 1] - center_anchors[:, 1]) / center_anchors[:, 3]
    tf.debugging.assert_non_negative(center_anchors[:, 2] / center_gt[:, 2])
    tf.debugging.assert_non_negative(center_anchors[:, 3] / center_gt[:, 3])
    if include_variances:
        g_hat_w = tf.math.log(center_gt[:, 2] / center_anchors[:, 2]) / variances[1]
        g_hat_h = tf.math.log(center_gt[:, 3] / center_anchors[:, 3]) / variances[1]
    else:
        g_hat_w = tf.math.log(center_gt[:, 2] / center_anchors[:, 2])
        g_hat_h = tf.math.log(center_gt[:, 3] / center_anchors[:, 3])
    offsets = tf.stack([g_hat_cx, g_hat_cy, g_hat_w, g_hat_h], axis=-1)
    return offsets
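As a quick sanity check of encode on its own (a minimal sketch I added; the box and anchor values are made up), a ground-truth box whose center and size coincide with an anchor should encode to all-zero offsets:
# gt box in normalized [y_min, x_min, y_max, x_max]; anchor in [c_x, c_y, w, h]
gt_box = tf.constant([[0.2, 0.3, 0.6, 0.7]], dtype=tf.float32)  # center (0.5, 0.4), w = h = 0.4
anchor = tf.constant([[0.5, 0.4, 0.4, 0.4]], dtype=tf.float32)  # same center and size
print(encode(gt_box, anchor).numpy())  # expected: [[0. 0. 0. 0.]]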
def area(boxlist, scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/box_list_ops.py#L48
    """Computes area of boxes.
    Args:
      boxlist: BoxList holding N boxes
      scope: name scope.
    Returns:
      a tensor with shape [N] representing box areas.
    """
    y_min, x_min, y_max, x_max = tf.split(
        value=boxlist, num_or_size_splits=4, axis=1)
    return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
def intersection(boxlist1, boxlist2, scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/box_list_ops.py#L209
    """Compute pairwise intersection areas between boxes.
    Args:
      boxlist1: BoxList holding N boxes
      boxlist2: BoxList holding M boxes
      scope: name scope.
    Returns:
      a tensor with shape [N, M] representing pairwise intersections
    """
    y_min1, x_min1, y_max1, x_max1 = tf.split(
        value=boxlist1, num_or_size_splits=4, axis=1)
    y_min2, x_min2, y_max2, x_max2 = tf.split(
        value=boxlist2, num_or_size_splits=4, axis=1)
    all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
    all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
    intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
    all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
    all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
    intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
    return intersect_heights * intersect_widths
def iou(boxlist1, boxlist2, scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/box_list_ops.py#L259
    """Computes pairwise intersection-over-union between box collections.
    Args:
      boxlist1: BoxList holding N boxes
      boxlist2: BoxList holding M boxes
      scope: name scope.
    Returns:
      a tensor with shape [N, M] representing pairwise iou scores.
    """
    intersections = intersection(boxlist1, boxlist2)
    areas1 = area(boxlist1)
    areas2 = area(boxlist2)
    unions = (
        tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
    return tf.where(
        tf.equal(intersections, 0.0),
        tf.zeros_like(intersections), tf.truediv(intersections, unions))
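To convince myself the IoU helpers behave as documented, here is a tiny hand-checked example (my own addition, using the functions above):
# Two boxes in [y_min, x_min, y_max, x_max]; the second is the bottom half of the first
box_a = tf.constant([[0.0, 0.0, 1.0, 1.0]], dtype=tf.float32)
box_b = tf.constant([[0.5, 0.0, 1.0, 1.0]], dtype=tf.float32)
print(iou(box_a, box_b).numpy())  # expected: [[0.5]]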
def matching(pos_thresh, neg_thresh, gt_bbox, gt_labels, priors):
    pairwise_iou = iou(priors, gt_bbox)  # size: [num_priors, num_objects]; priors along the rows, ground truth along the columns
    each_prior_max = tf.reduce_max(pairwise_iou, axis=-1)  # size [num_priors]; best IoU of each prior with any ground-truth box
    each_prior_index = tf.math.argmax(pairwise_iou, axis=-1)  # size [num_priors]; index of the ground-truth box with the highest IoU for each prior
    each_box_max = tf.reduce_max(pairwise_iou, axis=0)
    each_box_index = tf.math.argmax(pairwise_iou, axis=0)
    # For the max IoU prior for each gt box, set its IoU to 2. This ensures that it won't be filtered
    # in the threshold step even if the IoU is under the negative threshold. This is because we want
    # at least one prior to match with each gt box or else we'd be wasting training data.
    indices = tf.expand_dims(each_box_index, axis=-1)
    updates = tf.cast(tf.tile(tf.constant([2]), each_box_index.shape), dtype=tf.float32)
    each_prior_max = tf.tensor_scatter_nd_update(each_prior_max, indices, updates)
    # Set the index of the pair (prior, gt) we set the overlap for above.
    updates = tf.cast(tf.range(0, each_box_index.shape[0]), dtype=tf.int64)
    each_prior_index = tf.tensor_scatter_nd_update(each_prior_index, indices, updates)
    each_prior_box = tf.gather(gt_bbox, each_prior_index)  # size: [num_priors, 4]
    conf = tf.squeeze(tf.gather(gt_labels, each_prior_index) + 1)  # the class of the max IoU gt box for each prior, size: [num_priors]
    neutral_label_index = tf.where(each_prior_max < pos_thresh)
    background_label_index = tf.where(each_prior_max < neg_thresh)
    conf = tf.tensor_scatter_nd_update(conf, neutral_label_index, -1 * tf.ones(tf.size(neutral_label_index)))
    conf = tf.tensor_scatter_nd_update(conf, background_label_index, tf.zeros(tf.size(background_label_index)))
    offsets = encode(each_prior_box, priors)
    return offsets, conf, each_prior_box, each_prior_index
offsets, conf, each_prior_box, each_prior_index = \
matching(0.5, 0.5, test_bbox/550, test_labels, anchors)
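As a shape check (my own addition), every output of matching should have one entry per prior; with the anchor configuration above that is 3 * (69*69 + 35*35 + 18*18 + 9*9 + 5*5) = 19248 priors:
print(offsets.shape, conf.shape, each_prior_box.shape)
# -> (19248, 4) (19248,) (19248, 4)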
If I try to redraw the boxes from the offsets obtained after encoding, this is the image I get:
def _decode(box_p, priors, include_variances=False):
    # https://github.com/feiyuhuahuo/Yolact_minimal/blob/9299a0cf346e455d672fadd796ac748871ba85e4/utils/box_utils.py#L151
    """
    Decode predicted bbox coordinates using the scheme
    employed at https://lilianweng.github.io/lil-log/2017/12/31/object-recognition-for-dummies-part-3.html
        b_x = prior_w*loc_x + prior_x
        b_y = prior_h*loc_y + prior_y
        b_w = prior_w * exp(loc_w)
        b_h = prior_h * exp(loc_h)
    Note that loc is input as [c_x, c_y, w, h]
    while priors are input as [c_x, c_y, w, h], where each coordinate
    is relative to the size of the image.
    Also note that prior_x and prior_y are center coordinates.
    """
    variances = [0.1, 0.2]
    box_p = tf.cast(box_p, tf.float32)
    priors = tf.cast(priors, tf.float32)
    if include_variances:
        b_x_y = priors[:, :2] + box_p[:, :2] * priors[:, 2:] * variances[0]
        b_w_h = priors[:, 2:] * tf.math.exp(box_p[:, 2:] * variances[1])
    else:
        b_x_y = priors[:, :2] + box_p[:, :2] * priors[:, 2:]
        b_w_h = priors[:, 2:] * tf.math.exp(box_p[:, 2:])
    boxes = tf.concat([b_x_y, b_w_h], axis=1)
    # [x_min, y_min, x_max, y_max]
    boxes = tf.concat([boxes[:, :2] - boxes[:, 2:] / 2, boxes[:, 2:] + boxes[:, :2]], axis=1)
    # [y_min, x_min, y_max, x_max]
    return tf.transpose(tf.stack([boxes[:, 1], boxes[:, 0], boxes[:, 3], boxes[:, 2]]))
_idx = tf.where(conf > 0.5)
_test = _decode(offsets, anchors)
_out = tf.squeeze(tf.gather(_test, _idx)).numpy() * 550
img_test = 255 * np.ones((1000, 1000, 3), dtype=np.uint8)
for box in _out:
    box = np.round(box).astype(int)
    image = cv2.rectangle(img_test, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 2)
plt.imshow(image)
As you can see, the output exceeds the 550x550 input image size. Why is this happening?

The problem was in how my decode function computes [x_min, y_min, x_max, y_max]. It should be:
# [x_min, y_min, x_max, y_max]
boxes = tf.concat([boxes[:, :2] - boxes[:, 2:] / 2, boxes[:, 2:] / 2 + boxes[:, :2]], axis=1)
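With that change (dividing the width/height term by 2 before adding it to the center), the encode/decode round trip closes. A minimal verification sketch, assuming _decode has been patched with the corrected line and reusing the matched boxes and anchors from above:
# Re-encode the matched gt boxes and decode them against the same anchors;
# the result should match the matched boxes up to float error.
decoded = _decode(encode(each_prior_box, anchors), anchors)
print(np.allclose(decoded.numpy(), each_prior_box.numpy(), atol=1e-5))  # expected: True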