Encoded and decoded versions of bounding box regression offsets are different

I am trying to replicate the bounding box regression technique used in faster-rcnn given here. I wrote a decode function and an encode function. Ideally, when a bounding box is passed through the encoder and then decoded, I should get back the same bounding box.

These are my input bounding boxes:

import numpy as np
import tensorflow as tf
from itertools import product
from math import sqrt

def make_anchors(img_size, conv_h, conv_w, scale, aspect_ratios):
    prior_data = []
    # Iteration order is important (it has to sync up with the convout)
    for j, i in product(range(conv_h), range(conv_w)):
        # + 0.5 because priors are in center
        x = (i + 0.5) / conv_w
        y = (j + 0.5) / conv_h

        for ar in aspect_ratios:
            ar = sqrt(ar)
            w = scale * ar / img_size
            h = scale / ar / img_size
            
            prior_data += [x, y, w, h]

    return prior_data

test_bbox = tf.convert_to_tensor((np.array([[204.044, 253.8351, 487.8226, 427.06363],
                                            [0, 140.01741, 550, 290.21936],
                                            [40.005028, 117.37102, 255.7913, 205.13097],
                                            [263.31314, 67.0434, 514.04736, 124.48139],
                                            [0, 503.79834, 487.0279, 550]])), dtype=tf.float32)

test_labels = tf.convert_to_tensor((np.array([[1],
                                              [2],
                                              [3],
                                              [4],
                                              [5]])), dtype=tf.float32)

feature_map_size=[[69,69], [35,35], [18,18], [9,9], [5,5]]
aspect_ratios=[1, 0.5, 2]
scales=[24, 48, 96, 192, 384]
anchors = []

for i, shape in enumerate(feature_map_size):
    anchors += make_anchors(550, shape[0], shape[1], scales[i], aspect_ratios)
    
anchors = tf.reshape(tf.convert_to_tensor(anchors), [-1, 4])

I am using a 550x550 image as input and computing the feature map sizes accordingly.
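
For reference (not part of the original code), these feature map sizes match rounding up 550 divided by the usual backbone/FPN strides of 8, 16, 32, 64 and 128, assuming that is how they were derived:

strides = [8, 16, 32, 64, 128]  # assumed strides for a 550x550 input
print([[-(-550 // s)] * 2 for s in strides])  # ceiling division
# [[69, 69], [35, 35], [18, 18], [9, 9], [5, 5]]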

The encoding is done as follows:

def encode(map_loc, center_anchors, include_variances=False):
    # center_gt = tf.map_fn(lambda x: map_to_center_form(x), map_loc)
    # map_loc is in point form [ymin, xmin, ymax, xmax]; convert it to center form [cx, cy, w, h]
    h = map_loc[:, 2] - map_loc[:, 0]
    w = map_loc[:, 3] - map_loc[:, 1]
    center_gt = tf.cast(tf.stack([map_loc[:, 1] + (w / 2), map_loc[:, 0] + (h / 2), w, h], axis=-1), tf.float32)
    variances = [0.1, 0.2]

    # calculate offset
    if include_variances:
        g_hat_cx = (center_gt[:, 0] - center_anchors[:, 0]) / center_anchors[:, 2] / variances[0]
        g_hat_cy = (center_gt[:, 1] - center_anchors[:, 1]) / center_anchors[:, 3] / variances[0]
    else:
        g_hat_cx = (center_gt[:, 0] - center_anchors[:, 0]) / center_anchors[:, 2]
        g_hat_cy = (center_gt[:, 1] - center_anchors[:, 1]) / center_anchors[:, 3]
    tf.debugging.assert_non_negative(center_anchors[:, 2] / center_gt[:, 2])
    tf.debugging.assert_non_negative(center_anchors[:, 3] / center_gt[:, 3])
    if include_variances:
        g_hat_w = tf.math.log(center_gt[:, 2] / center_anchors[:, 2]) / variances[1]
        g_hat_h = tf.math.log(center_gt[:, 3] / center_anchors[:, 3]) / variances[1]
    else:
        g_hat_w = tf.math.log(center_gt[:, 2] / center_anchors[:, 2])
        g_hat_h = tf.math.log(center_gt[:, 3] / center_anchors[:, 3])
    offsets = tf.stack([g_hat_cx, g_hat_cy, g_hat_w, g_hat_h], axis=-1)
    return offsets

def area(boxlist, scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/box_list_ops.py#L48
    """Computes area of boxes.
    Args:
    boxlist: BoxList holding N boxes
    scope: name scope.
    Returns:
    a tensor with shape [N] representing box areas.
    """
    y_min, x_min, y_max, x_max = tf.split(
        value=boxlist, num_or_size_splits=4, axis=1)
    return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])

def intersection(boxlist1, boxlist2, scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/box_list_ops.py#L209
    """Compute pairwise intersection areas between boxes.
    Args:
    boxlist1: BoxList holding N boxes
    boxlist2: BoxList holding M boxes
    scope: name scope.
    Returns:
    a tensor with shape [N, M] representing pairwise intersections
    """
    y_min1, x_min1, y_max1, x_max1 = tf.split(
        value=boxlist1, num_or_size_splits=4, axis=1)
    y_min2, x_min2, y_max2, x_max2 = tf.split(
        value=boxlist2, num_or_size_splits=4, axis=1)
    all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
    all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
    intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
    all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
    all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
    intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
    return intersect_heights * intersect_widths

def iou(boxlist1, boxlist2, scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/box_list_ops.py#L259
    """Computes pairwise intersection-over-union between box collections.
    Args:
    boxlist1: BoxList holding N boxes
    boxlist2: BoxList holding M boxes
    scope: name scope.
    Returns:
    a tensor with shape [N, M] representing pairwise iou scores.
    """
    intersections = intersection(boxlist1, boxlist2)
    areas1 = area(boxlist1)
    areas2 = area(boxlist2)
    unions = (
        tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
    return tf.where(
        tf.equal(intersections, 0.0),
        tf.zeros_like(intersections), tf.truediv(intersections, unions))

def matching(pos_thresh, neg_thresh, gt_bbox, gt_labels, priors):
    pairwise_iou = iou(priors, gt_bbox) # size: [num_priors, num_objects]; anchors along the rows and ground truth along the columns

    each_prior_max = tf.reduce_max(pairwise_iou, axis=-1) # size [num_priors]; best IoU of each anchor with any ground truth box
    each_prior_index = tf.math.argmax(pairwise_iou, axis=-1) # size [num_priors]; index of the ground truth box having max IoU with each anchor

    each_box_max = tf.reduce_max(pairwise_iou, axis=0)
    each_box_index = tf.math.argmax(pairwise_iou, axis=0)

    # For the max IoU prior for each gt box, set its IoU to 2. This ensures that it won't be filtered
    # in the threshold step even if the IoU is under the negative threshold. This is because that we want
    # at least one prior to match with each gt box or else we'd be wasting training data.

    indices = tf.expand_dims(each_box_index,axis=-1)

    updates = tf.cast(tf.tile(tf.constant([2]), each_box_index.shape), dtype=tf.float32)
    each_prior_max = tf.tensor_scatter_nd_update(each_prior_max, indices, updates)

    # Set the index of the pair (prior, gt) we set the overlap for above.
    updates = tf.cast(tf.range(0, each_box_index.shape[0]), dtype=tf.int64)
    each_prior_index = tf.tensor_scatter_nd_update(each_prior_index, indices, updates)

    each_prior_box = tf.gather(gt_bbox, each_prior_index) # size: [num_priors, 4]
    conf = tf.squeeze(tf.gather(gt_labels, each_prior_index) + 1) # the class of the max IoU gt box for each prior, size: [num_priors]


    neutral_label_index = tf.where(each_prior_max < pos_thresh)
    background_label_index = tf.where(each_prior_max < neg_thresh)

    conf = tf.tensor_scatter_nd_update(conf, neutral_label_index, -1*tf.ones(tf.size(neutral_label_index)))
    conf = tf.tensor_scatter_nd_update(conf, background_label_index, tf.zeros(tf.size(background_label_index)))

    offsets = encode(each_prior_box, priors)

    return offsets, conf, each_prior_box, each_prior_index


offsets, conf, each_prior_box, each_prior_index = \
    matching(0.5, 0.5, test_bbox/550, test_labels, anchors)
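
As a quick sanity check on encode by itself (a minimal sketch with made-up values: an anchor in center form and the identical box in point form), encoding a ground-truth box that coincides exactly with an anchor should produce zero offsets:

anchor_c = tf.constant([[0.5, 0.5, 0.2, 0.1]], dtype=tf.float32)   # [cx, cy, w, h]
gt_point = tf.constant([[0.45, 0.4, 0.55, 0.6]], dtype=tf.float32) # same box as [ymin, xmin, ymax, xmax]
print(encode(gt_point, anchor_c).numpy())  # expected: [[0. 0. 0. 0.]]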

If I try to decode the offsets obtained after encoding and redraw the boxes, the image I get is as follows:


def _decode(box_p, priors, include_variances=False):
    # https://github.com/feiyuhuahuo/Yolact_minimal/blob/9299a0cf346e455d672fadd796ac748871ba85e4/utils/box_utils.py#L151
    """
    Decode predicted bbox coordinates using the scheme
    employed at https://lilianweng.github.io/lil-log/2017/12/31/object-recognition-for-dummies-part-3.html
        b_x = prior_w*loc_x + prior_x
        b_y = prior_h*loc_y + prior_y
        b_w = prior_w * exp(loc_w)
        b_h = prior_h * exp(loc_h)
    
    Note that loc is input as [c_x, c_y, w, h]
    while priors are input as [c_x, c_y, w, h], where each coordinate
    is relative to the size of the image.
    
    Also note that prior_x and prior_y are center coordinates.
    """
    variances = [0.1, 0.2]
    box_p = tf.cast(box_p, tf.float32)
    priors = tf.cast(priors, tf.float32)
    if include_variances:
        b_x_y = priors[:, :2] + box_p[:, :2] * priors[:, 2:]* variances[0]
        b_w_h = priors[:, 2:] * tf.math.exp(box_p[:, 2:]* variances[1])
    else:
        b_x_y = priors[:, :2] + box_p[:, :2] * priors[:, 2:]
        b_w_h = priors[:, 2:] * tf.math.exp(box_p[:, 2:])
    
    boxes = tf.concat([b_x_y, b_w_h], axis=1)
    
    # [x_min, y_min, x_max, y_max]
    boxes = tf.concat([boxes[:, :2] - boxes[:, 2:] / 2, boxes[:, 2:] + boxes[:, :2]], axis=1)
    
    # [y_min, x_min, y_max, x_max]
    return tf.transpose(tf.stack([boxes[:, 1], boxes[:, 0],boxes[:, 3], boxes[:, 2]]))


import cv2
import matplotlib.pyplot as plt

_idx = tf.where(conf > 0.5)
_test = _decode(offsets, anchors)
_out = tf.squeeze(tf.gather(_test, _idx)).numpy()*550
img_test = 255*np.ones((1000,1000,3), dtype=np.uint8)

for box in _out:
    box = np.round(box).astype(int)
    image = cv2.rectangle(img_test, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 2)
plt.imshow(image)

As you can see, the output exceeds the 550 input image size. Why is this happening?

The problem was in how my decode function computes [x_min, y_min, x_max, y_max]. It should have been:

# [x_min, y_min, x_max, y_max]
boxes = tf.concat([boxes[:, :2] - boxes[:, 2:] / 2, boxes[:, 2:] / 2 + boxes[:, :2]], axis=1)
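
With that single change, a round trip through encode and _decode reproduces the input box. A minimal sketch, reusing the made-up anchor/box pair from the sanity check above (variances disabled in both calls):

anchor_c = tf.constant([[0.5, 0.5, 0.2, 0.1]], dtype=tf.float32)   # center form [cx, cy, w, h]
gt_point = tf.constant([[0.45, 0.4, 0.55, 0.6]], dtype=tf.float32) # point form [ymin, xmin, ymax, xmax]
offsets_check = encode(gt_point, anchor_c)
print(_decode(offsets_check, anchor_c).numpy())
# expected (up to float rounding): [[0.45 0.4  0.55 0.6 ]]
# the original version returned [[0.45 0.4  0.6  0.7 ]] because w and h were added
# in full, instead of halved, when computing x_max and y_max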