Tensorflow object detection API: Custom VGG 16 model
I am creating a custom VGG model as the feature extractor for the Faster RCNN model in the Tensorflow Object Detection API. As described in the documentation https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/defining_your_own_model.md, the feature extractor code consists of extract_proposal_features
and extract_classifier_features.
I am using TF slim code to create the convolutional layers (since that is what the Tensorflow team uses). For reference, here is the model structure of VGG 16 as returned by TF slim:
([('vgg_16/conv1/conv1_1',
<tf.Tensor 'vgg_16/vgg_16/conv1/conv1_1/Relu:0' shape=(?, 224, 224, 64) dtype=float32>),
('vgg_16/conv1/conv1_2',
<tf.Tensor 'vgg_16/vgg_16/conv1/conv1_2/Relu:0' shape=(?, 224, 224, 64) dtype=float32>),
('vgg_16/vgg_16/pool1',
<tf.Tensor 'vgg_16/vgg_16/pool1/MaxPool:0' shape=(?, 112, 112, 64) dtype=float32>),
('vgg_16/conv2/conv2_1',
<tf.Tensor 'vgg_16/vgg_16/conv2/conv2_1/Relu:0' shape=(?, 112, 112, 128) dtype=float32>),
('vgg_16/conv2/conv2_2',
<tf.Tensor 'vgg_16/vgg_16/conv2/conv2_2/Relu:0' shape=(?, 112, 112, 128) dtype=float32>),
('vgg_16/vgg_16/pool2',
<tf.Tensor 'vgg_16/vgg_16/pool2/MaxPool:0' shape=(?, 56, 56, 128) dtype=float32>),
('vgg_16/conv3/conv3_1',
<tf.Tensor 'vgg_16/vgg_16/conv3/conv3_1/Relu:0' shape=(?, 56, 56, 256) dtype=float32>),
('vgg_16/conv3/conv3_2',
<tf.Tensor 'vgg_16/vgg_16/conv3/conv3_2/Relu:0' shape=(?, 56, 56, 256) dtype=float32>),
('vgg_16/conv3/conv3_3',
<tf.Tensor 'vgg_16/vgg_16/conv3/conv3_3/Relu:0' shape=(?, 56, 56, 256) dtype=float32>),
('vgg_16/vgg_16/pool3',
<tf.Tensor 'vgg_16/vgg_16/pool3/MaxPool:0' shape=(?, 28, 28, 256) dtype=float32>),
('vgg_16/conv4/conv4_1',
<tf.Tensor 'vgg_16/vgg_16/conv4/conv4_1/Relu:0' shape=(?, 28, 28, 512) dtype=float32>),
('vgg_16/conv4/conv4_2',
<tf.Tensor 'vgg_16/vgg_16/conv4/conv4_2/Relu:0' shape=(?, 28, 28, 512) dtype=float32>),
('vgg_16/conv4/conv4_3',
<tf.Tensor 'vgg_16/vgg_16/conv4/conv4_3/Relu:0' shape=(?, 28, 28, 512) dtype=float32>),
('vgg_16/vgg_16/pool4',
<tf.Tensor 'vgg_16/vgg_16/pool4/MaxPool:0' shape=(?, 14, 14, 512) dtype=float32>),
('vgg_16/conv5/conv5_1',
<tf.Tensor 'vgg_16/vgg_16/conv5/conv5_1/Relu:0' shape=(?, 14, 14, 512) dtype=float32>),
('vgg_16/conv5/conv5_2',
<tf.Tensor 'vgg_16/vgg_16/conv5/conv5_2/Relu:0' shape=(?, 14, 14, 512) dtype=float32>),
('vgg_16/conv5/conv5_3',
<tf.Tensor 'vgg_16/vgg_16/conv5/conv5_3/Relu:0' shape=(?, 14, 14, 512) dtype=float32>),
('vgg_16/vgg_16/pool5',
<tf.Tensor 'vgg_16/vgg_16/pool5/MaxPool:0' shape=(?, 7, 7, 512) dtype=float32>),
('vgg_16/fc6',
<tf.Tensor 'vgg_16/vgg_16/fc6/Relu:0' shape=(?, 1, 1, 4096) dtype=float32>),
('vgg_16/fc7',
<tf.Tensor 'vgg_16/vgg_16/fc7/Relu:0' shape=(?, 1, 1, 4096) dtype=float32>)])
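(For reference, a minimal sketch of how a listing like this can be printed; it assumes TF 1.x and the research/slim nets package on PYTHONPATH:)

import tensorflow as tf
from nets import vgg

slim = tf.contrib.slim

# Build the stock slim VGG 16 without the logits layer and dump its end points.
inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
with slim.arg_scope(vgg.vgg_arg_scope()):
    _, end_points = vgg.vgg_16(inputs, num_classes=None)

for name, tensor in end_points.items():
    print(name, tensor)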
My question is: which convolutional layers need to be included and returned by the extract_proposal_features
method, and which by the extract_classifier_features
method? Please let me know. (For context, in the original Faster R-CNN paper the RPN slides over the last conv feature map, conv5_3, and fc6/fc7 form the per-proposal classifier head.)
I changed the vgg slim code to expose the right tensor.
import tensorflow as tf

slim = tf.contrib.slim  # TF 1.x; with TF 2.x installed, `import tf_slim as slim` works instead


def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16',
           fc_conv_padding='VALID',
           global_pool=False):
  """Oxford Net VGG 16-Layers version D Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
  To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer is
      omitted and the input features to the logits layer are returned instead.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output.
      Otherwise, the output prediction map will be (input / 32) - 6 in case of
      'VALID' padding.
    global_pool: Optional boolean flag. If True, the input to the classification
      layer is avgpooled to size 1x1, for any input size. (This is not part
      of the original VGG architecture.)

  Returns:
    net: the output of the logits layer (if num_classes is a non-zero integer),
      or the input to the logits layer (if num_classes is 0 or None).
    end_points: a dict of tensors with intermediate activations.
  """
  with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
      net = slim.max_pool2d(net, [2, 2], scope='pool3')
      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
      net = slim.max_pool2d(net, [2, 2], scope='pool4')
      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')
      # Convert end_points_collection into an end_point dict and expose the
      # pool5 output as 'head' for the first-stage feature extractor.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      end_points['head'] = net
      # Use conv2d instead of fully_connected layers.
      net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      if global_pool:
        net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
        end_points['global_pool'] = net
      if num_classes:
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                           scope='dropout7')
        net = slim.conv2d(net, num_classes, [1, 1],
                          activation_fn=None,
                          normalizer_fn=None,
                          scope='fc8')
        if spatial_squeeze and num_classes is not None:
          net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
end_points['head'] = net is the tensor used for extract_proposal_features; the two extractor methods follow after a quick check.
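A quick sanity check of that tensor (a sketch; it assumes the modified vgg_16 above is in scope):

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
_, end_points = vgg_16(inputs, num_classes=None)
# 'head' is the pool5 output, so for a 224x224 input this should print a
# tensor of shape (?, 7, 7, 512).
print(end_points['head'])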
def _extract_proposal_features(self, preprocessed_inputs, scope):
  """Extracts first stage RPN features.

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float32 tensor
      representing a batch of images.
    scope: A scope name.

  Returns:
    rpn_feature_map: A tensor with shape [batch, height, width, depth]

  Raises:
    InvalidArgumentError: If the spatial size of `preprocessed_inputs`
      (height or width) is less than 33.
    ValueError: If the created network is missing the required activation.
  """
  preprocessed_inputs.get_shape().assert_has_rank(4)
  shape_assert = tf.Assert(
      tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                     tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
      ['image size must at least be 33 in both height and width.'])
  with tf.control_dependencies([shape_assert]):
    with tf.variable_scope('vgg_16', 'vgg_16', reuse=self._reuse_weights):
      _, activations = vgg.vgg_16(
          preprocessed_inputs,
          scope=scope)
  return activations['head']
and
def _extract_box_classifier_features(self, proposal_feature_maps, scope):
  """Extracts second stage box classifier features.

  Args:
    proposal_feature_maps: A 4-D float tensor with shape
      [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
      representing the feature map cropped to each proposal.
    scope: A scope name (unused).

  Returns:
    proposal_classifier_features: A 4-D float tensor with shape
      [batch_size * self.max_num_proposals, height, width, depth]
      representing box classifier features for each proposal.
  """
  net = proposal_feature_maps
  with tf.variable_scope('vgg_16', reuse=self._reuse_weights):
    with slim.arg_scope(
        [slim.conv2d],
        stride=1,
        padding='VALID'):
      # Use conv2d instead of fully_connected layers.
      fc6 = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
      if self._is_training:
        fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True,
                           scope='dropout6')
      fc7 = slim.conv2d(fc6, 4096, [1, 1], scope='fc7')
      if self._is_training:
        fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True,
                           scope='dropout7')
  proposal_classifier_features = fc7
  return proposal_classifier_features
I like this, but I am not sure whether the approach is right :)
Here is my test code.
import numpy as np
import tensorflow as tf

from models import faster_rcnn_vgg_16_feature_extractor as faster_rcnn_vgg_16


class FasterRcnnVgg16FeatureExtractorTest(tf.test.TestCase):

  def _build_feature_extractor(self, first_stage_features_stride):
    return faster_rcnn_vgg_16.FasterRCNNVgg16FeatureExtractor(
        is_training=False,
        first_stage_features_stride=first_stage_features_stride,
        weight_decay=0.0005)

  def test_extract_proposal_features_returns_expected_size(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [4, 224, 224, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)
    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [4, 7, 7, 512])

  def test_extract_proposal_features_stride_eight(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=8)
    preprocessed_inputs = tf.random_uniform(
        [4, 224, 224, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)
    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [4, 7, 7, 512])

  def test_extract_proposal_features_half_size_input(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [1, 112, 112, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)
    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [1, 4, 4, 512])

  def test_extract_proposal_features_dies_on_invalid_stride(self):
    with self.assertRaises(ValueError):
      self._build_feature_extractor(first_stage_features_stride=99)

  def test_extract_proposal_features_dies_on_very_small_images(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)
    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      with self.assertRaises(tf.errors.InvalidArgumentError):
        sess.run(
            features_shape,
            feed_dict={preprocessed_inputs: np.random.rand(4, 32, 32, 3)})

  def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [224, 224, 3], maxval=255, dtype=tf.float32)
    with self.assertRaises(ValueError):
      feature_extractor.extract_proposal_features(
          preprocessed_inputs, scope='TestScope')

  def test_extract_box_classifier_features_returns_expected_size(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    proposal_feature_maps = tf.random_uniform(
        [3, 7, 7, 512], maxval=255, dtype=tf.float32)
    proposal_classifier_features = (
        feature_extractor.extract_box_classifier_features(
            proposal_feature_maps, scope='TestScope'))
    features_shape = tf.shape(proposal_classifier_features)
    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [3, 1, 1, 4096])


if __name__ == '__main__':
  tf.test.main()
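For completeness, the new extractor still has to be registered before a pipeline config can select it. A sketch, assuming the TF1 object_detection layout where builders/model_builder.py keeps a FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP (the key name 'faster_rcnn_vgg_16' is my own choice):

# In object_detection/builders/model_builder.py, extend the class map:
from models import faster_rcnn_vgg_16_feature_extractor as frcnn_vgg_16

FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
    # ... existing entries kept as-is ...
    'faster_rcnn_vgg_16': frcnn_vgg_16.FasterRCNNVgg16FeatureExtractor,
}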
Here is an additional answer for anyone looking for an SSD architecture. Tensorflow Object Detection API 1 already includes the vgg
architecture in the slim
folder, so we can import it directly with from nets import vgg.
I have only tried the SSD architecture so far. Following the ssd-mobilenet configuration, I built the model from the two layers 'fc7' and 'conv4_3', as described in the paper. Then save the new SSD-VGG16_feature_extractor.py
in the models
folder.
Note: to match the vgg
in the paper correctly, you should change 4096 to 1024 and the kernel size [7, 7]
to [3, 3]
to get the right feature depth: https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py#L206-L209
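Concretely, a sketch of that change inside slim's vgg_16 at the linked lines (only the depth and the fc6 kernel size are changed; everything else stays as in the stock code):

# fc6/fc7 with depth 4096 -> 1024 and fc6 kernel [7, 7] -> [3, 3], so that
# 'vgg_16/fc7' has the depth the SSD-style head expects from its VGG base.
net = slim.conv2d(net, 1024, [3, 3], padding=fc_conv_padding, scope='fc6')
net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                   scope='dropout6')
net = slim.conv2d(net, 1024, [1, 1], scope='fc7')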
import tensorflow.compat.v1 as tf
import tf_slim as slim

from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets import vgg


class SSDVgg16FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  """SSD Feature Extractor using Vgg16 features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams_fn,
               reuse_weights=None,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False):
    """Vgg16 Feature Extractor for SSD Models.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
        and separable_conv2d ops in the layers that are added on top of the
        base feature extractor.
      reuse_weights: Whether to reuse variables. Default is None.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams_fn`.

    Raises:
      ValueError: If `override_base_feature_extractor_hyperparams` is False.
    """
    super(SSDVgg16FeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams_fn=conv_hyperparams_fn,
        reuse_weights=reuse_weights,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams)
    if not self._override_base_feature_extractor_hyperparams:
      raise ValueError('SSD Vgg16 feature extractor always uses '
                       'scope returned by `conv_hyperparams_fn` for both the '
                       'base feature extractor and the additional layers '
                       'added since there is no arg_scope defined for the base '
                       'feature extractor.')

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    return (2.0 / 255.0) * resized_inputs - 1.0

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    feature_map_layout = {
        'from_layer': ['FeatureExtractor/vgg_16/conv4/conv4_3',
                       'FeatureExtractor/vgg_16/fc7', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with slim.arg_scope(vgg.vgg_arg_scope()):
        _, image_features = vgg.vgg_16(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            num_classes=0)
    print("Available output heads:")
    print([k for k, v in image_features.items()])
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
    return list(feature_maps.values())
Then you just need to add 'ssd_vgg16': SSDVgg16FeatureExtractor
to the SSD_FEATURE_EXTRACTOR_CLASS_MAP
dict in builders/model_builder.py
and the model is complete; a sketch of that edit follows.
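A sketch of the registration step (assuming the TF1 object_detection layout, and that the file above is saved under an importable name such as models/ssd_vgg16_feature_extractor.py, since Python module names cannot contain '-'):

# In object_detection/builders/model_builder.py, extend the class map:
from object_detection.models import ssd_vgg16_feature_extractor

SSD_FEATURE_EXTRACTOR_CLASS_MAP = {
    # ... existing entries kept as-is ...
    'ssd_vgg16': ssd_vgg16_feature_extractor.SSDVgg16FeatureExtractor,
}

In the pipeline config the extractor is then selected with feature_extractor { type: 'ssd_vgg16' }; note that override_base_feature_extractor_hyperparams: true is required, since the constructor above raises otherwise.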
I have tested it and it works well:
INFO:tensorflow:global_step/sec: 0.195851
I1223 18:19:21.963316 139974845604416 basic_session_run_hooks.py:692] global_step/sec: 0.195851
INFO:tensorflow:loss = 3.674446, step = 700 (510.592 sec)
I1223 18:19:21.964789 139974845604416 basic_session_run_hooks.py:260] loss = 3.674446, step = 700 (510.592 sec)