Retrain InceptionV4's Final Layer for New Categories: local variable not initialized
I'm still new to TensorFlow, so I apologize if this is a naive question. I'm trying to use the inception_v4 model pretrained on the ImageNet dataset, published on this site, and I'm using their network as-is, i.e. the version published on their site.
Here is how I call the network:
def network(images_op, keep_prob):
    width_needed_InceptionV4Net = 342
    shape = images_op.get_shape().as_list()
    H = int(round(width_needed_InceptionV4Net * shape[1] / shape[2], 2))
    resized_images = tf.image.resize_images(images_op, [width_needed_InceptionV4Net, H], tf.image.ResizeMethod.BILINEAR)
    with slim.arg_scope(inception.inception_v4_arg_scope()):
        logits, _ = inception.inception_v4(resized_images, num_classes=20, is_training=True, dropout_keep_prob = keep_prob)
    return logits
Since I need to retrain the last layer of Inception_V4 for my own categories, I changed the number of classes to 20, as you can see in the call to inception.inception_v4.
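(For reference, TF-slim also ships helpers that do essentially this exclusion. Below is a minimal, hedged sketch; it assumes the graph from network() has already been built and that the ImageNet checkpoint sits next to the script as inception_v4.ckpt, both of which are assumptions, while the two scope names are the ones I exclude later on.)

# Hypothetical alternative using TF-slim helpers; the checkpoint path is an assumption.
variables_to_restore = slim.get_variables_to_restore(
    exclude=['InceptionV4/Logits', 'InceptionV4/AuxLogits'])
init_fn = slim.assign_from_checkpoint_fn('inception_v4.ckpt', variables_to_restore)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initializes the new 20-class logits layer
    init_fn(sess)  # then overwrites every other variable with the pretrained weights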
Here is the training code I am currently using:
def optimistic_restore(session, save_file, flags):
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()), tf.global_variables()))
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                print(saved_var_name)
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
def train(images, ids, labels, total_num_examples, batch_size, train_dir, network, flags,
          optimizer, log_periods, resume):
    """!@brief Trains the network for a number of steps.

    @param images image tensor
    @param ids id tensor
    @param labels label tensor
    @param total_num_examples total number of training examples
    @param batch_size batch size
    @param train_dir directory where checkpoints should be saved
    @param network pointer to a function describing the network
    @param flags command-line arguments
    @param optimizer pointer to the optimization class
    @param log_periods list containing the step intervals at which 1) logs should be printed,
        2) logs should be saved for TensorBoard and 3) variables should be saved
    @param resume should training be resumed (or restarted from scratch)?
    @return the number of training steps performed since the first call to 'train'
    """
    # clearing the training directory
    if not resume:
        if tf.gfile.Exists(train_dir):
            tf.gfile.DeleteRecursively(train_dir)
        tf.gfile.MakeDirs(train_dir)
    print('Training the network in directory %s...' % train_dir)
    global_step = tf.Variable(0, trainable = False)
    # creating a placeholder, set to ones, used to assess the importance of each pixel
    mask, ones = _mask(images, batch_size, flags)
    # building a Graph that computes the logits predictions from the inference model
    keep_prob = tf.placeholder_with_default(0.5, [])
    logits = network(images * mask, keep_prob)
    # creating the optimizer
    if optimizer == tf.train.MomentumOptimizer:
        opt = optimizer(flags.learning_rate, flags.momentum)
    else:
        opt = optimizer(flags.learning_rate)
    # calculating the semantic loss, defined as the classification or regression loss
    if flags.boosting_weights is not None and os.path.isfile(flags.boosting_weights):
        boosting_weights_value = np.loadtxt(flags.boosting_weights, dtype = np.float32,
                                            delimiter = ',')
        boosting_weights = tf.placeholder_with_default(boosting_weights_value,
                                                       list(boosting_weights_value.shape),
                                                       name = 'boosting_weights')
        semantic_loss = _boosting_loss(logits, ids, boosting_weights, flags)
    else:
        semantic_loss = _loss(logits, labels, flags)
    tf.add_to_collection('losses', semantic_loss)
    # computing the loss gradient with respect to the mask (i.e. the insight tensor) and
    # penalizing its L1-norm
    # replace 'semantic_loss' with 'tf.reduce_sum(logits)'?
    insight = tf.gradients(semantic_loss, [mask])[0]
    insight_loss = tf.reduce_sum(tf.abs(insight))
    if flags.insight_loss > 0.0:
        with tf.control_dependencies([semantic_loss]):
            tf.add_to_collection('losses', tf.multiply(flags.insight_loss, insight_loss,
                                                       name = 'insight_loss'))
    else:
        tf.summary.scalar('insight_loss_raw', insight_loss)
    # summing all loss factors and computing the moving average of all individual losses and of
    # the sum
    loss = tf.add_n(tf.get_collection('losses'), name = 'total_loss')
    loss_averages_op = tf.train.ExponentialMovingAverage(0.9, name = 'avg')
    losses = tf.get_collection('losses')
    loss_averages = loss_averages_op.apply(losses + [loss])
    # attaching a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses
    for l in losses + [loss]:
        tf.summary.scalar(l.op.name + '_raw', l)
        tf.summary.scalar(l.op.name + '_avg', loss_averages_op.average(l))
    # computing and applying gradients
    with tf.control_dependencies([loss_averages]):
        grads = opt.compute_gradients(loss)
        apply_gradient = opt.apply_gradients(grads, global_step = global_step)
    # adding histograms for trainable variables and gradients
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)
    tf.summary.histogram('insight', insight)
    # tracking the moving averages of all trainable variables
    variable_averages_op = tf.train.ExponentialMovingAverage(flags.moving_average_decay,
                                                             global_step)
    variable_averages = variable_averages_op.apply(tf.trainable_variables())
    # building a Graph that trains the model with one batch of examples and
    # updates the model parameters
    with tf.control_dependencies([apply_gradient, variable_averages]):
        train_op = tf.no_op(name = 'train')
    # creating a saver
    saver = tf.train.Saver(tf.global_variables())
    # building the summary operation based on the TF collection of Summaries
    summary_op = tf.summary.merge_all()
    # creating a session
    current_global_step = -1
    with tf.Session(config = tf.ConfigProto(log_device_placement = False,
                                            inter_op_parallelism_threads = flags.num_cpus,
                                            device_count = {'GPU': flags.num_gpus})) as sess:
        # initializing variables
        if flags.checkpoint_exclude_scopes is not None:
            optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)
        # starting the queue runners
        ..
        # creating a summary writer
        ..
        # training itself
        ..
        # saving the model checkpoint
        checkpoint_path = os.path.join(train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step = current_global_step)
        # stopping the queue runners
        ..
    return current_global_step
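For context, the elided parts of the session block ('..') typically look like the following in a TF1 queue-based input pipeline. This is only a hedged sketch with an assumed step count and logging scheme, not the code omitted above:

# Hypothetical sketch of the elided steps: queue runners, summary writer and training loop.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
summary_writer = tf.summary.FileWriter(train_dir, sess.graph)

for _ in range(flags.max_steps):  # assumption: such a flag exists
    _, loss_value, current_global_step = sess.run([train_op, loss, global_step])
    if current_global_step % log_periods[1] == 0:  # assumption on how log_periods is used
        summary_writer.add_summary(sess.run(summary_op), current_global_step)

coord.request_stop()
coord.join(threads)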
I added a flag named checkpoint_exclude_scopes to the Python script, where I specify the tensors that should not be restored; this is needed because the number of classes in the network's last layer changed. Here is how I invoke the script:
./toolDetectionInceptions.py --batch_size=32 --running_mode=resume --checkpoint_exclude_scopes=InceptionV4/Logits,InceptionV4/AuxLogits
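For reference, this is roughly how such a flag can be declared and turned into scope prefixes with tf.app.flags; only the flag name mirrors my script, the rest is a hedged sketch:

import tensorflow as tf

# Hypothetical flag declaration; only 'checkpoint_exclude_scopes' matches the real script.
tf.app.flags.DEFINE_string(
    'checkpoint_exclude_scopes', None,
    'Comma-separated list of variable scopes to exclude when restoring from a checkpoint.')
FLAGS = tf.app.flags.FLAGS

def get_exclusions(flags):
    # Turn the comma-separated flag value into a list of scope prefixes.
    if flags.checkpoint_exclude_scopes is None:
        return []
    return [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]

# e.g. get_exclusions(FLAGS) -> ['InceptionV4/Logits', 'InceptionV4/AuxLogits']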
My first tests went badly because I ran into too many problems, for example:
tensorflow.python.framework.errors.NotFoundError: Tensor name "InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/weights/read:0" not found in checkpoint files
After some googling I found a workaround on this site: they suggest using the optimistic_restore function shown in the code above, with a few modifications. But now the problem is something else:
W tensorflow/core/framework/op_kernel.cc:993] Failed precondition: Attempting to use uninitialized value Variable
[[Node: Variable/read = Identity[T=DT_INT32, _class=["loc:@Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable)]]
It looks like there is a local variable that has not been initialized, but I cannot find it. Can you help?
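One way to narrow this down is to ask the session which variables it still considers uninitialized right before training starts; a hedged debugging sketch (not part of my script), to be run inside the tf.Session block:

# Hypothetical debugging snippet: list every variable the session reports as uninitialized.
uninitialized = sess.run(tf.report_uninitialized_variables())
print([name.decode('utf-8') for name in uninitialized])
# The DT_INT32 type in the error hints at the unnamed global_step = tf.Variable(0, trainable=False)
# created in train(), which is not in the checkpoint and therefore never restored.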
EDIT:
To debug this issue, I checked the number of variables that should be initialized and restored by adding some logging to the optimistic_restore function. Here is a summary:
# saved_shapes 609
# var_names 608
# name2var 1519
# variables_to_init: 7
# restore_vars: 596
# global_variables: 1519
For your information, CheckpointReader.get_variable_to_shape_map() returns a dict mapping tensor names to lists of ints, representing the shapes of the corresponding tensors in the checkpoint. This means the number of variables in this checkpoint is 609, while the total number of variables needed for the restore is 1519.
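A quick way to see this for yourself is to dump the checkpoint's variable map and compare it with the graph; a minimal sketch, where the checkpoint filename is an assumption:

# Hypothetical inspection snippet: compare checkpoint tensors with graph variables.
reader = tf.train.NewCheckpointReader('inception_v4.ckpt')   # path is an assumption
shape_map = reader.get_variable_to_shape_map()                # {tensor_name: shape as list of ints}
print('tensors in checkpoint: %d' % len(shape_map))
print('variables in graph:    %d' % len(tf.global_variables()))
# Names present in the graph but absent from the checkpoint, typically the optimizer's slot
# variables, the ExponentialMovingAverage shadow copies and the global step.
missing = sorted(v.name for v in tf.global_variables()
                 if v.name.split(':')[0] not in shape_map)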
There seems to be a huge gap between the tensors stored in the pretrained checkpoint and the variables used by the network architecture (which is literally their own network). Was the checkpoint compressed in some way? Is my description accurate?
I now know what was missing: it is simply the initialization of the variables that are not restored. Still, I would like to understand why there is such a large difference between their InceptionV4 architecture and the pretrained checkpoint.
The variables that are not restored with the saver need to be initialized. To do that, you can run v.initializer.run() for every variable v that you do not restore.
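Equivalently, they can all be initialized in one go with tf.variables_initializer; a minimal sketch, assuming restore_vars is the list that was passed to the Saver:

# Hypothetical sketch: initialize everything that was not restored from the checkpoint.
restored_names = set(v.name for v in restore_vars)
not_restored = [v for v in tf.global_variables() if v.name not in restored_names]
sess.run(tf.variables_initializer(not_restored))
# Per-variable alternative, as described above:
# for v in not_restored:
#     v.initializer.run(session=sess)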
Here is how I should have defined the optimistic_restore function to make it work as expected:
def optimistic_restore(session, save_file, flags):
    exclusions = []
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    print('saved_shapes %d' % len(saved_shapes))
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    var_names_to_be_initialized = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                                          if var.name.split(':')[0] not in saved_shapes])
    print('var_names %d' % len(var_names))
    print('var_names_to_be_initialized %d' % len(var_names_to_be_initialized))
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()), tf.global_variables()))
    print('name2var %d' % len(name2var))
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        # excluded scopes (the new logits layers) are initialized, not restored
                        variables_to_init.append(curr_var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
            else:
                # shape mismatch with the checkpoint: initialize instead of restoring
                variables_to_init.append(curr_var)
        # variables that exist in the graph but not in the checkpoint also need initialization
        for var_name, saved_var_name in var_names_to_be_initialized:
            curr_var = name2var[saved_var_name]
            variables_to_init.append(curr_var)
    print('variables_to_init: %d ' % len(variables_to_init))
    print('global_variables: %d ' % len(tf.global_variables()))
    print('restore_vars: %d ' % len(restore_vars))
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
    session.run(tf.variables_initializer(variables_to_init))
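With this version nothing changes at the call site inside train():

# Usage mirroring the existing call in train(): every global variable is now either restored
# from the checkpoint or explicitly initialized, so the 'Attempting to use uninitialized value'
# precondition error goes away.
optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)

The variables that exist only in the graph but not in the checkpoint, presumably the optimizer's slot variables, the ExponentialMovingAverage shadow copies, the global step and the new 20-class logits, all end up in variables_to_init, which is also the most likely explanation for the gap between the 609 checkpoint tensors and the 1519 graph variables noted above.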