Unexpected behavior in model validation with tf.slim and inception_v1
I am trying to train a model on the CIFAR-10 dataset using the inception_v1 module written in tf.slim, provided here.
The code that trains and evaluates the model on the dataset is below.
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tqdm import tqdm
# Assumed import path for the slim models repo checkout; it may differ
from nets.inception_v1 import inception_v1, inception_v1_arg_scope

# BATCH_SIZE, EPOCHS, train_x/train_y, val_x/val_y and preprocess
# are defined earlier in the script
# test_data = (data['images_test'], data['labels_test'])
train_data = (train_x, train_y)
val_data = (val_x, val_y)

# Create two datasets, one for training and one for validation
train_dataset = tf.data.Dataset.from_tensor_slices(train_data).shuffle(buffer_size=10000).batch(BATCH_SIZE).map(preprocess)
# train_dataset = train_dataset.shuffle(buffer_size=10000).batch(BATCH_SIZE).map(preprocess)
val_dataset = tf.data.Dataset.from_tensor_slices(val_data).batch(BATCH_SIZE).map(preprocess)
# test_dataset = tf.data.Dataset.from_tensor_slices(test_data).batch(BATCH_SIZE).map(preprocess)

# Create a reinitializable iterator of the correct shape and type
_iter = tf.data.Iterator.from_structure(
    train_dataset.output_types,
    train_dataset.output_shapes
)
features, labels = _iter.get_next()

# Create the initialization operations
train_init_op = _iter.make_initializer(train_dataset)
val_init_op = _iter.make_initializer(val_dataset)
# test_init_op = _iter.make_initializer(test_dataset)

# Placeholders which are fed in the session
training_mode = tf.placeholder(shape=None, dtype=tf.bool)
dropout_prob = tf.placeholder_with_default(1.0, shape=())
reuse_bool = tf.placeholder_with_default(True, shape=())

# Init the saver object which handles saves and restores of
# model weights
# saver = tf.train.Saver()

# Build the model inside the arg_scope to define the batch
# normalization layers and the appropriate parameters
with slim.arg_scope(inception_v1_arg_scope(use_batch_norm=True)) as scope:
    logits, end_points = inception_v1(features,
                                      reuse=None,
                                      dropout_keep_prob=dropout_prob,
                                      is_training=training_mode)

# Create the cross-entropy loss function
cross_entropy = tf.reduce_mean(
    tf.losses.softmax_cross_entropy(tf.one_hot(labels, 10), logits))
train_op = tf.train.AdamOptimizer(1e-2).minimize(loss=cross_entropy)
# train_op = slim.learning.create_train_op(cross_entropy, optimizer, global_step=)

# Define the accuracy metric
preds = tf.argmax(logits, axis=-1, output_type=tf.int64)
acc = tf.reduce_mean(tf.cast(tf.equal(preds, labels), tf.float32))

# Count the iterations for each set
n_train_batches = train_y.shape[0] // BATCH_SIZE
n_val_batches = val_y.shape[0] // BATCH_SIZE

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # saver = tf.train.Saver([v for v in tf.all_variables()][:-1])
    # for v in tf.all_variables():
    #     print(v.name)
    # saver.restore(sess, tf.train.latest_checkpoint('./', latest_filename='inception_v1.ckpt'))
    for i in range(EPOCHS):
        total_loss = 0
        total_acc = 0
        # Initialize the training iterator
        sess.run(train_init_op)
        with tqdm(total=n_train_batches * BATCH_SIZE) as pbar:
            for batch in range(n_train_batches):
                _, loss, train_acc = sess.run(
                    [train_op, cross_entropy, acc],
                    feed_dict={training_mode: True, dropout_prob: 0.2})
                total_loss += loss
                total_acc += train_acc
                pbar.update(BATCH_SIZE)
        print("Epoch: {} || Loss: {:.5f} || Acc: {:.5f} %".format(
            i + 1, total_loss / n_train_batches,
            (total_acc / n_train_batches) * 100))

        # Switch to validation
        total_val_loss = 0
        total_val_acc = 0
        sess.run(val_init_op)
        for batch in range(n_val_batches):
            val_loss, val_acc = sess.run(
                [cross_entropy, acc], feed_dict={training_mode: False})
            total_val_loss += val_loss
            total_val_acc += val_acc
        print("Epoch: {} || Validation Loss: {:.5f} || Val Acc: {:.5f} %".format(
            i + 1, total_val_loss / n_val_batches,
            (total_val_acc / n_val_batches) * 100))
Paradoxically, when I train the model and evaluate it on the validation set, I get the following results:
Epoch: 1 || Loss: 2.29436 || Acc: 23.61750 %
Epoch: 1 || Validation Loss: 1158854431554614016.00000 || Val Acc: 10.03000 %
100%|█████████████████████████████████████████████████████| 40000/40000 [03:52<00:00, 173.21it/s]
Epoch: 2 || Loss: 1.68389 || Acc: 36.49250 %
Epoch: 2 || Validation Loss: 27997399226326712.00000 || Val Acc: 10.03000 %
100%|████████████████████████████████████████████████████▋| 39800/40000 [03:51<00:01, 174.11it/s]
I set training_mode to True during training and to False during validation. Apart from the train_op, which is only run in the training phase, nothing else differs between the two phases, yet the model behaves as if it were never trained when evaluated on the validation set. My guess is that the is_training flag does not handle this case well, and that the batch-normalization variables are not kept properly initialized and updated for validation. Has anyone encountered a similar situation?
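To make the guess concrete: in graph mode, batch norm's moving mean and variance are only updated when the ops collected in tf.GraphKeys.UPDATE_OPS actually run, and a bare optimizer.minimize() does not run them. A minimal sketch of wiring them in by hand, reusing the names from the code above, would be:

# The batch-norm moving-average updates live in tf.GraphKeys.UPDATE_OPS;
# optimizer.minimize() alone never runs them, so the statistics used at
# is_training=False stay at their initial values.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-2).minimize(loss=cross_entropy)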
I found a fix for the problem. It involved two things.
The first was to use a smaller batch-norm decay: since the dataset is much smaller than ImageNet, I had to lower it to 0.99:
batch_norm_decay=0.99
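For illustration, assuming the inception_v1_arg_scope from the slim models repository (which exposes a batch_norm_decay argument, defaulting to 0.9997), the decay is passed when building the arg scope:

# Lower the batch-norm decay so the moving statistics adapt faster on a
# dataset far smaller than ImageNet (slim's default is 0.9997)
with slim.arg_scope(inception_v1_arg_scope(use_batch_norm=True,
                                           batch_norm_decay=0.99)):
    logits, end_points = inception_v1(features,
                                      reuse=None,
                                      dropout_keep_prob=dropout_prob,
                                      is_training=training_mode)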
The other was to use the following line to build the train op, so that the update ops of the batch-normalization layers (the moving mean and variance) are run as part of each training step:
train_op = slim.learning.create_train_op(cross_entropy, optimizer)
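For completeness, a sketch of how this slots into the code above in place of the bare minimize() call (optimizer is just a name for the Adam optimizer built earlier):

optimizer = tf.train.AdamOptimizer(1e-2)
# Unlike optimizer.minimize(), create_train_op adds the ops collected in
# tf.GraphKeys.UPDATE_OPS (the batch-norm moving-average updates) as a
# dependency of every training step
train_op = slim.learning.create_train_op(cross_entropy, optimizer)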