如何在 tensorflow 中使用 BatchNormalization？

Question

我在使用带有 tensorflow 的批量归一化时遇到问题。我建立了以下模型：

def weight_variable(kernal_shape):
    """Return the trainable float32 variable named 'weights' for the current variable scope.

    Args:
        kernal_shape: shape handed to `tf.get_variable`.

    Returns:
        The variable obtained from `tf.get_variable`, initialised from a
        truncated normal distribution with stddev 0.02.
    """
    init = tf.truncated_normal_initializer(stddev=0.02)
    return tf.get_variable(
        name='weights',
        shape=kernal_shape,
        dtype=tf.float32,
        trainable=True,
        initializer=init,
    )
def bias_variable(shape):
    """Return a new `tf.Variable` whose initial value is a 0.0 constant of the given shape."""
    return tf.Variable(tf.constant(0.0, shape=shape))

def conv_layer(x, w_shape, b_shape, is_training, padding='SAME'):
    """Build one convolution -> batch-norm -> ReLU layer and return its activations.

    Args:
        x: input tensor passed to `tf.nn.conv2d`.
        w_shape: shape of the convolution kernel variable.
        b_shape: shape of the bias variable.
        is_training: forwarded to `tf.contrib.layers.batch_norm`.
        padding: padding mode forwarded to `tf.nn.conv2d`.

    Returns:
        The ReLU activations of the batch-normalised convolution output.

    Histogram summaries are recorded for the weights, biases and activations.
    """
    kernel = weight_variable(w_shape)
    tf.summary.histogram("weights", kernel)

    bias = bias_variable(b_shape)
    tf.summary.histogram("biases", bias)

    # A stride of 2 is used on purpose so that no max-pool layer is needed.
    pre_norm = tf.nn.conv2d(x, kernel, strides=[1, 2, 2, 1], padding=padding) + bias
    normalized = tf.contrib.layers.batch_norm(pre_norm, scale=True, is_training=is_training)

    activations = tf.nn.relu(normalized)
    tf.summary.histogram("activations", activations)
    return activations

def deconv_layer(x, w_shape, b_shape, is_training, padding="SAME", activation='relu'):
    """Build one transposed-convolution -> batch-norm -> activation layer.

    Args:
        x: input tensor; its dynamic shape is read with `tf.shape`.
        w_shape: shape of the kernel variable; `w_shape[2]` is used as the
            last dimension of the requested output shape.
        b_shape: shape of the bias variable.
        is_training: forwarded to `tf.contrib.layers.batch_norm`.
        padding: padding mode forwarded to `tf.nn.conv2d_transpose`.
        activation: 'relu' selects `tf.nn.relu`; any other value selects
            `tf.nn.sigmoid`.

    Returns:
        The activated, batch-normalised transposed-convolution output.

    Histogram summaries are recorded for the weights, biases and activations.
    """
    kernel = weight_variable(w_shape)
    tf.summary.histogram("weights", kernel)

    bias = bias_variable(b_shape)
    tf.summary.histogram('biases', bias)

    in_shape = tf.shape(x)
    # Requested output shape: [batch_size, h * 2, w * 2, w_shape[2]].
    batch = in_shape[0]
    doubled_h = in_shape[1] * 2
    doubled_w = in_shape[2] * 2
    out_shape = tf.stack([batch, doubled_h, doubled_w, w_shape[2]])

    # Stride of 2 mirrors the stride of 2 used in conv_layer.
    upsampled = tf.nn.conv2d_transpose(x, kernel, out_shape, [1, 2, 2, 1], padding=padding) + bias
    normalized = tf.contrib.layers.batch_norm(upsampled, scale=True, is_training=is_training)

    nonlinearity = tf.nn.relu if activation == 'relu' else tf.nn.sigmoid
    transposed_activations = nonlinearity(normalized)

    tf.summary.histogram("transpose_activation", transposed_activations)
    return transposed_activations

def model(input):
    """Build the conv encoder -> latent sampling -> GRU -> linear output graph.

    Relies on names defined outside this function: `phase_train`, `batch_size`,
    `num_participants`, `latent_dim`, `time_steps` and `cell_size`.

    Args:
        input: image tensor fed to the first convolution layer (the first
            kernel shape [4, 4, 3, 32] expects 3 input channels).

    Returns:
        A tuple `(predictions, var_list)`:
            predictions: output of the final linear layer, one value per row of
                the reshaped GRU output.
            var_list: flat list of the global variables whose name contains
                'GRU', followed by `w_output` and `b_output`.
    """
    with tf.variable_scope('conv1'):
        conv1 = conv_layer(input, [4, 4, 3, 32], [32], is_training=phase_train)  # image size: [56, 56]
    with tf.variable_scope('conv2'):
        conv2 = conv_layer(conv1, [4, 4, 32, 64], [64], is_training=phase_train)  # image size: [28, 28]
    with tf.variable_scope('conv3'):
        conv3 = conv_layer(conv2, [4, 4, 64, 128], [128], is_training=phase_train)  # image size: [14, 14]
    with tf.variable_scope('conv4'):
        conv4 = conv_layer(conv3, [4, 4, 128, 256], [256], is_training=phase_train)  # image size: [7, 7]
        # Flatten each 7x7x256 feature map into one row for the dense projections below.
        conv4_reshaped = tf.reshape(conv4, [batch_size * num_participants, 7 * 7 * 256], name='conv4_reshaped')

    # Dense projections producing the two parameters of the latent distribution.
    w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu')
    b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu')
    w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig')
    b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig')
    # A single noise row of shape [1, latent_dim]; it is broadcast across all rows.
    epsilon = tf.random_normal([1, latent_dim])

    tf.summary.histogram('weights_c_mu', w_c_mu)
    tf.summary.histogram('biases_c_mu', b_c_mu)
    tf.summary.histogram('weights_c_sig', w_c_sig)
    tf.summary.histogram('biases_c_sig', b_c_sig)

    with tf.variable_scope('mu'):
        mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu)
        tf.summary.histogram('mu', mu)

    with tf.variable_scope('stddev'):
        stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig)
        tf.summary.histogram('stddev', stddev)

    with tf.variable_scope('z'):
        # This formula was adopted from the following paper: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7979344
        latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon)
        tf.summary.histogram('features_sig', stddev)

    with tf.variable_scope('GRU'):
        print(latent_var.get_shape().as_list())
        # Regroup the rows into sequences of `time_steps` steps for the RNN.
        latent_var = tf.reshape(latent_var, shape=[int(batch_size / 100)* num_participants, time_steps, latent_dim])

        cell = tf.nn.rnn_cell.GRUCell(cell_size)   # state_size of cell_size.
        H, C = tf.nn.dynamic_rnn(cell, latent_var, dtype=tf.float32)  # H size: [batch_size * num_participants, SEQLEN, cell_size]
        # Flatten the per-step outputs back to one row per input row.
        H = tf.reshape(H, [batch_size * num_participants, cell_size])

    with tf.variable_scope('output'):
        # output layer.
        # NOTE(review): name='w_output' is passed to tf.truncated_normal, not to
        # tf.Variable, so the variable itself gets a default name. Left as is,
        # because renaming the variable would break restoring existing checkpoints.
        w_output = tf.Variable(tf.truncated_normal([cell_size, 1], mean=0, stddev=0.01, dtype=tf.float32, name='w_output'))
        tf.summary.histogram('w_output', w_output)
        b_output = tf.get_variable('b_output', shape=[1], dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))
        predictions = tf.add(tf.matmul(H, w_output), b_output, name='softmax_output')
        tf.summary.histogram('output', predictions)

        var_list = [v for v in tf.global_variables() if 'GRU' in v.name]
        # extend (not append) keeps var_list a flat list of variables; appending
        # the list would insert a nested list as a single element.
        var_list.extend([w_output, b_output])

    return predictions, var_list

另外，我正在恢复模型参数如下：

# Restore the latest checkpoint (if any) and evaluate RMSE over the dataset with
# batch norm in inference mode (phase_train=False), logging summaries as we go.
saver_torestore = tf.train.Saver()

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(events_path, sess.graph)
    merged = tf.summary.merge_all()

    to_run_list = [merged, RMSE]

    # Run `init_op` (defined outside this snippet; presumably the variable
    # initializer) before restoring, so restored values overwrite the initial ones.
    sess.run(init_op)

    # Note that the last name "Graph_model" is the name of the saved checkpoints file => the ckpt is saved
    # under tensorboard_logs.
    ckpt = tf.train.get_checkpoint_state(
        os.path.dirname(model_path))
    if ckpt and ckpt.model_checkpoint_path:
        saver_torestore.restore(sess, ckpt.model_checkpoint_path)
        print('checkpoints are saved!!!')
    else:
        print('No stored checkpoints')

    # NOTE(review): `losses` is expected to be a list defined before this snippet.
    counter = 0
    for epoch in range(num_epoch):
        # Rewind the dataset iterator at the start of every epoch.
        sess.run(iterator.initializer)
        print('epoch:', epoch)

        # Loop until the iterator signals the end of the epoch.
        while True:
            try:
                summary, loss_ = sess.run(to_run_list, feed_dict={phase_train: False})
            except tf.errors.OutOfRangeError:
                # Raised when the dataset is exhausted; this is the normal end of an epoch.
                print('error, ignore ;) ')
                break

            print('loss: ' + str(loss_))

            losses.append(loss_)
            counter += 1

            train_writer.add_summary(summary, counter)

    # These two statements were indented by 5 spaces, which matches no enclosing
    # block and raises IndentationError; they belong inside the session block.
    print('average losses:', np.average(losses))
    train_writer.close()

我确保保存了变量。所以我运行下面的命令：

def assign_values_to_batchNorm():
    """Load batch-norm variable values from text files and assign them.

    For every global variable whose name contains "BatchNorm" (and not "Adam"),
    the values are read from a file named after the variable: the last two
    characters of the name (the ":0" suffix) are dropped, "/" is replaced by
    "_" and ".txt" is appended. The file holds float values, each followed by
    a ';'.

    `tf.assign` only adds an op to the graph; the variable changes only when
    that op is run. The ops are therefore executed in the default session when
    one is active (e.g. when called inside `with tf.Session() as sess:`), and
    are also returned so the caller can run them explicitly otherwise.

    Returns:
        The list of assign ops, one per matching variable.
    """
    bn_vars = [v for v in tf.global_variables()
               if "BatchNorm" in v.name and "Adam" not in v.name]

    assign_ops = []
    for var in bn_vars:
        file_name = var.name[:-2].replace("/", "_") + ".txt"
        # Context manager so the file handle is closed after reading.
        with open(file_name) as value_file:
            # The text after the final ';' is discarded.
            lst = value_file.read().split(";")[:-1]
        print(lst)
        values = [np.float32(item) for item in lst]
        assign_ops.append(tf.assign(var, values))

    sess = tf.get_default_session()
    if sess is not None and assign_ops:
        sess.run(assign_ops)
    return assign_ops

请注意，我使用此方法是为了手动恢复移动均值和移动方差的值。但是我得到了相同的结果。

然后我在会话下调用了 assign_values_to_batchNorm()。我得到了一些值 => 似乎移动平均值、移动方差、伽玛和贝塔都被保存了。

请注意，我在 Windows 10 上进行开发，使用的 tensorflow 版本是 1.3。

所以，在初始化/恢复所有变量之后，每当我在会话中运行 summary, loss_ = sess.run(to_run_list, feed_dict={phase_train: True})，得到的 RMSE 为 0.022，这与模型训练结束时达到的误差相同。现在，如果我将 phase_train 设置为 False，得到的 RMSE 为 0.038。请注意，我目前只是在测试网络。因此，尽管我使用训练数据集进行测试，但我的目的只是观察网络在训练模式和测试模式下的行为。所以这个结果对我来说很奇怪。请注意，phase_train 是一个占位符，我在代码中是这样定义的：

# Boolean placeholder fed at run time to switch between training and inference mode.
phase_train = tf.placeholder(tf.bool, name='phase')

此外，这里是优化器的代码片段：

with tf.name_scope('optimizer'):
    # Ops registered in the UPDATE_OPS collection must run before each training
    # step, so the minimize op is created under a control dependency on them.
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        adam = tf.train.AdamOptimizer(0.00001)
        optimizer = adam.minimize(RMSE)

主要问题：当 phase_train = False 时 RMSE = 0.038，当 phase_train = True 时 RMSE = 0.022。

非常感谢任何帮助！！

Answer 1

重述问题

看起来你的担心是这样的：除了 phase_train 的值外，你保持一切相同（训练数据、批次数、训练数据的轮数、初始化等）。当 phase_train=True 时，训练的 RMSE 为 0.022，当 phase_train=False 时，训练的 RMSE 为 0.038，您认为无论 phase_train 的值如何，RMSE 都应该是相同（0.022 或 0.038）。如果这不是你的意思，请告诉我。

问题的答案

这里的答案是 RMSE 应该在 phase_train=True 和 phase_train=False 时不同。让我们看看为什么会这样。

你这样设置图表：

conv1 = conv_layer(input, [4, 4, 3, 32], [32], is_training=phase_train)

然后，如果我们查看您在 conv_layer(...) 函数中的代码，您可以像这样使用 is_training 变量：

conv = tf.contrib.layers.batch_norm(conv, scale=True, is_training=is_training)

现在，让我们看一下 tf.contrib.layers.batch_norm (https://www.tensorflow.org/api_docs/python/tf/contrib/layers/batch_norm) 的文档：

is_training: Whether or not the layer is in training mode. In training mode it would accumulate the statistics of the moments into moving_mean and moving_variance using an exponential moving average with the given decay. When it is not in training mode then it would use the values of the moving_mean and the moving_variance.

从文档中可以看出，is_training=True 导致与 is_training=False 不同的功能。具体来说，当 is_training=True 时，使用衰减计算归一化常数，而当 is_training=False 时，则没有衰减。当您切换 is_training 的值时，您的代码会执行不同的操作，因此您的 RMSE 误差会有所不同。

如果您运行遇到更多此类问题，查看 Tensorflow 文档可能有助于解释意外结果。而且，不建议在某处将 is_training 标志设置为 False 来训练模型。

为什么在 is_training=True 时使用衰减？

您可能想知道为什么 Tensorflow 在 is_training=True 时会添加衰减。答案是，当您训练时，您的神经网络中的权重会更新变得越来越好。这意味着您早期更新的均值和方差非常不准确，而您后来更新的均值和方差非常准确。由于较早的更新不准确，您希望在神经网络权重应如何更新方面减少他们的发言权，因此您在每次后续更新时衰减 0.999。

例如，当 is_training=True 时，第一次更新得到的均值和方差的重要性，只有第 10,000 次更新得到的均值和方差的 0.999^10000 ≈ 0.000045 倍。这是有道理的，因为在您的第一次更新中，您的权重基本上是随机的，绝对不会产生与第 10000 次更新时一样有意义的均值和方差。

当 is_training=False 时，这意味着您告诉 Tensorflow 您已经在神经网络中学习了适当的权重。你是在告诉 Tensorflow 你已经训练了所有东西，权重是有意义的，你在 batch norm 中得到的均值和方差也是有意义的。所以，没有必要再对任何东西进行衰减。

此解释与您的 RMSE 误差一致。如果您使用 is_training=False 进行训练，您会因为对初始化神经网络所用的随机权重赋予更多重要性而变得效率低下，因此您的最终模型不会那么好。正如您观察到的，is_training=False 运行的 RMSE 误差高于 is_training=True 运行的 RMSE 误差。

Answer 2

所以我认为使用batch normalization层可能有问题。所以我创建了一个简单的模型并在 MNIST 数据集上对其进行了训练。所以我们有两种情况，第一种情况，使用批量规范训练模型，第二种情况，不使用批量规范进行训练。

现在，如果我们比较使用和不使用 batch norm 的测试结果，会发现使用 BN 时我们获得了更高的准确性或更低的损失。请记住，包含 BN 的模型在测试阶段将 phase 设置为 False。因此，我们可以得出结论，具有 BN 的模型优于没有 BN 的模型。

其次，考虑使用批量归一化训练的模型。现在，如果我们比较测试集上的损失（一方面将 phase 设置为 True，另一方面将 phase 设置为 False），我们得出结论：将 phase 设置为 True 时损失更低。因为，直觉上，使用当前批次的统计数据比使用训练数据集的统计数据更准确。

总而言之，我的问题出现在使用批量归一化训练模型之后，分别将 phase 设置为 True 和 False 来测试模型的时候。将 phase 设置为 True 而不是 False 时，我们肯定会得到更好（更低）的损失。

如何在 tensorflow 中使用 BatchNormalization？

How to use BatchNormalization with tensorflow?

python

tensorflow

batch-normalization

重述问题

问题的答案

为什么在 is_training=True 时使用衰减？