BatchNormalization layer in Keras gives unexpected output values
Given the input values [1, 5] and normalizing them, I would expect, if I understand correctly, a result like [-1, 1], because
mean = 3
var = 4
result = (x - mean) / sqrt(var)
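A quick numpy check of that expectation (just the formula above, not what Keras actually runs):
import numpy as np
x = np.array([1.0, 5.0])
mean = x.mean()  # 3.0
var = x.var()    # 4.0
print((x - mean) / np.sqrt(var))  # [-1.  1.]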
But this minimal example
import numpy as np
import keras
from keras.models import Model
from keras.layers import Input
from keras.layers.normalization import BatchNormalization
from keras import backend as K
shape = (1,2,1)
input = Input(shape=shape)
x = BatchNormalization(center=False)(input) # no beta
model = Model(inputs=input, outputs=x)
model.compile(loss='mse', optimizer='sgd')
# training with dummy data
training_in = [np.random.random(size=(10, *shape))]
training_out = [np.random.random(size=(10, *shape))]
model.fit(training_in, training_out, epochs=10)
data_in = np.array([[[[1], [5]]]], dtype=np.float32)
data_out = model.predict(data_in)
print('gamma :', K.eval(model.layers[1].gamma))
#print('beta :', K.eval(model.layers[1].beta))
print('moving_mean:', K.eval(model.layers[1].moving_mean))
print('moving_variance:', K.eval(model.layers[1].moving_variance))
print('epsilon :', model.layers[1].epsilon)
print('data_in :', data_in)
print('data_out:', data_out)
produces the following output:
gamma : [ 0.80644524]
moving_mean: [ 0.05885344]
moving_variance: [ 0.91000736]
epsilon : 0.001
data_in : [[[[ 1.]
[ 5.]]]]
data_out: [[[[ 0.79519051]
[ 4.17485714]]]]
So the result is [0.79519051, 4.17485714] rather than [-1, 1].
I had a look at the source, and the values seem to be forwarded to tf.nn.batch_normalization. This looks like it should produce the result I expected, but evidently it does not.
So how are the output values computed?
If you are using gamma, the right equation for batch normalization is actually result = gamma * (x - mean) / sqrt(var), BUT mean and var are not always the same:

During training (fit), they are mean_batch and var_batch, computed from the input values of the current batch (they are simply the mean and variance of your batch), just as you computed them by hand. In parallel, a global moving_mean and moving_variance are learned like this:
moving_mean = alpha * moving_mean + (1 - alpha) * mean_batch
where alpha is a kind of learning rate in (0, 1), usually above 0.9. moving_mean and moving_variance are approximations of the real mean and variance of all your training data. Gamma is also learned, by the usual gradient descent, to best fit your output.
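As a minimal sketch of that update rule (the names here are illustrative, not the Keras internals):
import numpy as np
def update_moving_stats(moving_mean, moving_var, batch, alpha):
    # one training step: nudge the running statistics toward the batch statistics
    return (alpha * moving_mean + (1 - alpha) * batch.mean(),
            alpha * moving_var + (1 - alpha) * batch.var())
moving_mean, moving_var = 0.0, 1.0  # Keras initializes them to mean 0, variance 1
for _ in range(1000):
    batch = np.random.random(size=(10,))
    moving_mean, moving_var = update_moving_stats(moving_mean, moving_var, batch, alpha=0.9)
print(moving_mean, moving_var)  # drifts toward roughly 0.5 and 0.08 for uniform(0, 1) data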
At inference (predict) time, you simply use the learned values of moving_mean and moving_variance instead of mean_batch and var_batch. You also use the learned gamma.
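You can observe both behaviours on the model above through the backend learning phase flag (a sketch; the exact numbers depend on your run):
get_output = K.function([model.input, K.learning_phase()], [model.output])
print(get_output([data_in, 1])[0])  # phase 1 (training): batch stats, close to gamma * [-1, 1]
print(get_output([data_in, 0])[0])  # phase 0 (inference): moving stats, same as data_out above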
So 0.05885344 is just an approximation of the mean of your random input data, and 0.91000736 of its variance, and these learned values are used to normalize your new data [1, 5]. You can easily check that [0.79519051, 4.17485714] = gamma * ([1, 5] - moving_mean) / sqrt(moving_var).
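For instance, plugging in the values printed above (with epsilon, see the formula below):
import numpy as np
gamma, moving_mean, moving_variance, epsilon = 0.80644524, 0.05885344, 0.91000736, 0.001
print(gamma * (np.array([1.0, 5.0]) - moving_mean) / np.sqrt(moving_variance + epsilon))
# [ 0.79519051  4.17485714]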
EDIT: alpha is called momentum in Keras, if you want to check it.
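It can be set directly on the layer (0.99 is the Keras default):
x = BatchNormalization(momentum=0.9, center=False)(input)  # lower momentum makes the moving stats track recent batches faster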
The correct formula is this one:
result = gamma * (input - moving_mean) / sqrt(moving_variance + epsilon) + beta
And here is a script to verify it:
import math
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers.normalization import BatchNormalization
np.random.seed(0)
print('=== keras model ===')
input_shape = (1,2,1)
input = Input(shape=input_shape)
x = BatchNormalization()(input)
model = Model(inputs=input, outputs=x)
model.compile(loss='mse', optimizer='sgd')
training_in = [np.random.random(size=(10, *input_shape))]
training_out = [np.random.random(size=(10, *input_shape))]
model.fit(training_in, training_out, epochs=100, verbose=0)
data_in = [[[1.0], [5.0]]]
data_model = np.array([data_in])
result = model.predict(data_model)
gamma = K.eval(model.layers[1].gamma)
beta = K.eval(model.layers[1].beta)
moving_mean = K.eval(model.layers[1].moving_mean)
moving_variance = K.eval(model.layers[1].moving_variance)
epsilon = model.layers[1].epsilon
print('gamma: ', gamma)
print('beta: ', beta)
print('moving_mean: ', moving_mean)
print('moving_variance:', moving_variance)
print('epsilon: ', epsilon)
print('data_in: ', data_in)
print('result: ', result)
print('=== numpy ===')
np_data = [data_in[0][0][0], data_in[0][1][0]]
np_mean = moving_mean[0]
np_variance = moving_variance[0]
np_offset = beta[0]
np_scale = gamma[0]
np_result = [np_scale * (x - np_mean) / math.sqrt(np_variance + epsilon) + np_offset for x in np_data]
print(np_result)
print('=== tensorflow ===')
tf_data = tf.constant(data_in)
tf_mean = tf.constant(moving_mean)
tf_variance = tf.constant(moving_variance)
tf_offset = tf.constant(beta)
tf_scale = tf.constant(gamma)
tf_variance_epsilon = epsilon
tf_result = tf.nn.batch_normalization(tf_data, tf_mean, tf_variance, tf_offset, tf_scale, tf_variance_epsilon)
tf_sess = tf.Session()
print(tf_sess.run(tf_result))
print('=== keras backend ===')
k_data = K.constant(data_in)
k_mean = K.constant(moving_mean)
k_variance = K.constant(moving_variance)
k_offset = K.constant(beta)
k_scale = K.constant(gamma)
k_variance_epsilon = epsilon
k_result = K.batch_normalization(k_data, k_mean, k_variance, k_offset, k_scale, k_variance_epsilon)
print(K.eval(k_result))
Output:
gamma: [ 0.22297101]
beta: [ 0.49253803]
moving_mean: [ 0.36868709]
moving_variance: [ 0.41429576]
epsilon: 0.001
data_in: [[[1.0], [5.0]]]
result: [[[[ 0.71096909]
[ 2.09494853]]]]
=== numpy ===
[0.71096905498374263, 2.0949484904433255]
=== tensorflow ===
[[[ 0.71096909]
[ 2.09494853]]]
=== keras backend ===
[[[ 0.71096909]
[ 2.09494853]]]