TensorFlow wrong bias initialization
I am using the same function to initialize my two sets of weights/biases.
Set 1:
W_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.1, 0.1), name='W_omega')
b_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='b_omega')
Set 2:
W = tf.Variable(tf.random_uniform([input_dim, output_dim], -0.1, 0.1), name='W_post_attn')
b = tf.Variable(tf.random_uniform([output_dim], -0.1, 0.1), name='b_post_attn')
But the histograms in TensorBoard show that the second set of biases is not uniformly distributed (a bimodal distribution centered around +/-0.06, see figure below).
Any idea what is causing this?
Adding dummy code that runs in a Jupyter notebook with MNIST data. Note that my original code does binary classification, while MNIST has 10 classes. The number of peaks in the output bias (in the final layer) seems to correlate with the number of output classes (see figure below).
from __future__ import division, print_function, unicode_literals
from functools import partial
import numpy as np
import os
import tensorflow as tf

def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

path = '/your_folder/'

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/mnist/tmp/data/")

reset_graph()

def variable_summaries(var, name):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope(name):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

n_inputs = 28*28  # MNIST
n_hidden1 = 200
n_outputs = 10
learning_rate = 0.01
n_epochs = 50
batch_size = 50

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")

def attention(inputs, attention_size, name):
    hidden_size = int(inputs.get_shape()[1])
    # Trainable parameters
    with tf.name_scope(name):
        with tf.name_scope('Attention_variables'):
            W_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.1, 0.1), name='W_omega')
            b_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='b_omega')
            u_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='u_omega')
            variable_summaries(W_omega, 'W_omega')
            variable_summaries(b_omega, 'b_omega')
        with tf.name_scope('Attention_u_it'):
            v = tf.tanh(tf.tensordot(inputs, W_omega, axes=[[1], [0]]) + b_omega, name='u_it')
        with tf.name_scope('Attention_alpha_it'):
            vu = tf.tensordot(v, u_omega, axes=[[1], [0]], name='u_it_u_w')
            alphas = tf.nn.softmax(vu, name='alphas')
        with tf.name_scope('Attention_output'):
            #output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1, name='attention_output')
            output = inputs * tf.expand_dims(alphas, -1)
    return output

def neuron_layer(X, n_neurons, name, activation=None):
    with tf.name_scope(name):
        n_inputs = int(X.get_shape()[1])
        W = tf.Variable(tf.random_uniform([n_inputs, n_neurons], -0.1, 0.1), name='W')
        b = tf.Variable(tf.random_uniform([n_neurons], -0.1, 0.1), name='b')
        variable_summaries(W, 'W')
        variable_summaries(b, 'b')
        if activation is not None:
            return activation(tf.matmul(X, W) + b)
        else:
            return tf.matmul(X, W) + b

with tf.name_scope("dnn"):
    hidden1 = attention(X, n_hidden1, name="hidden1_attn")
    logits = neuron_layer(hidden1, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = opt.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# TensorBoard saving parameters
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "/tensorboard_files/"
logdir = "{}/run-{}/".format(root_logdir, now)

# Merge all the summaries and write them out to the log directory
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(logdir + '/train', tf.get_default_graph())

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            # NOTE: training_op is never run here, so the weights and biases
            # keep their initial values for the whole run (see the answer below).
            X_batch, y_batch = mnist.train.next_batch(batch_size)
        if epoch % 2 == 0:
            acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            # TensorBoard summary
            summary = sess.run(merged, feed_dict={X: X_batch, y: y_batch})
            train_writer.add_summary(summary, epoch)
            acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
                                               y: mnist.validation.labels})
            print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)
    save_path = saver.save(sess, path + "my_model_final.ckpt")
The key value that determines what the histogram looks like is the size of the random vector, which in your case is attention_size=200 and n_neurons=10 (or output_dim=2 in your first snippet). Naturally, the larger the sample, the closer its histogram looks to uniform. That is why the distribution of b_omega appears much more uniform than that of b.
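For intuition, here is a small standalone sketch (not from the original post; it assumes numpy and matplotlib, which the dummy code above does not use) comparing histograms of U(-0.1, 0.1) draws of size 10 and size 200:

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
fig, axes = plt.subplots(1, 2, figsize=(8, 3))
for ax, n in zip(axes, (10, 200)):
    # Same U(-0.1, 0.1) range as the tf.random_uniform initializers above.
    sample = np.random.uniform(-0.1, 0.1, size=n)
    ax.hist(sample, bins=20)
    ax.set_title('n = {}'.format(n))
plt.show()

With n=10 most bins hold zero or one value, so the histogram is spiky; with n=200 it already looks close to flat.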
Set attention_size=10 and you will see the following:
The actual b values behind this chart are (note the peaks near 0.01):
[ 0.05595738 0.01231904 -0.08605836 0.01057353 -0.03015073 -0.04255719
0.04719915 0.01116617 -0.0672287 -0.00013051]
The actual b_omega values are:
[-0.06326838 -0.09758444 0.06982093 0.01574633 0.0039237 0.07463291
0.02308519 0.04594345 0.07912541 0.00175323]
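These values can be read straight out of the session; a minimal sketch (not part of the original answer, assuming the graph and init from the dummy code above are in scope) that prints every bias variable right after initialization:

with tf.Session() as sess:
    init.run()
    for var in tf.global_variables():
        if '/b' in var.op.name:  # crude filter matching the bias names above
            print(var.op.name, sess.run(var))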
Also note that the distribution is identical at every epoch (i.e., along the depth axis of the chart), because the weights and biases are never updated.
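If training was actually intended, the inner loop would also need to run the optimizer; a minimal sketch of the missing step, using the training_op already defined in the dummy code:

for iteration in range(mnist.train.num_examples // batch_size):
    X_batch, y_batch = mnist.train.next_batch(batch_size)
    sess.run(training_op, feed_dict={X: X_batch, y: y_batch})  # the missing update step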
Bottom line: the initialization is correct, but the histograms can be misleading if you do not pay attention to the shape of the variables.