使用 Tensorflow 中的 GRU 将先前时间步长的梯度传递到当前时间步长
Carrying gradients from previous time steps to current time steps with GRU in Tensorflow
我在tensorflow中有以下模型:
def output_layer(input_layer, num_labels):
    """Apply a fully connected (affine) layer to a 2D tensor.

    :param input_layer: 2D tensor; the size of its last dimension determines
        the number of rows of the weight matrix.
    :param num_labels: int, number of output units (columns of the weight
        matrix and length of the bias vector).
    :return: ``tf.matmul(input_layer, W) + b``
    """
    in_features = input_layer.get_shape().as_list()[-1]
    weights = create_variables(
        name='fc_weights',
        shape=[in_features, num_labels],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0),
    )
    bias = create_variables(
        name='fc_bias',
        shape=[num_labels],
        initializer=tf.zeros_initializer(),
    )
    return tf.matmul(input_layer, weights) + bias
def model(input_features):
    """Build a two-layer stacked GRU followed by a linear output layer.

    Relies on the module-level names ``gru1_cell_size``, ``gru2_cell_size``
    and ``num_labels``, which are defined outside this snippet.

    :param input_features: input tensor handed to ``tf.nn.dynamic_rnn``
        (presumably [batch, time, features] -- confirm against the caller).
    :return: tuple ``(predictions, initial_state, new_state, output)`` where
        ``initial_state`` is a placeholder that must be fed, ``new_state`` is
        the final RNN state and ``output`` is the flattened RNN output.
    """
    with tf.variable_scope("GRU"):
        layers = [
            tf.nn.rnn_cell.GRUCell(gru1_cell_size),
            tf.nn.rnn_cell.GRUCell(gru2_cell_size),
        ]
        # state_is_tuple=False: the two layer states are handled as a single
        # tensor, hence the placeholder width gru1_cell_size + gru2_cell_size.
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=False)
        initial_state = tf.placeholder(
            shape=[None, gru1_cell_size + gru2_cell_size],
            dtype=tf.float32,
            name="initial_state",
        )
        output, new_state = tf.nn.dynamic_rnn(
            stacked_cell,
            input_features,
            dtype=tf.float32,
            initial_state=initial_state,
        )
    with tf.variable_scope("output_reshaped"):
        # Collapse every leading dimension so each row is one RNN output
        # vector of width gru2_cell_size.
        output = tf.reshape(output, shape=[-1, gru2_cell_size])
    with tf.variable_scope("output_layer"):
        flat_predictions = output_layer(output, num_labels)
        # NOTE(review): 100 and 3 are hard-coded here; they look like the
        # number of time steps and num_labels -- confirm before reusing.
        predictions = tf.reshape(flat_predictions, shape=[-1, 100, 3])
    return predictions, initial_state, new_state, output
从代码中可以看出,第一个 GRU 的单元大小是 64,第二个 GRU 的单元大小是 32。batch size 是 34(但这对我来说现在不重要)。输入特征的大小为 200。我尝试通过以下方式计算损失关于可训练变量的梯度:
# Gradients of the loss with respect to every trainable variable, as
# (gradient, variable) pairs.
local_grads_and_vars = optimizer.compute_gradients(
    loss, tf.trainable_variables()
)
# Keep only the gradient tensors so they can later be added to the gradients
# back-propagated from the previous batch.
local_grads = [pair[0] for pair in local_grads_and_vars]
for v in local_grads:
    print("v", v)
打印出梯度后,我得到以下信息:
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/MatMul/Enter_grad/b_acc_3:0", shape=(264, 128), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/BiasAdd/Enter_grad/b_acc_3:0", shape=(128,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/MatMul_1/Enter_grad/b_acc_3:0", shape=(264, 64), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/BiasAdd_1/Enter_grad/b_acc_3:0", shape=(64,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/MatMul/Enter_grad/b_acc_3:0", shape=(96, 64), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/BiasAdd/Enter_grad/b_acc_3:0", shape=(64,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/MatMul_1/Enter_grad/b_acc_3:0", shape=(96, 32), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/BiasAdd_1/Enter_grad/b_acc_3:0", shape=(32,), dtype=float32)
v Tensor("Optimizer/gradients/output_layer/MatMul_grad/tuple/control_dependency_1:0", shape=(32, 3), dtype=float32)
v Tensor("Optimizer/gradients/output_layer/add_grad/tuple/control_dependency_1:0", shape=(3,), dtype=float32)
假设我在第一个 mini-batch 上训练模型后保存了梯度,即把形状为 (34, 100, 200) 的张量作为 model 函数的参数 input_features 输入、并得到形状为 (34 * 100, 3) 的输出之后,如何在第二个 mini-batch 上反向传播这些梯度?
来自 tf.gradients 的文档:
grad_ys is a list of tensors of the same length as ys that holds the initial gradients for each y in ys. When grad_ys is None, we fill in a tensor of '1's of the shape of y for each y in ys. A user can provide their own initial grad_ys to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y).
因此,您的 grad_ys 应该是一个与输入 ys 长度相同的列表。
复制您的代码后,我能够让以下代码运行:
# One placeholder per GRU state (widths 64 and 32) that carries the incoming
# gradient to back-propagate through that state.
prev_grad_pl = [
    tf.placeholder(tf.float32, [batch, width]) for width in (64, 32)
]
# Feed values for those placeholders: all-ones arrays of matching shape.
prev_grad_init = {
    pl: np.ones(pl.get_shape().as_list()) for pl in prev_grad_pl
}
# d(new_state)/d(initial_state), seeded with the fed gradients via grad_ys
# (one entry per tensor in ys).
prev_grads_val__ = tf.gradients(
    ys=[new_state1, new_state2],
    xs=[initial_state1, initial_state2],
    grad_ys=prev_grad_pl,
)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        initial_state1: np.zeros([batch, gru1_cell_size]),
        initial_state2: np.zeros([batch, gru2_cell_size]),
    }
    feed.update(prev_grad_init)
    grad1, grad2 = sess.run(prev_grads_val__, feed_dict=feed)
这是使用自定义代码的解决方案:
import tensorflow as tf
import numpy as np
# Number of units passed to each GRUCell.
cell_size = 32
# Total sequence length; it is processed as two consecutive segments of
# time_steps1 and time_steps2 steps.
seq_length = 1000
time_steps1 = 500
time_steps2 = seq_length - time_steps1
# Toy data: the integers 1..seq_length and the same sequence shifted by one.
x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)
# Graph-level seed so repeated runs are reproducible.
tf.set_random_seed(123)
m_dtype = tf.float32
# Per-segment inputs and labels, each shaped [batch, time_steps, 1].
input_1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="input_1")
input_2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="input_2")
labels1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="labels_1")
labels2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="labels_2")
# Labels for the whole sequence: both segments joined along the time axis.
labels = tf.concat([labels1, labels2], axis=1, name="labels")
def model(input_feat1, input_feat2):
    """Run two GRUs back to back and project every time step to one value.

    The first GRU consumes ``input_feat1`` starting from a fed
    ``initial_state`` placeholder; the second GRU consumes ``input_feat2``
    starting from the first GRU's final state.  Both output sequences are
    concatenated along the time axis and passed through one dense layer.

    Relies on the module-level names ``cell_size``, ``m_dtype``,
    ``time_steps1`` and ``time_steps2``.

    :param input_feat1: first segment, fed to the first ``dynamic_rnn``.
    :param input_feat2: second segment, fed to the second ``dynamic_rnn``.
    :return: tuple ``(output, output1, output2, initial_state, new_state1,
        new_state2)``: the projected full sequence, its two per-segment
        slices, the initial-state placeholder and the final state of each GRU.
    """
    with tf.variable_scope("GRU"):
        cell1 = tf.nn.rnn_cell.GRUCell(cell_size)
        cell2 = tf.nn.rnn_cell.GRUCell(cell_size)
        initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")
        with tf.variable_scope("First50"):
            # output1: [batch, time_steps1, cell_size]
            output1, new_state1 = tf.nn.dynamic_rnn(cell1, input_feat1, dtype=m_dtype, initial_state=initial_state)
        with tf.variable_scope("Second50"):
            # output2: [batch, time_steps2, cell_size]; starts from the state
            # the first GRU ended in, which links the two segments.
            output2, new_state2 = tf.nn.dynamic_rnn(cell2, input_feat2, dtype=m_dtype, initial_state=new_state1)
    with tf.variable_scope("output"):
        # [batch, time_steps1 + time_steps2, cell_size]
        output = tf.concat([output1, output2], axis=1)
        output = tf.reshape(output, shape=[-1, cell_size])
        output = tf.layers.dense(output, units=1)
        # Restore [batch, time, 1].  The batch dimension is inferred (-1)
        # instead of hard-coded to 1, so batches larger than one work too.
        output = tf.reshape(output, shape=[-1, time_steps1 + time_steps2, 1])
    with tf.variable_scope("outputs_1_2_reshaped"):
        # Split the projected sequence back into its two segments.
        output1 = tf.slice(input_=output, begin=[0, 0, 0], size=[-1, time_steps1, -1])
        output2 = tf.slice(input_=output, begin=[0, time_steps1, 0], size=[-1, time_steps2, -1])
    print(output.get_shape().as_list(), "1")
    print(output1.get_shape().as_list(), "2")
    print(output2.get_shape().as_list(), "3")
    return output, output1, output2, initial_state, new_state1, new_state2
def loss(output, output1, output2, labels, labels1, labels2):
    """Return the summed absolute error for the full sequence and each segment.

    :param output: predictions for the whole sequence.
    :param output1: predictions for the first segment.
    :param output2: predictions for the second segment.
    :param labels: targets matching ``output``.
    :param labels1: targets matching ``output1``.
    :param labels2: targets matching ``output2``.
    :return: tuple ``(total, first, second)`` of scalar tensors, each
        ``reduce_sum(|prediction - target|)``.
    """
    # |x| is computed with tf.abs rather than sqrt(square(x)): the forward
    # value is the same, but back-propagating through sqrt at exactly 0
    # yields 0 * inf = NaN, whereas tf.abs has a finite gradient there.
    total = tf.reduce_sum(tf.abs(output - labels))
    first = tf.reduce_sum(tf.abs(output1 - labels1))
    second = tf.reduce_sum(tf.abs(output2 - labels2))
    return total, first, second
def optimize(loss, loss1, loss2, initial_state, new_state1, new_state2):
    """Build the gradient w.r.t. ``initial_state`` in two different ways.

    The first result chains the gradient by hand: the gradient of ``loss2``
    w.r.t. ``new_state1`` is pushed back to ``initial_state`` through
    ``grad_ys`` and added to the gradient of ``loss1`` w.r.t.
    ``initial_state``.  The second result differentiates ``loss`` w.r.t.
    ``initial_state`` directly.

    :param loss: loss over the whole sequence.
    :param loss1: loss over the first segment.
    :param loss2: loss over the second segment.
    :param initial_state: tensor the gradients are taken with respect to.
    :param new_state1: state that links the first segment to the second.
    :param new_state2: accepted but not used by this function.
    :return: tuple ``(grads_wrt_initial_state_1, grads_wrt_initial_state_2)``.
    """
    with tf.name_scope('Optimizer'):
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(pending_updates):
            # Constructed but not used for any update in this function.
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
            d_loss2_d_state1 = tf.gradients(loss2, new_state1)
            d_loss1_d_init = tf.gradients(loss1, initial_state)
            # Carry d(loss2)/d(new_state1) back to initial_state.
            d_loss2_d_init = tf.gradients(
                new_state1, initial_state, grad_ys=d_loss2_d_state1
            )
            grads_wrt_initial_state_1 = tf.add(d_loss1_d_init, d_loss2_d_init)
            grads_wrt_initial_state_2 = tf.gradients(loss, initial_state)
    return grads_wrt_initial_state_1, grads_wrt_initial_state_2
# Build the graph: model, losses, and the two gradient tensors to compare.
output, output1, output2, initial_state, new_state1, new_state2 = model(
    input_1, input_2
)
# NOTE: this rebinds the name `loss` from the function to its first result.
loss, loss1, loss2 = loss(output, output1, output2, labels, labels1, labels2)
grads_wrt_initial_state_1, grads_wrt_initial_state_2 = optimize(
    loss, loss1, loss2, initial_state, new_state1, new_state2
)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # A single sequence (batch of one), split at time_steps1 into the two
    # segments; labels are the inputs shifted by one step.
    in1 = x_t[:time_steps1].reshape(1, time_steps1, 1)
    in2 = x_t[time_steps1:].reshape(1, time_steps2, 1)
    l1 = x_t_plus_1[:time_steps1].reshape(1, time_steps1, 1)
    l2 = x_t_plus_1[time_steps1:].reshape(1, time_steps2, 1)
    i_s = np.zeros([1, cell_size])
    t1, t2 = sess.run(
        [grads_wrt_initial_state_1, grads_wrt_initial_state_2],
        feed_dict={
            input_1: in1,
            input_2: in2,
            labels1: l1,
            labels2: l2,
            initial_state: i_s,
        },
    )
    # Print mean and sum of both gradients side by side for comparison.
    print(np.mean(t1), np.mean(t2))
    print(np.sum(t1), np.sum(t2))
这是两个 GRU 依次串联的示例;我按照 optimize() 中的代码,用两种不同的方式做了反向传播。
我在tensorflow中有以下模型:
def output_layer(input_layer, num_labels):
    """Apply a fully connected (affine) layer to a 2D tensor.

    :param input_layer: 2D tensor; the size of its last dimension determines
        the number of rows of the weight matrix.
    :param num_labels: int, number of output units (columns of the weight
        matrix and length of the bias vector).
    :return: ``tf.matmul(input_layer, W) + b``
    """
    in_features = input_layer.get_shape().as_list()[-1]
    weights = create_variables(
        name='fc_weights',
        shape=[in_features, num_labels],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0),
    )
    bias = create_variables(
        name='fc_bias',
        shape=[num_labels],
        initializer=tf.zeros_initializer(),
    )
    return tf.matmul(input_layer, weights) + bias
def model(input_features):
    """Build a two-layer stacked GRU followed by a linear output layer.

    Relies on the module-level names ``gru1_cell_size``, ``gru2_cell_size``
    and ``num_labels``, which are defined outside this snippet.

    :param input_features: input tensor handed to ``tf.nn.dynamic_rnn``
        (presumably [batch, time, features] -- confirm against the caller).
    :return: tuple ``(predictions, initial_state, new_state, output)`` where
        ``initial_state`` is a placeholder that must be fed, ``new_state`` is
        the final RNN state and ``output`` is the flattened RNN output.
    """
    with tf.variable_scope("GRU"):
        layers = [
            tf.nn.rnn_cell.GRUCell(gru1_cell_size),
            tf.nn.rnn_cell.GRUCell(gru2_cell_size),
        ]
        # state_is_tuple=False: the two layer states are handled as a single
        # tensor, hence the placeholder width gru1_cell_size + gru2_cell_size.
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=False)
        initial_state = tf.placeholder(
            shape=[None, gru1_cell_size + gru2_cell_size],
            dtype=tf.float32,
            name="initial_state",
        )
        output, new_state = tf.nn.dynamic_rnn(
            stacked_cell,
            input_features,
            dtype=tf.float32,
            initial_state=initial_state,
        )
    with tf.variable_scope("output_reshaped"):
        # Collapse every leading dimension so each row is one RNN output
        # vector of width gru2_cell_size.
        output = tf.reshape(output, shape=[-1, gru2_cell_size])
    with tf.variable_scope("output_layer"):
        flat_predictions = output_layer(output, num_labels)
        # NOTE(review): 100 and 3 are hard-coded here; they look like the
        # number of time steps and num_labels -- confirm before reusing.
        predictions = tf.reshape(flat_predictions, shape=[-1, 100, 3])
    return predictions, initial_state, new_state, output
从代码中可以看出,第一个 GRU 的单元大小是 64,第二个 GRU 的单元大小是 32。batch size 是 34(但这对我来说现在不重要)。输入特征的大小为 200。我尝试通过以下方式计算损失关于可训练变量的梯度:
# Gradients of the loss with respect to every trainable variable, as
# (gradient, variable) pairs.
local_grads_and_vars = optimizer.compute_gradients(
    loss, tf.trainable_variables()
)
# Keep only the gradient tensors so they can later be added to the gradients
# back-propagated from the previous batch.
local_grads = [pair[0] for pair in local_grads_and_vars]
for v in local_grads:
    print("v", v)
打印出梯度后,我得到以下信息:
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/MatMul/Enter_grad/b_acc_3:0", shape=(264, 128), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/BiasAdd/Enter_grad/b_acc_3:0", shape=(128,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/MatMul_1/Enter_grad/b_acc_3:0", shape=(264, 64), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/BiasAdd_1/Enter_grad/b_acc_3:0", shape=(64,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/MatMul/Enter_grad/b_acc_3:0", shape=(96, 64), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/BiasAdd/Enter_grad/b_acc_3:0", shape=(64,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/MatMul_1/Enter_grad/b_acc_3:0", shape=(96, 32), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/BiasAdd_1/Enter_grad/b_acc_3:0", shape=(32,), dtype=float32)
v Tensor("Optimizer/gradients/output_layer/MatMul_grad/tuple/control_dependency_1:0", shape=(32, 3), dtype=float32)
v Tensor("Optimizer/gradients/output_layer/add_grad/tuple/control_dependency_1:0", shape=(3,), dtype=float32)
假设我在第一个 mini-batch 上训练模型后保存了梯度,即把形状为 (34, 100, 200) 的张量作为 model 函数的参数 input_features 输入、并得到形状为 (34 * 100, 3) 的输出之后,如何在第二个 mini-batch 上反向传播这些梯度?
来自 tf.gradients 的文档:
grad_ys is a list of tensors of the same length as ys that holds the initial gradients for each y in ys. When grad_ys is None, we fill in a tensor of '1's of the shape of y for each y in ys. A user can provide their own initial grad_ys to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y).
因此,您的 grad_ys 应该是一个与输入 ys 长度相同的列表。
复制您的代码后,我能够让以下代码运行:
# One placeholder per GRU state (widths 64 and 32) that carries the incoming
# gradient to back-propagate through that state.
prev_grad_pl = [
    tf.placeholder(tf.float32, [batch, width]) for width in (64, 32)
]
# Feed values for those placeholders: all-ones arrays of matching shape.
prev_grad_init = {
    pl: np.ones(pl.get_shape().as_list()) for pl in prev_grad_pl
}
# d(new_state)/d(initial_state), seeded with the fed gradients via grad_ys
# (one entry per tensor in ys).
prev_grads_val__ = tf.gradients(
    ys=[new_state1, new_state2],
    xs=[initial_state1, initial_state2],
    grad_ys=prev_grad_pl,
)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        initial_state1: np.zeros([batch, gru1_cell_size]),
        initial_state2: np.zeros([batch, gru2_cell_size]),
    }
    feed.update(prev_grad_init)
    grad1, grad2 = sess.run(prev_grads_val__, feed_dict=feed)
这是使用自定义代码的解决方案:
import tensorflow as tf
import numpy as np
# Number of units passed to each GRUCell.
cell_size = 32
# Total sequence length; it is processed as two consecutive segments of
# time_steps1 and time_steps2 steps.
seq_length = 1000
time_steps1 = 500
time_steps2 = seq_length - time_steps1
# Toy data: the integers 1..seq_length and the same sequence shifted by one.
x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)
# Graph-level seed so repeated runs are reproducible.
tf.set_random_seed(123)
m_dtype = tf.float32
# Per-segment inputs and labels, each shaped [batch, time_steps, 1].
input_1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="input_1")
input_2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="input_2")
labels1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="labels_1")
labels2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="labels_2")
# Labels for the whole sequence: both segments joined along the time axis.
labels = tf.concat([labels1, labels2], axis=1, name="labels")
def model(input_feat1, input_feat2):
    """Run two GRUs back to back and project every time step to one value.

    The first GRU consumes ``input_feat1`` starting from a fed
    ``initial_state`` placeholder; the second GRU consumes ``input_feat2``
    starting from the first GRU's final state.  Both output sequences are
    concatenated along the time axis and passed through one dense layer.

    Relies on the module-level names ``cell_size``, ``m_dtype``,
    ``time_steps1`` and ``time_steps2``.

    :param input_feat1: first segment, fed to the first ``dynamic_rnn``.
    :param input_feat2: second segment, fed to the second ``dynamic_rnn``.
    :return: tuple ``(output, output1, output2, initial_state, new_state1,
        new_state2)``: the projected full sequence, its two per-segment
        slices, the initial-state placeholder and the final state of each GRU.
    """
    with tf.variable_scope("GRU"):
        cell1 = tf.nn.rnn_cell.GRUCell(cell_size)
        cell2 = tf.nn.rnn_cell.GRUCell(cell_size)
        initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")
        with tf.variable_scope("First50"):
            # output1: [batch, time_steps1, cell_size]
            output1, new_state1 = tf.nn.dynamic_rnn(cell1, input_feat1, dtype=m_dtype, initial_state=initial_state)
        with tf.variable_scope("Second50"):
            # output2: [batch, time_steps2, cell_size]; starts from the state
            # the first GRU ended in, which links the two segments.
            output2, new_state2 = tf.nn.dynamic_rnn(cell2, input_feat2, dtype=m_dtype, initial_state=new_state1)
    with tf.variable_scope("output"):
        # [batch, time_steps1 + time_steps2, cell_size]
        output = tf.concat([output1, output2], axis=1)
        output = tf.reshape(output, shape=[-1, cell_size])
        output = tf.layers.dense(output, units=1)
        # Restore [batch, time, 1].  The batch dimension is inferred (-1)
        # instead of hard-coded to 1, so batches larger than one work too.
        output = tf.reshape(output, shape=[-1, time_steps1 + time_steps2, 1])
    with tf.variable_scope("outputs_1_2_reshaped"):
        # Split the projected sequence back into its two segments.
        output1 = tf.slice(input_=output, begin=[0, 0, 0], size=[-1, time_steps1, -1])
        output2 = tf.slice(input_=output, begin=[0, time_steps1, 0], size=[-1, time_steps2, -1])
    print(output.get_shape().as_list(), "1")
    print(output1.get_shape().as_list(), "2")
    print(output2.get_shape().as_list(), "3")
    return output, output1, output2, initial_state, new_state1, new_state2
def loss(output, output1, output2, labels, labels1, labels2):
    """Return the summed absolute error for the full sequence and each segment.

    :param output: predictions for the whole sequence.
    :param output1: predictions for the first segment.
    :param output2: predictions for the second segment.
    :param labels: targets matching ``output``.
    :param labels1: targets matching ``output1``.
    :param labels2: targets matching ``output2``.
    :return: tuple ``(total, first, second)`` of scalar tensors, each
        ``reduce_sum(|prediction - target|)``.
    """
    # |x| is computed with tf.abs rather than sqrt(square(x)): the forward
    # value is the same, but back-propagating through sqrt at exactly 0
    # yields 0 * inf = NaN, whereas tf.abs has a finite gradient there.
    total = tf.reduce_sum(tf.abs(output - labels))
    first = tf.reduce_sum(tf.abs(output1 - labels1))
    second = tf.reduce_sum(tf.abs(output2 - labels2))
    return total, first, second
def optimize(loss, loss1, loss2, initial_state, new_state1, new_state2):
    """Build the gradient w.r.t. ``initial_state`` in two different ways.

    The first result chains the gradient by hand: the gradient of ``loss2``
    w.r.t. ``new_state1`` is pushed back to ``initial_state`` through
    ``grad_ys`` and added to the gradient of ``loss1`` w.r.t.
    ``initial_state``.  The second result differentiates ``loss`` w.r.t.
    ``initial_state`` directly.

    :param loss: loss over the whole sequence.
    :param loss1: loss over the first segment.
    :param loss2: loss over the second segment.
    :param initial_state: tensor the gradients are taken with respect to.
    :param new_state1: state that links the first segment to the second.
    :param new_state2: accepted but not used by this function.
    :return: tuple ``(grads_wrt_initial_state_1, grads_wrt_initial_state_2)``.
    """
    with tf.name_scope('Optimizer'):
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(pending_updates):
            # Constructed but not used for any update in this function.
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
            d_loss2_d_state1 = tf.gradients(loss2, new_state1)
            d_loss1_d_init = tf.gradients(loss1, initial_state)
            # Carry d(loss2)/d(new_state1) back to initial_state.
            d_loss2_d_init = tf.gradients(
                new_state1, initial_state, grad_ys=d_loss2_d_state1
            )
            grads_wrt_initial_state_1 = tf.add(d_loss1_d_init, d_loss2_d_init)
            grads_wrt_initial_state_2 = tf.gradients(loss, initial_state)
    return grads_wrt_initial_state_1, grads_wrt_initial_state_2
# Build the graph: model, losses, and the two gradient tensors to compare.
output, output1, output2, initial_state, new_state1, new_state2 = model(
    input_1, input_2
)
# NOTE: this rebinds the name `loss` from the function to its first result.
loss, loss1, loss2 = loss(output, output1, output2, labels, labels1, labels2)
grads_wrt_initial_state_1, grads_wrt_initial_state_2 = optimize(
    loss, loss1, loss2, initial_state, new_state1, new_state2
)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # A single sequence (batch of one), split at time_steps1 into the two
    # segments; labels are the inputs shifted by one step.
    in1 = x_t[:time_steps1].reshape(1, time_steps1, 1)
    in2 = x_t[time_steps1:].reshape(1, time_steps2, 1)
    l1 = x_t_plus_1[:time_steps1].reshape(1, time_steps1, 1)
    l2 = x_t_plus_1[time_steps1:].reshape(1, time_steps2, 1)
    i_s = np.zeros([1, cell_size])
    t1, t2 = sess.run(
        [grads_wrt_initial_state_1, grads_wrt_initial_state_2],
        feed_dict={
            input_1: in1,
            input_2: in2,
            labels1: l1,
            labels2: l2,
            initial_state: i_s,
        },
    )
    # Print mean and sum of both gradients side by side for comparison.
    print(np.mean(t1), np.mean(t2))
    print(np.sum(t1), np.sum(t2))
这是2个GRU一个接一个的例子,我按照optimize()