Same GRU configuration built in two different ways produces two different outputs in TensorFlow
I would like to do some sequence prediction in TensorFlow using a GRU, so I created the same model in the following two different ways.

In model 1 I have 2 GRUs, one after the other; that is, new_state1, the final hidden state of the first GRU, acts as the initial state of the second GRU, so the model outputs new_state1 and new_state2 accordingly. Note that this is not a 2-layer model, but only 1 layer. As the code below shows, I divided the input and the output into two parts, where GRU1 takes the first part and the second GRU takes the second part.

Also, the random_seed is set and fixed for both models so that the results are comparable.
Model 1
import tensorflow as tf
import numpy as np

cell_size = 32
seq_length = 1000
time_steps1 = 500
time_steps2 = seq_length - time_steps1

x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)

tf.set_random_seed(123)
m_dtype = tf.float32

input_1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="input_1")
input_2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="input_2")
labels1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="labels_1")
labels2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="labels_2")
labels = tf.concat([labels1, labels2], axis=1, name="labels")
initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")

def model(input_feat1, input_feat2):
    with tf.variable_scope("GRU"):
        cell1 = tf.nn.rnn_cell.GRUCell(cell_size)
        cell2 = tf.nn.rnn_cell.GRUCell(cell_size)

        with tf.variable_scope("First50"):
            # output1: shape=[1, time_steps1, 32]
            output1, new_state1 = tf.nn.dynamic_rnn(cell1, input_feat1, dtype=m_dtype, initial_state=initial_state)

        with tf.variable_scope("Second50"):
            # output2: shape=[1, time_steps2, 32]
            output2, new_state2 = tf.nn.dynamic_rnn(cell2, input_feat2, dtype=m_dtype, initial_state=new_state1)

    with tf.variable_scope("output"):
        # output shape: [1, time_steps1 + time_steps2, 32] => [1, 1000, 32]
        output = tf.concat([output1, output2], axis=1)
        output = tf.reshape(output, shape=[-1, cell_size])
        output = tf.layers.dense(output, units=1)
        output = tf.reshape(output, shape=[1, time_steps1 + time_steps2, 1])

    with tf.variable_scope("outputs_1_2_reshaped"):
        output1 = tf.slice(input_=output, begin=[0, 0, 0], size=[-1, time_steps1, -1])
        output2 = tf.slice(input_=output, begin=[0, time_steps1, 0], size=[-1, time_steps2, 1])

        print(output.get_shape().as_list(), "1")
        print(output1.get_shape().as_list(), "2")
        print(output2.get_shape().as_list(), "3")

    return output, output1, output2, initial_state, new_state1, new_state2

output, output1, output2, initial_state, new_state1, new_state2 = model(input_1, input_2)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    to_run_list = [new_state1, new_state2]

    in1 = np.reshape(x_t[:time_steps1], newshape=(1, time_steps1, 1))
    in2 = np.reshape(x_t[time_steps1:], newshape=(1, time_steps2, 1))
    l1 = np.reshape(x_t_plus_1[:time_steps1], newshape=(1, time_steps1, 1))
    l2 = np.reshape(x_t_plus_1[time_steps1:], newshape=(1, time_steps2, 1))
    i_s = np.zeros([1, cell_size])

    new_s1, new_s2 = sess.run(to_run_list, feed_dict={input_1: in1,
                                                      input_2: in2,
                                                      labels1: l1,
                                                      labels2: l2,
                                                      initial_state: i_s})

    print(np.shape(new_s1), np.shape(new_s2))
    print(np.mean(new_s1), np.mean(new_s2))
    print(np.sum(new_s1), np.sum(new_s2))
In this model, instead of having 2 different GRUs, I created only one, and I divided the input and the labels into 2 different parts as well, using a for loop to iterate over my input dataset. The final state is then taken and fed back into the same model as the initial state.

Note that the first initial state of both model1 and model2 is zeros.
Model 2
import tensorflow as tf
import numpy as np

cell_size = 32
seq_length = 1000
time_steps = 500

x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)

tf.set_random_seed(123)
m_dtype = tf.float32

inputs = tf.placeholder(dtype=m_dtype, shape=[None, time_steps, 1], name="inputs")
labels = tf.placeholder(dtype=m_dtype, shape=[None, time_steps, 1], name="labels")
initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")
grads_initial_state = tf.placeholder(dtype=m_dtype, shape=[None, cell_size], name="prev_grads")
this_is_last_batch = tf.placeholder(dtype=tf.bool, name="this_is_last_batch")

def model(input_feat):
    with tf.variable_scope("GRU"):
        cell = tf.nn.rnn_cell.GRUCell(cell_size)

        with tf.variable_scope("cell"):
            # output: shape=[1, time_steps, 32]
            output, new_state = tf.nn.dynamic_rnn(cell, input_feat, dtype=m_dtype, initial_state=initial_state)

    with tf.variable_scope("output"):
        output = tf.reshape(output, shape=[-1, cell_size])
        output = tf.layers.dense(output, units=1)
        output = tf.reshape(output, shape=[1, time_steps, 1])

        print(output.get_shape().as_list(), "1")

    return output, new_state

output, new_state = model(inputs)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    # 1000 // 500 = 2
    num_iterations = seq_length // time_steps
    print("num_iterations:", num_iterations)

    final_states = []

    for i in range(num_iterations):
        current_xt = x_t[i * time_steps: (i + 1) * time_steps]
        current_xt_plus_1 = x_t_plus_1[i * time_steps: (i + 1) * time_steps]

        in1 = np.reshape(current_xt, newshape=(1, time_steps, 1))
        l1 = np.reshape(current_xt_plus_1, newshape=(1, time_steps, 1))
        i_s = np.zeros([1, cell_size])

        if i == 0:
            new_s = sess.run(new_state, feed_dict={inputs: in1,
                                                   labels: l1,
                                                   initial_state: i_s})
            final_states.append(new_s)
            print("---->", np.mean(final_states[-1]), np.sum(final_states[-1]), i)
        else:
            new_s = sess.run(new_state, feed_dict={inputs: in1,
                                                   labels: l1,
                                                   initial_state: final_states[-1]})
            final_states.append(new_s)
            print("---->", np.mean(final_states[-1]), np.sum(final_states[-1]), i)
Finally, after printing out the stats of new_state1 and new_state2 in model1, they were different from the stats of new_state in model2 after each iteration.

I would like to know how to fix this problem and why it is happening.
EDIT:

I have found that the values of the weights of the gru in the two files are different.
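One way to check this (a minimal sketch; it assumes it runs inside the with tf.Session() as sess: block of each script, right after sess.run(init)) is to print the freshly initialized variables:

# Sketch: print every trainable variable right after initialization,
# so the weights of the two scripts can be compared side by side.
for v in tf.trainable_variables():
    val = sess.run(v)
    print(v.name, val.shape, np.mean(val), np.sum(val))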
Now, how can I reproduce the same results in 2 different files after setting the random seed?

Any help is much appreciated!!!
So, to reproduce the same results across different files, tf.set_random_seed() is not enough. I figured out that we also need to set the seed for the initializers of the gru cells, as well as for the initializers of the weights in the dense layer at the output (this is at least according to my model); so the definition of the cell is now:
cell1 = tf.nn.rnn_cell.GRUCell(cell_size, kernel_initializer=tf.glorot_normal_initializer(seed=123, dtype=m_dtype))
And for the dense layer:
output = tf.layers.dense(output, units=1, kernel_initializer=tf.glorot_uniform_initializer(seed=123, dtype=m_dtype))
Note that any other initializer can be used, as long as we set its seed and dtype.
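Putting both changes together, a minimal sketch of a fully seeded model definition could look like the following (the helper functions are just for illustration, not the exact code of my scripts):

import tensorflow as tf

cell_size = 32
m_dtype = tf.float32

tf.set_random_seed(123)

def make_seeded_gru_cell():
    # Seeding the kernel initializer makes the GRU weights identical
    # across separate scripts/processes.
    return tf.nn.rnn_cell.GRUCell(
        cell_size,
        kernel_initializer=tf.glorot_normal_initializer(seed=123, dtype=m_dtype))

def seeded_output_layer(x):
    # Same idea for the dense output layer: seed its kernel initializer too.
    # (Its bias initializer defaults to zeros, which is already deterministic.)
    return tf.layers.dense(
        x, units=1,
        kernel_initializer=tf.glorot_uniform_initializer(seed=123, dtype=m_dtype))

With these seeded initializers in place, both scripts start from the same weights, and the printed state statistics match.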