Cannot get GradientTape to give non-null results

I am trying to implement a very simple RNN by hand using tensorflow2. I modelled my code on the example to manually make models on the tensorflow website. Stripped down to the essentials for this purpose, the code is:

import numpy as np
import tensorflow as tf


class ModelSimple(object):
    def __init__(self):
        # initialize the weight and bias to random scalar values
        self.W = tf.Variable(tf.random.normal([]))
        self.b = tf.Variable(tf.random.normal([]))

    def __call__(self, x):
        return self.W * x + self.b

def loss(predicted_y, target_y):
    return tf.reduce_mean(tf.square(predicted_y - target_y))


NUM_EXAMPLES = 1000

inputs  = tf.random.normal(shape=[NUM_EXAMPLES])
outputs = tf.zeros(NUM_EXAMPLES)
model = ModelSimple()

with tf.GradientTape() as t:
    t.watch([model.W, model.b])
    current_loss = loss(model(inputs), outputs)
dW, db = t.gradient(current_loss, [model.W, model.b])
print(dW, db)
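For reference, the gradients returned by the tape can be plugged into a plain gradient-descent update; a minimal sketch (the 0.1 learning rate is an arbitrary illustrative choice):

learning_rate = 0.1
model.W.assign_sub(learning_rate * dW)  # W <- W - lr * dL/dW
model.b.assign_sub(learning_rate * db)  # b <- b - lr * dL/db
print(model.W.numpy(), model.b.numpy())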

This gives nice, non-null tensors for dW and db. I then tried to build the RNN I described above:

class ModelRNN(object):
    def __init__(self, n_inputs, n_neurons):
        self.n_inputs = n_inputs
        self.n_neurons = n_neurons

        # weights for new input
        self.Wx = tf.Variable(tf.random.normal(shape=[self.n_inputs, self.n_neurons], dtype=tf.float32))

        # weights for previous output
        self.Wy = tf.Variable(tf.random.normal(shape=[self.n_neurons, self.n_neurons], dtype=tf.float32))

        # bias weights
        self.b = tf.Variable(tf.zeros([1, self.n_neurons], dtype=tf.float32))

    def __call__(self, X_batch):
        # get shape of input
        batch_size, num_time_steps, _ = X_batch.get_shape()

        # we will loop through the time steps and the output of the previous computation feeds into
        # the next one.
        # this variable keeps track of it and is initialized to zero
        y_last = tf.Variable(tf.zeros([batch_size, self.n_neurons], dtype=tf.float32))

        # the outputs will be stored in this tensor
        Ys = tf.Variable(tf.zeros([batch_size, num_time_steps, self.n_neurons], dtype=tf.float32))

        for t in range(num_time_steps):
            Xt = X_batch[:, t, :]
            yt = tf.tanh(tf.matmul(y_last, self.Wy) +
                         tf.matmul(Xt, self.Wx) +
                         self.b)
            y_last.assign(yt)
            Ys[:, t, :].assign(yt)

        return Ys




inputs = tf.convert_to_tensor(np.array([
        # t = 0      t = 1
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [0, 0, 0]], # instance 2
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ], dtype=np.float32))
outputs = tf.Variable(tf.zeros((4, 2, 5), dtype=np.float32))

model = ModelRNN(3, 5)

with tf.GradientTape() as t:
    t.watch([model.Wx, model.Wy, model.b])
    current_loss = loss(model(inputs), outputs)

dWx, dWy, db = t.gradient(current_loss, [model.Wx, model.Wy, model.b])
print(dWx, dWy, db)

The result is that dWx, dWy and db are all None. I have tried several things (including watching them with the GradientTape, even though they are Variables) but I keep getting None. What am I doing wrong?

It looks like this is related to this question:

Replacing the assign on Ys with a python list and tf.stack results in gradients being returned:

    Ys = []
    for t in range(num_time_steps):
        Xt = X_batch[:, t, :]
        yt = tf.tanh(tf.matmul(y_last, self.Wy) +
                     tf.matmul(Xt, self.Wx) +
                     self.b)
        y_last.assign(yt)
        Ys.append(yt)

    return tf.stack(Ys, axis=1)
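Note that y_last.assign(yt) in the loop above still cuts the gradient path through the recurrent connection: assign is a stateful operation the tape cannot differentiate through, so gradients do not flow from one time step back into the previous one. A fully differentiable sketch (same ModelRNN fields as above, untested against the original setup) carries the previous output as a plain tensor instead:

    def __call__(self, X_batch):
        batch_size, num_time_steps, _ = X_batch.get_shape()

        # carry the previous output as a plain tensor so the tape can trace it
        y_last = tf.zeros([batch_size, self.n_neurons], dtype=tf.float32)

        Ys = []
        for t in range(num_time_steps):
            Xt = X_batch[:, t, :]
            y_last = tf.tanh(tf.matmul(y_last, self.Wy) +
                             tf.matmul(Xt, self.Wx) +
                             self.b)
            Ys.append(y_last)

        return tf.stack(Ys, axis=1)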