无法让 GradientTape 给出非空结果
Cannot get GradientTape to give non null results
我正在尝试使用 tensorflow2 手动实现一个非常简单的 RNN。我在 tensorflow 网站上的 the example to manually make models 上为我的代码建模。为此目的,代码被剥离到最基本的部分,是
class ModelSimple(object):
def __init__(self):
# Initialize the weights to `5.0` and the bias to `0.0`
# In practice, these should be initialized to random values (for example, with `tf.random.normal`)
self.W = tf.Variable(tf.random.normal([]))
self.b = tf.Variable(tf.random.normal([]))
def __call__(self, x):
return self.W * x + self.b
def loss(predicted_y, target_y):
return tf.reduce_mean(tf.square(predicted_y - target_y))
NUM_EXAMPLES = 1000
inputs = tf.random.normal(shape=[NUM_EXAMPLES])
outputs = tf.zeros(NUM_EXAMPLES)
model = ModelSimple()
with tf.GradientTape() as t:
t.watch([model.W,model.b])
current_loss = loss(model(inputs), outputs)
dW, db = t.gradient(current_loss, [model.W, model.b])
print(dW,db)
这为 dW 和 db 提供了很好的张量。然后我尝试做我上面描述的
class ModelRNN(object):
def __init__(self, n_inputs, n_neurons):
self.n_inputs = n_inputs
self.n_neurons = n_neurons
# weights for new input
self.Wx = tf.Variable(tf.random.normal(shape=[self.n_inputs, self.n_neurons], dtype=tf.float32))
# weights for previous output
self.Wy = tf.Variable(tf.random.normal(shape=[self.n_neurons, self.n_neurons], dtype=tf.float32))
# bias weights
self.b = tf.Variable(tf.zeros([1, self.n_neurons], dtype=tf.float32))
def __call__(self, X_batch):
# get shape of input
batch_size, num_time_steps, _ = X_batch.get_shape()
# we will loop through the time steps and the output of the previous computation feeds into
# the next one.
# this variable keeps track of it and is initialized to zero
y_last = tf.Variable(tf.zeros([batch_size, self.n_neurons], dtype=tf.float32))
# the outputs will be stored in this tensor
Ys = tf.Variable(tf.zeros([batch_size, num_time_steps, self.n_neurons], dtype=tf.float32))
for t in range(num_time_steps):
Xt = X_batch[:, t, :]
yt = tf.tanh(tf.matmul(y_last, self.Wy) +
tf.matmul(Xt, self.Wx) +
self.b)
y_last.assign(yt)
Ys[:, t, :].assign(yt)
return Ys
inputs = tf.convert_to_tensor(np.array([
# t = 0 t = 1
[[0, 1, 2], [9, 8, 7]], # instance 1
[[3, 4, 5], [0, 0, 0]], # instance 2
[[6, 7, 8], [6, 5, 4]], # instance 3
[[9, 0, 1], [3, 2, 1]], # instance 4
],dtype=np.float32))
outputs=tf.Variable(tf.zeros((4,2,5),dtype=np.float32))
model = ModelRNN(3, 5)
with tf.GradientTape() as t:
t.watch([model.Wx,model.Wy,model.b])
current_loss = loss(model(inputs), outputs)
dWx,dWy,db = t.gradient(current_loss, [model.Wx, model.Wy,model.b])
print(dWx,dWy,db)
结果dWx,dWy,db 都是None。我已经尝试了几件事(包括使用 GradientTape 观察它们,尽管它们是变量)但我一直得到 None。我做错了什么?
看起来这与这个问题有关:
用 python 列表和 tf.stack
替换 assign
会导致返回梯度
Ys = []
for t in range(num_time_steps):
Xt = X_batch[:, t, :]
yt = tf.tanh(tf.matmul(y_last, self.Wy) +
tf.matmul(Xt, self.Wx) +
self.b)
y_last.assign(yt)
Ys.append(yt)
return tf.stack(Ys,axis=1)
我正在尝试使用 tensorflow2 手动实现一个非常简单的 RNN。我在 tensorflow 网站上的 the example to manually make models 上为我的代码建模。为此目的,代码被剥离到最基本的部分,是
class ModelSimple(object):
def __init__(self):
# Initialize the weights to `5.0` and the bias to `0.0`
# In practice, these should be initialized to random values (for example, with `tf.random.normal`)
self.W = tf.Variable(tf.random.normal([]))
self.b = tf.Variable(tf.random.normal([]))
def __call__(self, x):
return self.W * x + self.b
def loss(predicted_y, target_y):
return tf.reduce_mean(tf.square(predicted_y - target_y))
NUM_EXAMPLES = 1000
inputs = tf.random.normal(shape=[NUM_EXAMPLES])
outputs = tf.zeros(NUM_EXAMPLES)
model = ModelSimple()
with tf.GradientTape() as t:
t.watch([model.W,model.b])
current_loss = loss(model(inputs), outputs)
dW, db = t.gradient(current_loss, [model.W, model.b])
print(dW,db)
这为 dW 和 db 提供了很好的张量。然后我尝试做我上面描述的
class ModelRNN(object):
def __init__(self, n_inputs, n_neurons):
self.n_inputs = n_inputs
self.n_neurons = n_neurons
# weights for new input
self.Wx = tf.Variable(tf.random.normal(shape=[self.n_inputs, self.n_neurons], dtype=tf.float32))
# weights for previous output
self.Wy = tf.Variable(tf.random.normal(shape=[self.n_neurons, self.n_neurons], dtype=tf.float32))
# bias weights
self.b = tf.Variable(tf.zeros([1, self.n_neurons], dtype=tf.float32))
def __call__(self, X_batch):
# get shape of input
batch_size, num_time_steps, _ = X_batch.get_shape()
# we will loop through the time steps and the output of the previous computation feeds into
# the next one.
# this variable keeps track of it and is initialized to zero
y_last = tf.Variable(tf.zeros([batch_size, self.n_neurons], dtype=tf.float32))
# the outputs will be stored in this tensor
Ys = tf.Variable(tf.zeros([batch_size, num_time_steps, self.n_neurons], dtype=tf.float32))
for t in range(num_time_steps):
Xt = X_batch[:, t, :]
yt = tf.tanh(tf.matmul(y_last, self.Wy) +
tf.matmul(Xt, self.Wx) +
self.b)
y_last.assign(yt)
Ys[:, t, :].assign(yt)
return Ys
inputs = tf.convert_to_tensor(np.array([
# t = 0 t = 1
[[0, 1, 2], [9, 8, 7]], # instance 1
[[3, 4, 5], [0, 0, 0]], # instance 2
[[6, 7, 8], [6, 5, 4]], # instance 3
[[9, 0, 1], [3, 2, 1]], # instance 4
],dtype=np.float32))
outputs=tf.Variable(tf.zeros((4,2,5),dtype=np.float32))
model = ModelRNN(3, 5)
with tf.GradientTape() as t:
t.watch([model.Wx,model.Wy,model.b])
current_loss = loss(model(inputs), outputs)
dWx,dWy,db = t.gradient(current_loss, [model.Wx, model.Wy,model.b])
print(dWx,dWy,db)
结果dWx,dWy,db 都是None。我已经尝试了几件事(包括使用 GradientTape 观察它们,尽管它们是变量)但我一直得到 None。我做错了什么?
看起来这与这个问题有关:
用 python 列表和 tf.stack
替换 assign
会导致返回梯度
Ys = []
for t in range(num_time_steps):
Xt = X_batch[:, t, :]
yt = tf.tanh(tf.matmul(y_last, self.Wy) +
tf.matmul(Xt, self.Wx) +
self.b)
y_last.assign(yt)
Ys.append(yt)
return tf.stack(Ys,axis=1)