Deep Q-learning - TensorFlow - Weights won't change
I am trying to write a DQL algorithm, and I am trying to run the following graph in TensorFlow:
class DQN:
    def __init__(self, env, n_hidden, learning_rate):
        self.image_input = tf.placeholder(shape=[None, 128, 128, 3], dtype=tf.float32)
        self.conv1 = tf.contrib.layers.convolution2d(inputs=self.image_input, num_outputs=32,
                                                     kernel_size=[8, 8], stride=[4, 4], padding="VALID")
        self.conv2 = tf.contrib.layers.convolution2d(inputs=self.conv1, num_outputs=64,
                                                     kernel_size=[4, 4], stride=[2, 2], padding="VALID")
        self.conv3 = tf.contrib.layers.convolution2d(inputs=self.conv2, num_outputs=64,
                                                     kernel_size=[3, 3], stride=[1, 1], padding="VALID")
        self.conv4 = tf.contrib.layers.convolution2d(inputs=self.conv3, num_outputs=512,
                                                     kernel_size=[7, 7], stride=[1, 1], padding="VALID")
        self.conv_out = tf.contrib.layers.flatten(self.conv4)

        self.weights_1 = tf.Variable(tf.random_normal([18432, env.action_space.n], stddev=0.35), name="fully1_w")
        self.bias_1 = tf.Variable(tf.zeros(env.action_space.n), name="fully1_b")
        self.q_out = tf.add(tf.matmul(self.conv_out, self.weights_1), self.bias_1, name="q_out")
        self.predict = tf.argmax(self.q_out, 1)

        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, env.action_space.n, dtype=tf.float32)
        self.q_value = tf.reduce_sum(tf.multiply(self.q_out, self.actions_onehot), reduction_indices=1)
        self.td_error = tf.square(self.target_q - self.q_value)
        self.loss = tf.reduce_mean(self.td_error)

        self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        self.grads_and_vars = self.trainer.compute_gradients(self.loss)
        self.trainer.apply_gradients(self.grads_and_vars)
Here is the training process:
tf.reset_default_graph()
main_qf = DQN(env, n_hidden=10, learning_rate=1.0)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
target_ops = update_target_graph(trainables, tau, mode="periodically")
grads = []
experience_buffer = ExperienceReplay(exp_size)
total_rewards = np.zeros(num_episodes)
losses = np.zeros(num_episodes)

with tf.Session() as session:
    state = env.reset()
    session.run(init)
    update_target(target_ops, session)

    for _iter in range(num_episodes):
        state = env.reset()
        # play ===================================================================================
        done = False
        img = process_image(env.render(mode="rgb_array"))
        episode = []
        while not done:
            # e-greedy
            if np.random.rand() < epsilon:
                action = np.random.choice(range(env.action_space.n))
            else:
                feed_dict = {main_qf.image_input: img[None, :, :, :]}
                action = session.run(main_qf.predict, feed_dict=feed_dict)[0]
            new_state, reward, done, _ = env.step(action)
            new_img = process_image(env.render(mode="rgb_array"))
            experience_buffer.add((img, action, new_img, reward, done))

            # update results =========================================================================
            total_rewards[_iter] += reward

            # Adjust params (epsilon) ===============================================================
            if epsilon >= min_epsilon:
                epsilon -= decay

            # train ==================================================================================
            prev_state, actions, new_state, rewards, is_terminal = experience_buffer.sample(batch_size)
            q_function = session.run([main_qf.q_out], feed_dict={
                main_qf.image_input: prev_state})
            q_target = session.run([main_qf.predict], feed_dict={
                main_qf.image_input: new_state})
            q_target = rewards + gamma * q_target * is_terminal

            loss, weights, grad = session.run([main_qf.loss, main_qf.weights_1, main_qf.grads_and_vars], feed_dict={
                main_qf.image_input: prev_state,
                main_qf.target_q: q_target,
                main_qf.actions: actions
            })
            losses[_iter] = loss
        update_target(target_ops, session)
But for some reason I don't understand, the training process is not updating the network's weights. I tried fetching the gradients to check whether I had vanishing gradients (by fetching grads_and_vars), but that is not the case; the gradients have large values. I also tried assigning values to the variable manually (by calling main_qf.weights_1.assign(val)), but that didn't work either.
Is it how my graph is composed? Or is it how I am running the session? I am completely lost on this.
As it stands, your graph never asks to minimize the loss or to apply the gradients.
The graph element that would update the weights is the "self.trainer.apply_gradients(self.grads_and_vars)" op, and I don't see you calling this op anywhere from session.run().
Try assigning it to an attribute and adding it to your run(); that should update the weights.
self.UpdateWeights = self.trainer.apply_gradients(self.grads_and_vars)
If you just add "self.trainer" to run(), you won't update the gradients unless you add minimize(self.loss), in which case you don't need the compute/apply gradients lines at all.
self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(self.loss)
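For reference, a minimal sketch of how the end of the constructor and the training-step call could be wired together under the first suggestion. Only the placement of the fetch inside the existing loop is assumed; the attribute name UpdateWeights comes from the snippet above.

# Inside DQN.__init__: keep a handle on the op that actually applies the gradients.
self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
self.grads_and_vars = self.trainer.compute_gradients(self.loss)
self.UpdateWeights = self.trainer.apply_gradients(self.grads_and_vars)

# Inside the training loop: fetch the update op alongside the tensors you want to inspect,
# so every call to session.run() also performs one gradient-descent step.
loss, weights, _ = session.run(
    [main_qf.loss, main_qf.weights_1, main_qf.UpdateWeights],
    feed_dict={
        main_qf.image_input: prev_state,
        main_qf.target_q: q_target,
        main_qf.actions: actions,
    })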