Tensorflow DQN can't solve OpenAI Cartpole
I've been learning tensorflow and RL for a few months, and for the last few days I've been trying to solve OpenAI CartPole with my own code, but my Deep Q-Network can't seem to solve it. I've checked my code and compared it against other implementations, and I can't see where I'm going wrong. Could anyone look over my implementation and tell me what I've messed up? It would mean a lot, thanks.
My code:
import gym
import numpy as np
import tensorflow as tf
import math
import keras
import random

class cartpole:
    def __init__(self, sess, env):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.num_actions = env.action_space.n
        self.sess = sess
        self.epsilon = 1.0
        self.return_loss = 0.0
        self.memory = []
        self.gamma = .95
        self.q_model()
        init = tf.global_variables_initializer()
        self.sess.run(init)

    def q_model(self):
        self.state_input = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32)
        self.reward_labels = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.hiddenlayer1_weights = tf.Variable(tf.random_normal([self.state_size, 32]))
        self.hiddenlayer1_bias = tf.Variable(tf.random_normal([32]))
        self.hiddenlayer1_output = tf.matmul(self.state_input, self.hiddenlayer1_weights) + self.hiddenlayer1_bias
        self.hiddenlayer1_output = tf.nn.relu(self.hiddenlayer1_output)
        self.hiddenlayer2_weights = tf.Variable(tf.random_normal([32, 16]))
        self.hiddenlayer2_bias = tf.Variable(tf.random_normal([16]))
        self.hiddenlayer2_output = tf.matmul(self.hiddenlayer1_output, self.hiddenlayer2_weights) + self.hiddenlayer2_bias
        self.hiddenlayer2_output = tf.nn.relu(self.hiddenlayer2_output)
        self.q_weights = tf.Variable(tf.random_normal([16, self.num_actions]))
        self.q_bias = tf.Variable(tf.random_normal([self.num_actions]))
        self.q_output = tf.matmul(self.hiddenlayer2_output, self.q_weights) + self.q_bias
        self.q_output = keras.activations.linear(self.q_output)
        self.max_q_value = tf.reshape(tf.reduce_max(self.q_output), (1, 1))
        self.best_action = tf.squeeze(tf.argmax(self.q_output, axis=1))
        self.loss = tf.losses.mean_squared_error(self.max_q_value, self.reward_labels)
        self.train_model = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.loss)

    def predict_action(self, state):
        self.epsilon *= .995 + .01
        if (np.random.random() < self.epsilon):
            action = env.action_space.sample()
        else:
            action = self.sess.run(self.best_action, feed_dict={self.state_input: state})
        return action

    def predict_value(self, state):
        state = np.array(state).reshape((1, 4))
        max_q_value = self.sess.run(self.max_q_value, feed_dict={self.state_input: state})[0][0]
        return max_q_value

    def train_q_model(self, state, reward):
        q_values, _, loss = self.sess.run([self.q_output, self.train_model, self.loss], feed_dict={self.state_input: state, self.reward_labels: reward})
        self.return_loss = loss

    def get_loss(self):
        return self.return_loss

    def experience_replay(self):
        if len(self.memory) < 33:
            return
        del self.memory[0]
        batch = random.sample(self.memory, 32)
        for state, action, reward, new_state, done in self.memory:
            reward = reward if not done else -reward
            new_state = np.array(new_state).reshape((1, 4))
            if not done:
                reward = reward + (self.gamma * self.predict_value(new_state))
            reward = np.array(reward).reshape((1, 1))
            self.train_q_model(state, reward)

env = gym.make("CartPole-v0")
sess = tf.Session()
A2C = cartpole(sess, env)
episodes = 2000
reward_history = []

for i in range(episodes):
    state = env.reset()
    reward_total = 0
    while True:
        state = np.array(state).reshape((1, 4))
        average_best_reward = sum(reward_history[-100:]) / 100.0
        if (average_best_reward) > 195:
            env.render()
        action = A2C.predict_action(state)
        new_state, reward, done, _ = env.step(action)
        reward_total += reward
        A2C.memory.append([state, action, reward, new_state, done])
        A2C.experience_replay()
        state = new_state
        if done:
            if (average_best_reward >= 195):
                print("Finished! Episodes taken: ", i, "average reward: ", average_best_reward)
            print("average reward = ", average_best_reward, "reward total = ", reward_total, "loss = ", A2C.get_loss())
            reward_history.append(reward_total)
            break
Your initial epsilon is set to 1 (self.epsilon = 1.0). However, when you take an action you don't decay it; you increase it:

self.epsilon *= .995 + .01

The right-hand side is evaluated first, so this multiplies epsilon by 1.005 each step: 1.0 × (0.995 + 0.01) = 1.0 × 1.005 = 1.005.
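You can confirm this with a quick standalone check (a hypothetical two-liner, not part of the original code):

eps = 1.0
eps *= .995 + .01   # augmented assignment: eps = eps * (0.995 + 0.01)
print(eps)          # prints 1.005, so epsilon grows on every call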
The exploration factor (epsilon) should decay:

self.epsilon *= .995

As written, the agent never gets to use the network you are training and just keeps taking random actions.
Also, it would be best to update the exploration factor right before return action, since you want the first action to use the initial epsilon as well:
def predict_action(self, state):
    if (np.random.random() < self.epsilon):
        action = env.action_space.sample()
    else:
        action = self.sess.run(self.best_action, feed_dict={self.state_input: state})
    self.epsilon *= .995
    return action
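In practice it also helps to stop the decay at a small floor so some exploration always remains. A minimal sketch, where the 0.01 floor is an assumed hyperparameter that is not in the original code:

# Decay epsilon, but never below an assumed minimum exploration rate.
self.epsilon = max(self.epsilon * .995, 0.01)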
You could also change if len(self.memory) < 33: to if len(self.memory) < 32:, assuming you want 32 to be your batch size.
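Note, too, that experience_replay samples batch with random.sample but then loops over self.memory, so every stored transition is replayed on each call. A minimal sketch of the replay step with both fixes applied (the batch size of 32 comes from your code; the del self.memory[0] line is dropped here, since a bounded buffer such as collections.deque(maxlen=...) is a more usual way to cap memory):

def experience_replay(self):
    if len(self.memory) < 32:       # wait until a full batch is available
        return
    batch = random.sample(self.memory, 32)
    for state, action, reward, new_state, done in batch:   # train on the sample, not all of memory
        reward = reward if not done else -reward
        new_state = np.array(new_state).reshape((1, 4))
        if not done:
            reward = reward + (self.gamma * self.predict_value(new_state))
        reward = np.array(reward).reshape((1, 1))
        self.train_q_model(state, reward)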
Beyond the correct points already made, you import both Tensorflow and Keras, yet the only place you actually use Keras is keras.activations.linear. Everything else is done directly in Tensorflow.

Keras is a framework built on top of Tensorflow that simplifies working with the underlying Tensorflow library. I'd suggest reading up on the differences between the two and trying to implement the code above using only Keras. Since you already know how to do this in TF, you will appreciate the simplicity Keras brings.
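For illustration, a minimal sketch of the same Q-network built with Keras (the layer sizes and learning rate are taken from your code above; this is a sketch, not a drop-in replacement for the session-based graph):

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Same architecture as the hand-built TF graph: 4 -> 32 -> 16 -> num_actions
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(4,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='linear'))            # one Q-value per action
model.compile(loss='mse', optimizer=Adam(lr=0.001))

# Greedy action for a single (1, 4) state array:
# action = np.argmax(model.predict(state)[0])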
One thing that may help is this CartPole agent from the OpenAI leaderboard:
https://gym.openai.com/evaluations/eval_GazXePIETsOvUaxmoILNHw/