Policy gradient methods for OpenAI Gym CartPole
I'm a beginner in reinforcement learning and am trying to implement a policy gradient method with TensorFlow to solve the OpenAI Gym CartPole task. However, my code seems to run very slowly: the first episode runs at an acceptable speed, but from the second episode onwards it becomes very slow. Why is this happening, and how can I fix it?
My code:
import tensorflow as tf
import numpy as np
import gym

env = gym.make('CartPole-v0')

class Policy:
    def __init__(self):
        self.input_layer_fake = tf.placeholder(tf.float32, [4,1])
        self.input_layer = tf.reshape(self.input_layer_fake, [1,4])

        self.dense1 = tf.layers.dense(inputs = self.input_layer, units = 4,
                                      activation = tf.nn.relu)
        self.logits = tf.layers.dense(inputs = self.dense1, units = 2,
                                      activation = tf.nn.relu)

    def predict(self, inputObservation):
        sess = tf.InteractiveSession()
        tf.global_variables_initializer().run()
        x = tf.reshape(inputObservation, [4,1]).eval()
        return (sess.run(self.logits, feed_dict = {self.input_layer_fake: x}))

    def train(self, features_array, labels_array):
        for i in range(np.shape(features_array)[0]):
            print("train")
            print(i)
            sess1 = tf.InteractiveSession()
            tf.global_variables_initializer().run()

            self.cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = labels_array[i], logits = self.logits))
            self.train_step = tf.train.GradientDescentOptimizer(0.5).minimize(self.cross_entropy)

            y = tf.reshape(features_array[i], [4,1]).eval()
            sess1.run(self.train_step, feed_dict={self.input_layer_fake: y})

agent = Policy()

train_array = []
features_array = []
labels_array = []

main_sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

for i_episode in range(100):
    observation = env.reset()

    for t in range(200):
        prevObservation = observation
        env.render()

        if np.random.uniform(0,1) < 0.2:
            action = env.action_space.sample()
        else:
            action = np.argmax(agent.predict((prevObservation)))

        observation, reward, done, info = env.step(action)

        add_in = np.random.uniform(0,1)
        if add_in < 0.5:
            features_array.append(prevObservation)
            sarPreprocessed = agent.predict(prevObservation)
            sarPreprocessed[0][action] = reward
            labels_array.append(sarPreprocessed)

        if done:
            break

    agent.train(features_array, labels_array)
    features_array = []
    labels_array = []
Any help is greatly appreciated.
It's been a while since I looked at this attempt at implementing policy gradients, but as far as I can recall, the problem is the loop I used inside the train function. As I iterate over each element of features_array, the array itself keeps growing in length (features_array is never reset to []), and the program slows down. Instead, I should train in a 'batched' fashion while periodically clearing out features_array.
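As an illustration only (this is not the code from the repository linked below), a batched version could build the graph, loss, optimizer, and session once, run a single training step over the whole episode, and rely on the main loop to clear the arrays afterwards. The batch placeholder shapes and the linear output layer are my own assumptions:

import tensorflow as tf
import numpy as np

class BatchedPolicy:
    def __init__(self):
        # A whole batch of observations, shape [batch_size, 4], instead of one [4, 1] input.
        self.inputs = tf.placeholder(tf.float32, [None, 4])
        self.targets = tf.placeholder(tf.float32, [None, 2])
        dense1 = tf.layers.dense(inputs=self.inputs, units=4, activation=tf.nn.relu)
        # Logits are left linear here; the softmax is applied inside the loss.
        self.logits = tf.layers.dense(inputs=dense1, units=2, activation=None)
        # Loss and train op are created exactly once, not once per sample.
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.targets,
                                                    logits=self.logits))
        self.train_step = tf.train.GradientDescentOptimizer(0.5).minimize(self.loss)
        # One session, initialised once and reused for every predict/train call.
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def predict(self, observation):
        return self.sess.run(self.logits,
                             feed_dict={self.inputs: np.reshape(observation, [1, 4])})

    def train(self, features_array, labels_array):
        # One batched update over the whole episode instead of a Python loop.
        self.sess.run(self.train_step,
                      feed_dict={self.inputs: np.asarray(features_array),
                                 self.targets: np.vstack(labels_array)})

The main loop would then call agent.train(features_array, labels_array) once per episode and reset both lists to [] immediately afterwards, so they never keep growing across episodes.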
I've implemented a cleaner version of the vanilla policy gradient algorithm here:
https://github.com/Ashboy64/rl-reimplementations/blob/master/Reimplementations/Vanilla-Policy-Gradient/vanilla_pg.py
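As a rough sketch of the underlying idea rather than a copy of that code (the layer sizes, learning rate, and placeholder shapes here are assumptions), a vanilla policy gradient loss weights the log-probability of each action actually taken by the return that followed it:

import tensorflow as tf

# REINFORCE-style loss for CartPole: 4-dimensional states, 2 discrete actions.
states = tf.placeholder(tf.float32, [None, 4])
actions = tf.placeholder(tf.int32, [None])     # actions actually taken
returns = tf.placeholder(tf.float32, [None])   # discounted return for each step

hidden = tf.layers.dense(states, 16, activation=tf.nn.relu)
logits = tf.layers.dense(hidden, 2)
log_probs = tf.nn.log_softmax(logits)

# Log-probability of the action taken at each timestep.
taken_log_probs = tf.reduce_sum(log_probs * tf.one_hot(actions, 2), axis=1)

# Maximise return-weighted log-probability, i.e. minimise its negative.
loss = -tf.reduce_mean(taken_log_probs * returns)
train_op = tf.train.AdamOptimizer(1e-2).minimize(loss)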
An implementation of a better-performing improved algorithm (still based on policy gradients) called PPO (Proximal Policy Optimization) can be found here:
https://github.com/Ashboy64/rl-reimplementations/tree/master/Reimplementations/PPO
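For reference, the core of PPO is its clipped surrogate objective. A minimal sketch of that loss follows; the clip range epsilon = 0.2 is a commonly used default and an assumption here, not a value taken from the linked code:

import tensorflow as tf

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    # new_log_probs: log pi_theta(a|s) for the actions taken, from the current policy.
    # old_log_probs: log pi_theta_old(a|s) recorded when the data was collected.
    # advantages:    advantage estimates for those same actions.
    # Probability ratio r_t(theta) = pi_theta(a|s) / pi_theta_old(a|s).
    ratio = tf.exp(new_log_probs - old_log_probs)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # PPO maximises the minimum of the clipped and unclipped terms,
    # so the loss to minimise is the negative of that minimum.
    return -tf.reduce_mean(tf.minimum(ratio * advantages,
                                      clipped_ratio * advantages))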