Training a TensorFlow model on OpenAI CartPole
I am implementing my first deep reinforcement learning model with TensorFlow: the CartPole problem.
I settled on a deep neural network with six layers, trained on a dataset generated from random play, keeping only the episodes whose score exceeded a threshold. The problem is that the model does not converge, and the final score stays around 10 on average.
Following advice from some posts I read, I applied regularization and dropout to reduce any overfitting that might be happening, but still no success. I have also tried lowering the learning rate.
After training, the accuracy also stays around 0.60 even though the loss decreases on every iteration, so I suspect the network is simply memorizing the data despite these measures.
A model like this works fine for simple deep learning tasks, though.
Here is my code:
import numpy as np
import tensorflow as tf
import gym
import os
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
model_path = "C:/Users/sanka/codes/cart pole problem/tf_save3"
env = gym.make("CartPole-v0")
env.reset()


def train_set():  # training set generation function
    # Load a cached dataset if it exists; otherwise generate one by playing
    # random games and keeping only the episodes that scored above 50.
    try:
        tx = np.load("final_trainx.npy")
        ty = np.load("final_trainy.npy")
        return tx, ty
    except:
        tx = []
        ty = []
        for _ in range(10000):
            env.reset()
            score = 0
            moves = []
            obs = []
            p = []
            for _ in range(500):
                action = np.random.randint(0, 2)
                observation, reward, done, info = env.step(action)
                if (len(p) == 0):
                    p = observation
                else:
                    moves += [action]
                    obs += [observation]
                    p = observation
                score += reward
                if done:
                    break
            if (score > 50):
                tx += obs
                for i in range(len(moves)):
                    ac = moves[i]
                    if (ac == 1):
                        ty.append([0, 1])
                    else:
                        ty.append([1, 0])
        tx = np.array(tx)
        ty = np.array(ty)
        np.save("final_trainx.npy", tx)
        np.save("final_trainy.npy", ty)
        return tx, ty


# Weights and biases of a six-layer fully connected network: 4 -> 128 -> 256 -> 512 -> 256 -> 128 -> 2
weights = {
    1: tf.Variable(tf.truncated_normal([4, 128]), dtype=tf.float32),
    2: tf.Variable(tf.truncated_normal([128, 256]), dtype=tf.float32),
    3: tf.Variable(tf.truncated_normal([256, 512]), dtype=tf.float32),
    4: tf.Variable(tf.truncated_normal([512, 256]), dtype=tf.float32),
    5: tf.Variable(tf.truncated_normal([256, 128]), dtype=tf.float32),
    6: tf.Variable(tf.truncated_normal([128, 2]), dtype=tf.float32)
}
biases = {
    1: tf.Variable(tf.truncated_normal([128]), dtype=tf.float32),
    2: tf.Variable(tf.truncated_normal([256]), dtype=tf.float32),
    3: tf.Variable(tf.truncated_normal([512]), dtype=tf.float32),
    4: tf.Variable(tf.truncated_normal([256]), dtype=tf.float32),
    5: tf.Variable(tf.truncated_normal([128]), dtype=tf.float32),
    6: tf.Variable(tf.truncated_normal([2]), dtype=tf.float32)
}


def neural_network(x):
    # Training graph: ReLU layers with dropout, returns raw logits.
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[1]), biases[1]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[2]), biases[2]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[3]), biases[3]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[4]), biases[4]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[5]), biases[5]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.add(tf.matmul(x, weights[6]), biases[6])
    return x


def test_nn(x):
    # Inference graph: same weights, no dropout, returns softmax probabilities.
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[1]), biases[1]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[2]), biases[2]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[3]), biases[3]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[4]), biases[4]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[5]), biases[5]))
    x = tf.nn.softmax(tf.add(tf.matmul(x, weights[6]), biases[6]))
    return x


def train_nn():
    # Cross-entropy loss plus L2 regularization on all weight matrices.
    prediction = neural_network(x)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    lo = tf.nn.l2_loss(weights[1]) + tf.nn.l2_loss(weights[2]) + tf.nn.l2_loss(weights[3]) + tf.nn.l2_loss(weights[4]) + tf.nn.l2_loss(weights[5]) + tf.nn.l2_loss(weights[6])
    loss = tf.reduce_mean(loss + 0.01 * lo)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
    test_pred = test_nn(x)
    correct = tf.equal(tf.argmax(test_pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        epoches = 5
        batch_size = 100
        for j in range(epoches):
            ep_loss = 0
            for i in range(0, len(train_x), batch_size):
                epoch_x = train_x[i:min(i + batch_size, len(train_x))]
                epoch_y = train_y[i:min(i + batch_size, len(train_y))]
                _, c = sess.run([optimizer, loss], feed_dict={x: epoch_x, y: epoch_y})
                ep_loss += c
                # print("Accuracy is {0}".format(sess.run(accuracy, feed_dict={x: epoch_x, y: epoch_y})))
            print("epoch {0} completed out of {1} with loss {2}".format(j, epoches, ep_loss))
            print("Accuracy is {0}".format(sess.run(accuracy, feed_dict={x: train_x, y: train_y})))
        # Play 10 games with the trained network and report the average score.
        scores = []
        choices = []
        for each_game in range(10):
            print("game ", each_game)
            score = 0
            game_memory = []
            prev_obs = []
            env.reset()
            for _ in range(500):
                env.render()
                if (len(prev_obs) == 0):
                    action = random.randrange(0, 2)
                else:
                    x1 = np.array([prev_obs]).reshape(-1, 4)
                    a = tf.argmax(test_pred, 1)
                    action = sess.run(a, feed_dict={x: x1})
                    action = action[0]
                choices.append(action)
                new_observation, reward, done, info = env.step(action)
                prev_obs = new_observation
                game_memory.append([new_observation, action])
                score += reward
                if done:
                    break
            scores.append(score)
        print('Average Score:', sum(scores) / len(scores))
        print('choice 1:{} choice 0:{}'.format(choices.count(1) / len(choices), choices.count(0) / len(choices)))


train_x, train_y = train_set()
print(train_x.shape)
print(train_y.shape)
x = tf.placeholder(tf.float32, [None, 4])
y = tf.placeholder(tf.int32, [None, 2])
train_nn()
So you first collect examples from random trials that happened to perform reasonably well, and then train your model on those examples?
What you are doing is not really reinforcement learning. You are assuming that the actions taken by a random agent are good, and learning to imitate it. So if you think about it, your model correctly predicts the random agent's behaviour about 60% of the time. Considering that those actions were random, being above 50% means you are actually doing fairly well.
You only get above 50% because you selected only the random games that happened to score more than 50 points, so they form a non-random subset of all games. If you raise the bar and only keep random games that scored above 100 points, or something like that, you should get better results, because that way you select more good games than bad ones.
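As a rough sketch (not part of the original code), only the cutoff in the filter inside train_set() would need to change; the 100-point value below is just an illustrative choice:

SCORE_CUTOFF = 100  # illustrative value, stricter than the original 50

if score > SCORE_CUTOFF:
    # keep only episodes where the random agent did unusually well
    tx += obs
    for ac in moves:
        ty.append([0, 1] if ac == 1 else [1, 0])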
If you want to tackle the problem in a more reinforcement-learning fashion, i.e. learning while playing rather than learning from someone else's games, I suggest you look at Q-learning or policy learning.
The main thing to keep in mind is that there is usually no single correct action to take; different actions may lead to the same outcome. So instead of trying to predict which action is correct in a given state, try to predict the expected outcome of each action in that state, and then choose the action with the best expected outcome.
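To make that concrete, here is a minimal sketch of tabular Q-learning on CartPole (not the poster's code; the bin edges and hyperparameters are illustrative assumptions, and it uses the same classic gym API as the question). Instead of a one-hot "correct action" label, each table entry estimates the expected return of an action in a discretized state, and the update nudges it toward reward + gamma * max of the next state's values:

import numpy as np
import gym

env = gym.make("CartPole-v0")

# Discretize each of the 4 observation dimensions into bins (illustrative edges).
bins = [
    np.linspace(-2.4, 2.4, 9),    # cart position
    np.linspace(-3.0, 3.0, 9),    # cart velocity
    np.linspace(-0.21, 0.21, 9),  # pole angle (radians)
    np.linspace(-3.0, 3.0, 9),    # pole angular velocity
]

def discretize(obs):
    # Map a continuous observation to a tuple of bin indices.
    return tuple(int(np.digitize(o, b)) for o, b in zip(obs, bins))

# One expected-return estimate per (discretized state, action) pair.
q_table = np.zeros([len(b) + 1 for b in bins] + [env.action_space.n])

alpha, gamma, epsilon = 0.1, 0.99, 0.1  # learning rate, discount, exploration rate

for episode in range(5000):
    state = discretize(env.reset())
    done = False
    while not done:
        # Epsilon-greedy: mostly pick the action with the best expected return.
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(q_table[state]))
        obs, reward, done, info = env.step(action)
        next_state = discretize(obs)
        # Q-learning update: move Q(s, a) toward reward + gamma * max_a' Q(s', a').
        target = reward + gamma * np.max(q_table[next_state]) * (not done)
        q_table[state + (action,)] += alpha * (target - q_table[state + (action,)])
        state = next_state

The same idea carries over to a Q-network (DQN), where a neural network replaces the table but the regression target is still an expected return rather than a one-hot action label.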