How to configure Dueling Double DQN input_shape for samples with a shape of (169, 3) each?
TLDR
The input shape of my DoubleDuelingDQN is (169, 3) per sample. For the 3 corresponding actions, the output of this DDDQN should have shape (3). Currently, when I call
next_qs_list = self.target_network(next_states).numpy()
...with batch_size=64, the output shape is (64, 169, 3). My assumption is that this output shape is wrong and should be (64, 3).
My NN is currently configured as follows (where the call() function returns the wrong shape). How do I need to build my network so that it returns the correct shape (3) instead of (169, 3)?
class DuelingDeepQNetwork(keras.Model):
    def __init__(self, n_actions, neurons_1, neurons_2, neurons_3=None):
        super(DuelingDeepQNetwork, self).__init__()
        self.dens_1 = keras.layers.Dense(neurons_1, activation='relu', input_dim=(169,31,))  # Here I added input_dim which is not present in my LunarLander Agent
        self.dens_2 = keras.layers.Dense(neurons_2, activation='relu')
        if neurons_3:
            self.dens_3 = keras.layers.Dense(neurons_3, activation='relu')
        self.V = keras.layers.Dense(1, activation=None)          # Value layer
        self.A = keras.layers.Dense(n_actions, activation=None)  # Advantage layer

    def call(self, state):
        x = self.dens_1(state)
        x = self.dens_2(x)
        if self.dens_3:
            x = self.dens_3(x)
        V = self.V(x)
        A = self.A(x)
        Q = V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True))
        return Q

    def advantage(self, state):
        x = self.dens_1(state)
        x = self.dens_2(x)
        if self.dens_3:
            x = self.dens_3(x)
        A = self.A(x)
        return A
Updated error message:
ValueError: non-broadcastable output operand with shape (3,) doesn't match the broadcast shape (3,3)
raised in the last line of:
for idx, done in enumerate(dones):
    target_qs_list[idx, actions[idx]] = rewards[idx]
    tmp1 = self.gamma * next_qs_list[idx, max_actions[idx]]
    target_qs_list[idx, actions[idx]] += tmp1 * (1-int(dones[idx]))
Initial post:
I have (more or less) finished my custom RL environment, following the OpenAI Gym concept. Basically, the environment is a time series of OHLCV crypto prices, and env.reset() returns a window of shape (169, 31) - 169 timesteps and 31 features. With env.step(), the agent's observation window moves forward by one timestep. I want to start with 3 possible actions (do nothing / buy / sell):
self.action_space = spaces.Discrete(3)
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.HISTORY_LENGTH+1, self.df_sample.shape[1]), dtype=np.float32)
# (169, 31)
Now I cannot manage to migrate my existing DQN agent from LunarLander-v2 (built following several tutorials on YouTube and Medium). I assume my DQNetwork and/or MemoryBuffer is not set up correctly. I start by filling my memory with 1,000 samples from random actions. Then training starts and proceeds with agent.learn() calls, which raise the following error. I cannot interpret it, which is why I am asking for help here.
TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got <tf.Tensor: shape=(3,), dtype=int64, numpy=array([144, 165, 2])>
(outdated text removed)
It is exactly this update loop that raises the error:
for idx, done in enumerate(dones):
    target_qs_list[idx, actions[idx]] = rewards[idx] + self.gamma * next_qs_list[idx, max_actions[idx]] * (1-int(dones[idx]))
Since my debugging skills and my knowledge of Python + Keras + TF end here, any help is greatly appreciated.
Here is the code of my agent. If more code or information is needed, I will gladly provide it.
class ReplayBuffer():
    def __init__(self, max_mem_size, dims):
        # self.memory = max_mem_size
        self.state_memory = np.zeros((max_mem_size, *dims), dtype=np.float32)      # Here I added "*" to unpack the now 2D observation (169, 31)
        self.action_memory = np.zeros(max_mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(max_mem_size, dtype=np.float32)
        self.new_state_memory = np.zeros((max_mem_size, *dims), dtype=np.float32)  # Here I added "*" to unpack the now 2D observation (169, 31)
        self.done_memory = np.zeros(max_mem_size, dtype=np.int32)
        self.max_mem_size = max_mem_size
        self.mem_counter = 0
        self.index = 0

    def store_transition(self, transition):
        '''
        :param transition: Tuple of transition data (state, action, reward, new_state, done)
        :return: Nothing
        '''
        self.state_memory[self.index] = transition[0]
        self.action_memory[self.index] = transition[1]
        self.reward_memory[self.index] = transition[2]
        self.new_state_memory[self.index] = transition[3]
        self.done_memory[self.index] = transition[4]
        self.mem_counter += 1
        if self.index < self.max_mem_size - 1:
            self.index += 1
        else:
            self.index = 0

    def get_sample_batch(self, batch_size, replace=False):
        '''
        :param batch_size: Number of samples for batch
        :param replace: Whether or not double entries are allowed in returned batch
        :return: Tuples of transition data (state, action, reward, new_state, done)
        '''
        max_size = min(self.mem_counter, self.max_mem_size)
        batch_ids = np.random.default_rng().choice(max_size, batch_size, replace)
        states = self.state_memory[batch_ids]
        actions = self.action_memory[batch_ids]
        rewards = self.reward_memory[batch_ids]
        new_states = self.new_state_memory[batch_ids]
        dones = self.done_memory[batch_ids]
        return states, actions, rewards, new_states, dones


class DuelingDeepQNAgent():
    def __init__(self, lr, gamma, env, batch_size=64, mem_size=1_000_000, update_target_every=50):
        self.n_actions = env.action_space.n
        self.input_dims = env.observation_space.shape  # env.observation_space.shape[0]
        self.action_space = [i for i in range(self.n_actions)]
        self.gamma = gamma
        self.epsilon = 1.0
        self.batch_size = batch_size
        self.memory = ReplayBuffer(max_mem_size=mem_size, dims=self.input_dims)
        self.update_target_every = update_target_every
        self.update_target_counter = 0
        self.learn_step_counter = 0
        # Main model - gets trained every single step()
        self.q_network = DuelingDeepQNetwork(n_actions=self.n_actions, neurons_1=256, neurons_2=256, neurons_3=128)
        self.target_network = DuelingDeepQNetwork(n_actions=self.n_actions, neurons_1=256, neurons_2=256, neurons_3=128)
        self.q_network.compile(optimizer=Adam(learning_rate=lr), loss='mse')
        self.target_network.compile(optimizer=Adam(learning_rate=lr), loss='mse')

    def store_transition(self, transition):
        self.memory.store_transition(transition)

    def choose_action(self, observation):
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])  # Add in an extra dimension -> effectively adding a "batch dimension"
            q_values = self.q_network.advantage(state)
            action = tf.math.argmax(q_values, axis=1).numpy()[0]
        return action

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return
        if self.update_target_counter % self.update_target_every == 0:
            self.target_network.set_weights(self.q_network.get_weights())
        current_states, actions, rewards, next_states, dones = self.memory.get_sample_batch(self.batch_size)
        current_qs_list = self.q_network(current_states)
        next_qs_list = self.target_network(next_states)
        target_qs_list = current_qs_list.numpy()  # ??? From Tensor to Numpy?!
        max_actions = tf.math.argmax(self.q_network(next_states), axis=1)
        # According to Phil: improve on my solution here....
        for idx, done in enumerate(dones):
            target_qs_list[idx, actions[idx]] = rewards[idx] + self.gamma * next_qs_list[idx, max_actions[idx]] * (1-int(dones[idx]))
        self.q_network.fit(current_states, target_qs_list, batch_size=self.batch_size, verbose=0)
        self.learn_step_counter += 1
You cannot use a Dense layer on 2D inputs per sample (i.e. 3D inputs including the batch dimension) - it will only take the last dimension as its input. Dense layers operate on 1D input tensors, supplied as a 2D batch.
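A quick standalone shape check (not from the original post, assuming the (169, 31) window described for the environment) illustrates this behaviour: Dense only transforms the last axis, so the 169 time axis survives into the output.

import tensorflow as tf
from tensorflow import keras

# Hypothetical batch of 64 observation windows: 169 timesteps x 31 features
x = tf.random.normal((64, 169, 31))
dense = keras.layers.Dense(256, activation='relu')

y = dense(x)
print(y.shape)  # (64, 169, 256) - only the last dimension changes, the time axis is kept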
Either add a Flatten layer (keras.layers.Flatten()) to flatten your input (e.g. to a (batch_size, 169*3) tensor), or use a convolutional network.
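A rough sketch of the Flatten variant of that suggestion (not the asker's final code; the guard for the optional third layer is also adjusted here so it does not fail when neurons_3 is None): flatten the window before the first Dense layer, so both heads and Q come out as (batch_size, n_actions).

import tensorflow as tf
from tensorflow import keras

class DuelingDeepQNetwork(keras.Model):
    def __init__(self, n_actions, neurons_1, neurons_2, neurons_3=None):
        super(DuelingDeepQNetwork, self).__init__()
        self.flatten = keras.layers.Flatten()  # (batch, 169, n_features) -> (batch, 169*n_features)
        self.dens_1 = keras.layers.Dense(neurons_1, activation='relu')
        self.dens_2 = keras.layers.Dense(neurons_2, activation='relu')
        self.dens_3 = keras.layers.Dense(neurons_3, activation='relu') if neurons_3 else None
        self.V = keras.layers.Dense(1, activation=None)          # state-value head
        self.A = keras.layers.Dense(n_actions, activation=None)  # advantage head

    def call(self, state):
        x = self.flatten(state)
        x = self.dens_1(x)
        x = self.dens_2(x)
        if self.dens_3 is not None:
            x = self.dens_3(x)
        V = self.V(x)  # (batch, 1)
        A = self.A(x)  # (batch, n_actions)
        Q = V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True))
        return Q       # (batch, n_actions)

With a change along these lines, self.target_network(next_states) for a batch of 64 should come out with shape (64, 3).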
Also, your self.A layer seems to provide Q-values for each action given the state. The advantage would be the action value minus the value estimate; I think that is what you compute in call().
If flattening does not fix your output shape, I would double-check the shapes of the operands that go into Q, as well as the shape of the returned Q.
print(Q.shape) will be your best friend for debugging!