使用 .detach() 的 Pytorch DQN、DDQN 导致非常大的损失(呈指数增长)并且根本不学习
Pytorch DQN, DDQN using .detach() caused very wield loss (increases exponentially) and do not learn at all
这是我为 CartPole-v0 实现的 DQN 和 DDQN,我认为是正确的。
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
import torch.optim as optim
import random
import os
import time
class NETWORK(torch.nn.Module):
def __init__(self, input_dim: int, output_dim: int, hidden_dim: int) -> None:
super(NETWORK, self).__init__()
self.layer1 = torch.nn.Sequential(
torch.nn.Linear(input_dim, hidden_dim),
torch.nn.ReLU()
)
self.layer2 = torch.nn.Sequential(
torch.nn.Linear(hidden_dim, hidden_dim),
torch.nn.ReLU()
)
self.final = torch.nn.Linear(hidden_dim, output_dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.layer1(x)
x = self.layer2(x)
x = self.final(x)
return x
class ReplayBuffer(object):
def __init__(self, capacity=50000):
self.capacity = capacity
self.memory = []
self.position = 0
def push(self, s0, a0, r, s1):
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = (s0, a0, r, s1)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size=64):
return random.sample(self.memory, batch_size)
def __len__(self):
return len(self.memory)
class DQN(object):
def __init__(self):
self.state_dim = 4
self.action_dim = 2
self.lr = 0.001
self.discount_factor = 0.99
self.epsilon = 1
self.epsilon_decay = 0.95
self.num_train = 0
self.num_train_episodes = 0
self.batch_size = 64
self.predict_network = NETWORK(input_dim=4, output_dim=2, hidden_dim=16).double()
self.memory = ReplayBuffer(capacity=50000)
self.optimizer = torch.optim.Adam(self.predict_network.parameters(), lr=self.lr)
self.loss = 0
def select_action(self, states: np.ndarray) -> int:
if np.random.uniform(0, 1) < self.epsilon:
return np.random.choice(self.action_dim)
else:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def policy(self, states: np.ndarray) -> int:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def train(self, s0, a0, r, s1, sign):
if sign == 1:
self.num_train_episodes += 1
if self.epsilon > 0.01:
self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)
return
self.num_train += 1
self.memory.push(s0, a0, r, s1)
if len(self.memory) < self.batch_size:
return
batch = self.memory.sample(self.batch_size)
state_batch = torch.from_numpy(np.stack([b[0] for b in batch]))
action_batch = torch.from_numpy(np.stack([b[1] for b in batch]))
reward_batch = torch.from_numpy(np.stack([b[2] for b in batch]))
next_state_batch = torch.from_numpy(np.stack([b[3] for b in batch]))
Q_values = self.predict_network(state_batch)[torch.arange(self.batch_size), action_batch]
next_state_Q_values = self.predict_network(next_state_batch).max(dim=1)[0]
Q_targets = self.discount_factor * next_state_Q_values + reward_batch
loss = F.mse_loss(Q_values, Q_targets.detach())
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.loss = loss.data.item()
class DDQN(object):
def __init__(self):
self.state_dim = 4
self.action_dim = 2
self.lr = 0.001
self.discount_factor = 0.9
self.epsilon = 1
self.epsilon_decay = 0.95
self.num_train = 0
self.num_train_episodes = 0
self.batch_size = 64
self.predict_network = NETWORK(input_dim=4, output_dim=2, hidden_dim=16).double()
self.target_network = NETWORK(input_dim=4, output_dim=2, hidden_dim=16).double()
self.target_network.load_state_dict(self.predict_network.state_dict())
self.target_network.eval()
self.memory = ReplayBuffer(capacity=50000)
self.optimizer = torch.optim.Adam(self.predict_network.parameters(), lr=self.lr)
self.loss = 0
def select_action(self, states: np.ndarray) -> int:
if np.random.uniform(0, 1) < self.epsilon:
return np.random.choice(self.action_dim)
else:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def policy(self, states: np.ndarray) -> int:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def train(self, s0, a0, r, s1, sign):
if sign == 1:
self.num_train_episodes += 1
if self.num_train_episodes % 2 == 0:
self.target_network.load_state_dict(self.predict_network.state_dict())
self.target_network.eval()
if self.epsilon > 0.01:
self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)
return
self.num_train += 1
self.memory.push(s0, a0, r, s1)
if len(self.memory) < self.batch_size:
return
batch = self.memory.sample(self.batch_size)
state_batch = torch.from_numpy(np.stack([b[0] for b in batch]))
action_batch = torch.from_numpy(np.stack([b[1] for b in batch]))
reward_batch = torch.from_numpy(np.stack([b[2] for b in batch]))
next_state_batch = torch.from_numpy(np.stack([b[3] for b in batch]))
Q_values = self.predict_network(state_batch)[torch.arange(self.batch_size), action_batch]
next_state_action_batch = torch.argmax(self.predict_network(next_state_batch), dim=1)
next_state_Q_values = self.target_network(next_state_batch)[torch.arange(self.batch_size), next_state_action_batch]
Q_targets = self.discount_factor * next_state_Q_values + reward_batch
loss = F.smooth_l1_loss(Q_values, Q_targets.detach())
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.loss = loss.data.item()
我使用以下方法来评估和训练我的 DQN 和 DDQN。
def eval_policy(agent, env_name, eval_episodes=10):
eval_env = gym.make(env_name)
avg_reward = 0.
for _ in range(eval_episodes):
state, done = eval_env.reset(), False
while not done:
action = agent.policy(state)
state, reward, done, _ = eval_env.step(action)
avg_reward += reward
avg_reward /= eval_episodes
print("---------------------------------------")
print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
print("---------------------------------------")
return avg_reward
env_name = 'CartPole-v0'
env = gym.make(env_name)
agent = DQN() # agent = DDQN()
for i in range(1000):
state, done = env.reset(), False
episodic_reward = 0
while not done:
action = agent.select_action(np.squeeze(state))
next_state, reward, done, info = env.step(action)
episodic_reward += reward
sign = 1 if done else 0
agent.train(state, action, reward, next_state, sign)
state = next_state
print(f'episode: {i}, reward: {episodic_reward}')
if i % 20 == 0:
eval_reward = eval_policy(agent, env_name, eval_episodes=50)
if eval_reward >= 195:
print("Problem solved in {} episodes".format(i + 1))
break
问题是我的 DQN 网络没有训练,并且在损失计算中使用 target.detach() 损失呈指数增长。如果我不使用 .detach(),DQN 对象将进行训练,但我认为这不是正确的方法。对于 DDQN,我的网络总是不训练。任何人都可以就可能出错的地方提供一些建议吗?
所以您实施中的一个错误是您从未将剧集的结尾添加到重播缓冲区。在你的火车功能中,你 return if sign==1 (剧集结束)。删除 return 并通过 (1-dones)*... 调整目标计算,以防您对剧集结尾的过渡进行采样。剧集结尾之所以重要,是因为它是唯一的体验,目标不是通过自举逼近。然后DQN训练。为了再现性,我使用了 0.99 的折扣率和 2020 种子(用于 torch、numpy 和健身房环境)。训练241期后获得奖励199.100
希望对您有所帮助,顺便说一句,代码非常易读。
这是我为 CartPole-v0 实现的 DQN 和 DDQN,我认为是正确的。
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
import torch.optim as optim
import random
import os
import time
class NETWORK(torch.nn.Module):
def __init__(self, input_dim: int, output_dim: int, hidden_dim: int) -> None:
super(NETWORK, self).__init__()
self.layer1 = torch.nn.Sequential(
torch.nn.Linear(input_dim, hidden_dim),
torch.nn.ReLU()
)
self.layer2 = torch.nn.Sequential(
torch.nn.Linear(hidden_dim, hidden_dim),
torch.nn.ReLU()
)
self.final = torch.nn.Linear(hidden_dim, output_dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.layer1(x)
x = self.layer2(x)
x = self.final(x)
return x
class ReplayBuffer(object):
def __init__(self, capacity=50000):
self.capacity = capacity
self.memory = []
self.position = 0
def push(self, s0, a0, r, s1):
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = (s0, a0, r, s1)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size=64):
return random.sample(self.memory, batch_size)
def __len__(self):
return len(self.memory)
class DQN(object):
def __init__(self):
self.state_dim = 4
self.action_dim = 2
self.lr = 0.001
self.discount_factor = 0.99
self.epsilon = 1
self.epsilon_decay = 0.95
self.num_train = 0
self.num_train_episodes = 0
self.batch_size = 64
self.predict_network = NETWORK(input_dim=4, output_dim=2, hidden_dim=16).double()
self.memory = ReplayBuffer(capacity=50000)
self.optimizer = torch.optim.Adam(self.predict_network.parameters(), lr=self.lr)
self.loss = 0
def select_action(self, states: np.ndarray) -> int:
if np.random.uniform(0, 1) < self.epsilon:
return np.random.choice(self.action_dim)
else:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def policy(self, states: np.ndarray) -> int:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def train(self, s0, a0, r, s1, sign):
if sign == 1:
self.num_train_episodes += 1
if self.epsilon > 0.01:
self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)
return
self.num_train += 1
self.memory.push(s0, a0, r, s1)
if len(self.memory) < self.batch_size:
return
batch = self.memory.sample(self.batch_size)
state_batch = torch.from_numpy(np.stack([b[0] for b in batch]))
action_batch = torch.from_numpy(np.stack([b[1] for b in batch]))
reward_batch = torch.from_numpy(np.stack([b[2] for b in batch]))
next_state_batch = torch.from_numpy(np.stack([b[3] for b in batch]))
Q_values = self.predict_network(state_batch)[torch.arange(self.batch_size), action_batch]
next_state_Q_values = self.predict_network(next_state_batch).max(dim=1)[0]
Q_targets = self.discount_factor * next_state_Q_values + reward_batch
loss = F.mse_loss(Q_values, Q_targets.detach())
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.loss = loss.data.item()
class DDQN(object):
def __init__(self):
self.state_dim = 4
self.action_dim = 2
self.lr = 0.001
self.discount_factor = 0.9
self.epsilon = 1
self.epsilon_decay = 0.95
self.num_train = 0
self.num_train_episodes = 0
self.batch_size = 64
self.predict_network = NETWORK(input_dim=4, output_dim=2, hidden_dim=16).double()
self.target_network = NETWORK(input_dim=4, output_dim=2, hidden_dim=16).double()
self.target_network.load_state_dict(self.predict_network.state_dict())
self.target_network.eval()
self.memory = ReplayBuffer(capacity=50000)
self.optimizer = torch.optim.Adam(self.predict_network.parameters(), lr=self.lr)
self.loss = 0
def select_action(self, states: np.ndarray) -> int:
if np.random.uniform(0, 1) < self.epsilon:
return np.random.choice(self.action_dim)
else:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def policy(self, states: np.ndarray) -> int:
states = torch.from_numpy(states).unsqueeze_(dim=0)
with torch.no_grad():
Q_values = self.predict_network(states)
action = torch.argmax(Q_values).item()
return action
def train(self, s0, a0, r, s1, sign):
if sign == 1:
self.num_train_episodes += 1
if self.num_train_episodes % 2 == 0:
self.target_network.load_state_dict(self.predict_network.state_dict())
self.target_network.eval()
if self.epsilon > 0.01:
self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)
return
self.num_train += 1
self.memory.push(s0, a0, r, s1)
if len(self.memory) < self.batch_size:
return
batch = self.memory.sample(self.batch_size)
state_batch = torch.from_numpy(np.stack([b[0] for b in batch]))
action_batch = torch.from_numpy(np.stack([b[1] for b in batch]))
reward_batch = torch.from_numpy(np.stack([b[2] for b in batch]))
next_state_batch = torch.from_numpy(np.stack([b[3] for b in batch]))
Q_values = self.predict_network(state_batch)[torch.arange(self.batch_size), action_batch]
next_state_action_batch = torch.argmax(self.predict_network(next_state_batch), dim=1)
next_state_Q_values = self.target_network(next_state_batch)[torch.arange(self.batch_size), next_state_action_batch]
Q_targets = self.discount_factor * next_state_Q_values + reward_batch
loss = F.smooth_l1_loss(Q_values, Q_targets.detach())
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.loss = loss.data.item()
我使用以下方法来评估和训练我的 DQN 和 DDQN。
def eval_policy(agent, env_name, eval_episodes=10):
eval_env = gym.make(env_name)
avg_reward = 0.
for _ in range(eval_episodes):
state, done = eval_env.reset(), False
while not done:
action = agent.policy(state)
state, reward, done, _ = eval_env.step(action)
avg_reward += reward
avg_reward /= eval_episodes
print("---------------------------------------")
print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
print("---------------------------------------")
return avg_reward
env_name = 'CartPole-v0'
env = gym.make(env_name)
agent = DQN() # agent = DDQN()
for i in range(1000):
state, done = env.reset(), False
episodic_reward = 0
while not done:
action = agent.select_action(np.squeeze(state))
next_state, reward, done, info = env.step(action)
episodic_reward += reward
sign = 1 if done else 0
agent.train(state, action, reward, next_state, sign)
state = next_state
print(f'episode: {i}, reward: {episodic_reward}')
if i % 20 == 0:
eval_reward = eval_policy(agent, env_name, eval_episodes=50)
if eval_reward >= 195:
print("Problem solved in {} episodes".format(i + 1))
break
问题是我的 DQN 网络没有训练,并且在损失计算中使用 target.detach() 损失呈指数增长。如果我不使用 .detach(),DQN 对象将进行训练,但我认为这不是正确的方法。对于 DDQN,我的网络总是不训练。任何人都可以就可能出错的地方提供一些建议吗?
所以您实施中的一个错误是您从未将剧集的结尾添加到重播缓冲区。在你的火车功能中,你 return if sign==1 (剧集结束)。删除 return 并通过 (1-dones)*... 调整目标计算,以防您对剧集结尾的过渡进行采样。剧集结尾之所以重要,是因为它是唯一的体验,目标不是通过自举逼近。然后DQN训练。为了再现性,我使用了 0.99 的折扣率和 2020 种子(用于 torch、numpy 和健身房环境)。训练241期后获得奖励199.100
希望对您有所帮助,顺便说一句,代码非常易读。