Using DQN to solve the shortest path problem
I am trying to find out whether a DQN can solve the shortest-path problem.
I have a DataFrame with a source column containing node IDs, an end column representing the destination (also node IDs), and a weight column representing the edge distance. I then convert the DataFrame into a graph, as follows.
DataFrame:
source end weight
0 688615041 208456626 15.653688122127072
1 688615041 1799221665 10.092266065922756
2 1799221657 1799221660 8.673942902872051
3 1799221660 1799221665 15.282152665774992
4 1799221660 2003461246 25.85307821157314
5 1799221660 299832604 75.99884525624508
6 299832606 2003461227 4.510148061854331
7 299832606 2003461246 10.954119220974723
8 299832606 2364408910 4.903114362426424
9 1731824802 2003461235 6.812335798968233
10 1799221677 208456626 8.308567154008992
11 208456626 2003461246 14.56512909988425
12 208456626 1250468692 16.416527267975034
13 1011881546 1250468696 12.209773608913697
14 1011881546 2003461246 7.477102764665149
15 2364408910 1130166767 9.780352545373274
16 2364408910 2003461246 6.660771089602594
17 2364408910 2003461237 3.125301826317477
18 2364408911 2003461240 3.836966849565568
19 2364408911 2003461246 6.137847950353395
20 2364408911 2003461247 7.399469477211698
21 2364408911 2003461237 3.90876793066916
22 1250468692 1250468696 8.474825189804282
23 1250468701 2003461247 4.539111170687284
24 2003461235 2003461246 12.400601105777394
25 2003461246 2003461247 12.437602668573737
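A minimal sketch of that conversion, assuming the DataFrame above is stored in a variable named df:

# build an undirected weighted graph from the edge list
# (df is assumed to be the DataFrame shown above)
import networkx as nx
import matplotlib.pyplot as plt

g = nx.from_pandas_edgelist(df, source='source', target='end', edge_attr='weight')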
The graph looks like this:
pos = nx.spring_layout(g)
edge_labels = nx.get_edge_attributes(g, 'weight')
nx.draw(g, pos, node_size=100)
nx.draw_networkx_edge_labels(g, pos, edge_labels, font_size=8)
nx.draw_networkx_labels(g, pos, font_size=10)
plt.title("Syntethic representation of the City")
plt.show()
print('Total number of Nodes: '+str(len(g.nodes)))
(figure: plot of the graph)
Now I run a DQN with fixed start and goal states: node 1130166767 is the start and node 1731824802 is the goal.
Here is my full code:
import random
from collections import deque

import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

class Network(nn.Module):
    def __init__(self,input_dim,n_action):
        super(Network,self).__init__()
        self.f1=nn.Linear(input_dim,128)
        self.f2=nn.Linear(128,64)
        self.f3=nn.Linear(64,32)
        self.f4=nn.Linear(32,n_action)
        #self.optimizer=optim.Adam(self.parameters(),lr=lr)
        #self.loss=nn.MSELoss()
        self.device=T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self,x):
        x=F.relu(self.f1(x))
        x=F.relu(self.f2(x))
        x=F.relu(self.f3(x))
        x=self.f4(x)
        return x

    def act(self,obs):
        #state=T.tensor(obs).to(device)
        state=obs.to(self.device)
        actions=self.forward(state)
        action=T.argmax(actions).item()
        return action

device=T.device('cuda' if T.cuda.is_available() else 'cpu')
print(device)

num_states = len(g.nodes)*1
### if we need to train a specific set of nodes, e.g. 10, we multiply by 10
num_actions = len(g.nodes)
print("Expected number of states: "+str(num_states))
print("Expected number of actions: "+str(num_actions))

# input_dim = num_actions*2: the state is one-hot encoded as two concatenated
# vectors (current node + goal node), i.e. 22+22
online=Network(num_actions*2,num_actions)
target=Network(num_actions*2,num_actions)
target.load_state_dict(online.state_dict())
optimizer=T.optim.Adam(online.parameters(),lr=5e-4)

# dictionaries that map each raw node id to a dense index and back
# (otherwise reset() would return a huge id such as 476562122273,
# far larger than the number of states)
enc_node={}
dec_node={}
for index,nd in enumerate(g.nodes):
    enc_node[nd]=index
    dec_node[index]=nd

def wayenc(current_node,new_node,type=1):
    # reward for moving between two (decoded) nodes
    if type==1: #distance
        if new_node in g[current_node]:
            rw=g[current_node][new_node]['weight']*-1
            return rw,True
        rw=-5000  # penalty for choosing a node that is not a neighbour
        return rw,False

def rw_function(current,action):
    # current: encoded current node, action: encoded next node
    beta=1 #between 1 and 0
    current=dec_node[current]
    new_node=dec_node[action]
    rw0,link=wayenc(current,new_node)
    rw1=0
    frw=rw0*beta+(1-beta)*rw1
    return frw,link

def state_enc(dst, end, n=len(g.nodes)):
    return dst+n*end

def state_dec(state, n=len(g.nodes)):
    dst = state%n
    end = (state-dst)/n
    return dst, int(end)

def step(state,action):
    done=False
    current_node , end = state_dec(state)
    new_state = state_enc(action,end)
    rw,link=rw_function(current_node,action)
    if not link:
        new_state = state
        return new_state,rw,False
    elif action == end:
        rw = 10000 #500*12
        done=True
    return new_state,rw,done

def reset():
    state=state_enc(enc_node[1130166767],enc_node[1731824802])
    return state

def state_to_vector(current_node,end_node):
    n=len(g.nodes)
    source_state_zeros=[0.]*n
    source_state_zeros[current_node]=1.
    end_state_zeros=[0.]*n
    end_state_zeros[end_node]=1.
    vector=source_state_zeros+end_state_zeros
    return vector

# return a list of lists, converting each encoded state to its one-hot vector
def list_of_vectors(new_obses_t):
    list_new_obss_t=new_obses_t.tolist()
    #convert to integer
    list_new_obss_t=[int(v) for v in list_new_obss_t]
    vector_list=[]
    for state in list_new_obss_t:
        s,f=state_dec(state)
        vector=state_to_vector(s,f)
        vector_list.append(vector)
    return vector_list

#fill the replay buffer
#replay_buffer=[]
rew_buffer=[0]
penalties=[]
episode_reward=0.0
batch_size=num_actions*2
buffer_size=100000
min_replay_size=int(buffer_size*0.20)
target_update_freq=1000
flag=0
action_list=np.arange(0,len(g.nodes)).tolist()
replay_buffer=deque(maxlen=buffer_size)

# populate the replay buffer with random transitions
obs=reset()
#obs,end=state_dec(start,len(g.nodes))
for _ in tqdm(range(min_replay_size)):
    action=np.random.choice(action_list)
    new_obs,rew,done=step(obs,action)
    transition=(obs,action,rew,done,new_obs)
    replay_buffer.append(transition)
    obs=new_obs
    if done:
        obs=reset()

#main training loop
obs=reset()
episodes=100000
start=1
end=0.1
decay=episodes
gamma=0.99
epsilon=0.5
gamma_list=[]
mean_reward=[]
done_location=[]
loss_list=[]
number_of_episodes=[]
stat_dict={'episodes':[],'epsilon':[],'explore_exploit':[],'time':[]}
for i in tqdm(range(episodes)):
    itr=0
    #epsilon=np.interp(i,[0,decay],[start,end])
    #gamma=np.interp(i,[0,decay],[start,end])
    epsilon=np.exp(-i/(episodes/3))
    rnd_sample=random.random()
    stat_dict['episodes'].append(i)
    stat_dict['epsilon'].append(epsilon)
    #choose an action (epsilon-greedy)
    if rnd_sample <= epsilon:
        action=np.random.choice(action_list)
        stat_dict['explore_exploit'].append('explore')
    else:
        source,end=state_dec(obs)
        v_obs=state_to_vector(source,end)
        t_obs=T.tensor(v_obs)
        action=online.act(t_obs)
        stat_dict['explore_exploit'].append('exploit')
    #build the transition and append it to the replay buffer
    new_obs,rew,done=step(obs,action)
    transition=(obs,action,rew,done,new_obs)
    replay_buffer.append(transition)
    obs=new_obs
    episode_reward+=rew
    if done:
        obs=reset()
        rew_buffer.append(episode_reward)
        episode_reward=0.0
        done_location.append(i)
    #gradient step
    transitions=random.sample(replay_buffer,batch_size)
    obses=np.asarray([t[0] for t in transitions])
    actions=np.asarray([t[1] for t in transitions])
    rews=np.asarray([t[2] for t in transitions])
    dones=np.asarray([t[3] for t in transitions])
    new_obses=np.asarray([t[4] for t in transitions])
    obses_t=T.as_tensor(obses,dtype=T.float32).to(device)
    actions_t=T.as_tensor(actions,dtype=T.int64).to(device).unsqueeze(-1)
    rews_t=T.as_tensor(rews,dtype=T.float32).to(device)
    dones_t=T.as_tensor(dones,dtype=T.float32).to(device)
    new_obses_t=T.as_tensor(new_obses,dtype=T.float32).to(device)
    list_new_obses_t=T.tensor(list_of_vectors(new_obses_t)).to(device)
    target_q_values=target(list_new_obses_t)
    max_target_q_values=target_q_values.max(dim=1,keepdim=False)[0]
    targets=rews_t+gamma*(1-dones_t)*max_target_q_values
    list_obses_t=T.tensor(list_of_vectors(obses_t)).to(device)
    q_values=online(list_obses_t)
    action_q_values=T.gather(input=q_values,dim=1,index=actions_t)
    # UserWarning: Using a target size (torch.Size([24, 24])) that is different to the input size
    # (torch.Size([24, 1])). This will likely lead to incorrect results due to broadcasting.
    # Please ensure they have the same size. -> make the shapes match before the loss:
    targets=targets.unsqueeze(-1)
    loss=nn.functional.mse_loss(action_q_values,targets)
    #loss=rmsle(action_q_values,targets)
    loss_list.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    #plot
    mean_reward.append(np.mean(rew_buffer))
    number_of_episodes.append(i)
    gamma_list.append(gamma)
    dec = {'number_of_episodes':number_of_episodes,'mean_reward':mean_reward,'gamma':gamma_list}
    #clear_output(wait=True)
    #sns.lineplot(data=dec, x="number_of_episodes", y="mean_reward")
    #plt.show()
    if i % target_update_freq==0:
        target.load_state_dict(online.state_dict())
    if i % 1000 ==0:
        print('step',i,'avg rew',round(np.mean(rew_buffer),2))
        pass
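For reference, the reward and loss curves referred to below are produced roughly like this (a minimal sketch using the number_of_episodes, mean_reward and loss_list collected in the loop):

# plot the running mean reward and the training loss collected above
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(number_of_episodes, mean_reward)
ax1.set_xlabel('step')
ax1.set_ylabel('mean reward')
ax2.plot(loss_list)
ax2.set_xlabel('gradient step')
ax2.set_ylabel('MSE loss')
plt.show()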
Now, as you can see from the plots, neither the reward increases nor the loss decreases. I have tried the following:
1. Increasing and decreasing the learning rate
2. Changing target_update_freq (100, 1000, ...)
3. Changing the state representation from a one-hot vector to a [state, end] pair
4. Changing the loss function: mse_loss, smooth_l1, etc.
5. Increasing the number of episodes
6. Adding another layer to the network
7. Changing the epsilon decay schedule: linear, exponential (see the sketch at the end of this question)
Most of these suggestions come from other Stack Overflow questions, but none of them helped.
How can I improve performance? Or, in other words, how can I increase the reward?
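For item 7, here is a small sketch of the two epsilon schedules I compared, using the same episodes value as above (linear decay via np.interp, as in the commented-out line, versus the exponential decay used in the loop):

# the two epsilon-decay schedules tried: linear vs. exponential
steps = np.arange(episodes)
eps_linear = np.interp(steps, [0, episodes], [1.0, 0.1])   # linear, from start=1 down to end=0.1
eps_exp = np.exp(-steps / (episodes / 3))                  # exponential, as used in the loop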
It looks like your problem just needs some parameter tuning:
- I changed your learning rate to 0.02
- I changed the dimensions of the state sent to the NN (a batch dimension is added: T.tensor([v_obs]))
# same imports and graph g as in the question above
class Network(nn.Module):
    def __init__(self,input_dim,n_action):
        super(Network,self).__init__()
        self.f1=nn.Linear(input_dim,128)
        self.f2=nn.Linear(128,64)
        self.f3=nn.Linear(64,32)
        self.f4=nn.Linear(32,n_action)
        #self.optimizer=optim.Adam(self.parameters(),lr=lr)
        #self.loss=nn.MSELoss()
        self.device=T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self,x):
        x=F.relu(self.f1(x))
        x=F.relu(self.f2(x))
        x=F.relu(self.f3(x))
        x=self.f4(x)
        return x

    def act(self,obs):
        #state=T.tensor(obs).to(device)
        state=obs.to(self.device)
        actions=self.forward(state)
        action=T.argmax(actions).item()
        return action

device=T.device('cuda' if T.cuda.is_available() else 'cpu')
print(device)

num_states = len(g.nodes)**2
num_actions = len(g.nodes)

online=Network(num_actions*2,num_actions)
target=Network(num_actions*2,num_actions)
target.load_state_dict(online.state_dict())
optimizer=T.optim.Adam(online.parameters(),lr=1e-2)

enc_node={}
dec_node={}
for index,nd in enumerate(g.nodes):
    enc_node[nd]=index
    dec_node[index]=nd

def wayenc(current_node,new_node,type=1):
    # reward for moving between two (decoded) nodes
    if type==1: #distance
        if new_node in g[current_node]:
            rw=g[current_node][new_node]['weight']*-1
            return rw,True
        rw=-1000
        return rw,False

def rw_function(current,action):
    beta=1
    current=dec_node[current]
    new_node=dec_node[action]
    rw0,link=wayenc(current,new_node)
    rw1=0
    frw=rw0*beta+(1-beta)*rw1
    return frw,link

def state_enc(dst, end, n=len(g.nodes)):
    return dst+n*end

def state_dec(state, n=len(g.nodes)):
    dst = state%n
    end = (state-dst)/n
    return dst, int(end)

def step(state,action):
    done=False
    current_node , end = state_dec(state)
    new_state = state_enc(action,end)
    rw,link=rw_function(current_node,action)
    if not link:
        new_state = state
        return new_state,rw,False
    elif action == end:
        rw = 10000
        done=True
    return new_state,rw,done

def reset():
    state=state_enc(enc_node[1130166767],enc_node[1731824802])
    return state

def state_to_vector(current_node,end_node):
    n=len(g.nodes)
    source_state_zeros=[0.]*n
    source_state_zeros[current_node]=1.
    end_state_zeros=[0.]*n
    end_state_zeros[end_node]=1.
    vector=source_state_zeros+end_state_zeros
    return vector

# return a list of lists, converting each encoded state to its one-hot vector
def list_of_vectors(new_obses_t):
    list_new_obss_t=new_obses_t.tolist()
    #convert to integer
    list_new_obss_t=[int(v) for v in list_new_obss_t]
    vector_list=[]
    for state in list_new_obss_t:
        s,f=state_dec(state)
        vector=state_to_vector(s,f)
        vector_list.append(vector)
    return vector_list

#replay_buffer=[]
rew_buffer=[0]
penalties=[]
episode_reward=0.0
#batch_size=num_actions*2
batch_size=32
buffer_size=50000
min_replay_size=int(buffer_size*0.25)
target_update_freq=1000
flag=0
action_list=np.arange(0,len(g.nodes)).tolist()
replay_buffer=deque(maxlen=min_replay_size)

# populate the replay buffer with random transitions
obs=reset()
#obs,end=state_dec(start,len(g.nodes))
for _ in tqdm(range(min_replay_size)):
    action=np.random.choice(action_list)
    new_obs,rew,done=step(obs,action)
    transition=(obs,action,rew,done,new_obs)
    replay_buffer.append(transition)
    obs=new_obs
    if done:
        obs=reset()

#main training loop
obs=reset()
episodes=70000
start=1
end=0.1
decay=episodes
gamma=0.99
epsilon=0.5
gamma_list=[]
mean_reward=[]
done_location=[]
loss_list=[]
number_of_episodes=[]
stat_dict={'episodes':[],'epsilon':[],'explore_exploit':[],'time':[]}
for i in tqdm(range(episodes)):
    itr=0
    epsilon=np.exp(-i/(episodes/2))
    rnd_sample=random.random()
    stat_dict['episodes'].append(i)
    stat_dict['epsilon'].append(epsilon)
    if rnd_sample <= epsilon:
        action=np.random.choice(action_list)
        stat_dict['explore_exploit'].append('explore')
    else:
        source,end=state_dec(obs)
        v_obs=state_to_vector(source,end)
        t_obs=T.tensor([v_obs])  # note the extra batch dimension
        action=online.act(t_obs)
        stat_dict['explore_exploit'].append('exploit')
    new_obs,rew,done=step(obs,action)
    transition=(obs,action,rew,done,new_obs)
    replay_buffer.append(transition)
    obs=new_obs
    episode_reward+=rew
    if done:
        obs=reset()
        rew_buffer.append(episode_reward)
        episode_reward=0.0
        done_location.append(i)
    batch_size=32
    transitions=random.sample(replay_buffer,batch_size)
    obses=np.asarray([t[0] for t in transitions])
    actions=np.asarray([t[1] for t in transitions])
    rews=np.asarray([t[2] for t in transitions])
    dones=np.asarray([t[3] for t in transitions])
    new_obses=np.asarray([t[4] for t in transitions])
    obses_t=T.as_tensor(obses,dtype=T.float32).to(device)
    actions_t=T.as_tensor(actions,dtype=T.int64).to(device).unsqueeze(-1)
    rews_t=T.as_tensor(rews,dtype=T.float32).to(device)
    dones_t=T.as_tensor(dones,dtype=T.float32).to(device)
    new_obses_t=T.as_tensor(new_obses,dtype=T.float32).to(device)
    list_new_obses_t=T.tensor(list_of_vectors(new_obses_t)).to(device)
    target_q_values=target(list_new_obses_t)
    #target_q_values=target(obses_t)
    max_target_q_values=target_q_values.max(dim=1,keepdim=False)[0]
    targets=rews_t+gamma*(1-dones_t)*max_target_q_values
    targets=targets.unsqueeze(-1)
    list_obses_t=T.tensor(list_of_vectors(obses_t)).to(device)
    q_values=online(list_obses_t)
    #q_values=online(obses_t)
    action_q_values=T.gather(input=q_values,dim=1,index=actions_t)
    loss=nn.functional.mse_loss(action_q_values,targets)
    loss_list.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    mean_reward.append(np.mean(rew_buffer))
    number_of_episodes.append(i)
    gamma_list.append(gamma)
    dec = {'number_of_episodes':number_of_episodes,'mean_reward':mean_reward,'gamma':gamma_list}
    if i % target_update_freq==0:
        target.load_state_dict(online.state_dict())
    if i % 1000 ==0:
        print('step',i,'avg rew',round(np.mean(rew_buffer),2))
        pass
    if i==5000:
        pass
I ran this script and it gave me good performance; changing the learning rate helped a lot.
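If you want to see the route the trained network has actually learned, a minimal greedy-rollout sketch like the one below (my own addition, assuming the trained online network, the graph g and the helper functions defined above) lets you compare it against networkx's shortest path:

# greedy rollout of the trained policy (sketch; assumes online, g, enc_node,
# dec_node, reset, state_dec and state_to_vector from the code above)
def greedy_path(max_steps=50):
    state = reset()
    current, goal = state_dec(state)
    path = [dec_node[current]]
    for _ in range(max_steps):
        v = T.tensor(state_to_vector(current, goal))
        action = online.act(v)
        # only follow actions that correspond to an existing edge
        if dec_node[action] not in g[dec_node[current]]:
            break
        current = action
        path.append(dec_node[current])
        if current == goal:
            break
    return path

print('DQN path:     ', greedy_path())
print('Dijkstra path:', nx.shortest_path(g, 1130166767, 1731824802, weight='weight'))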