Why does my model not learn? Very high loss

I built a simulation model in which trucks collect waste containers based on how full they are. I created my deep reinforcement learning model with OpenAI Gym and TensorFlow/Keras... but my training shows a very high loss... Where am I going wrong? Thanks in advance.

Here is the environment:

import gym
from gym import Env
from gym.spaces import Box
import numpy as np
import pygame
import cv2

# Container, Camion and Map are helper classes defined elsewhere in the project
class Marltf(Env):
    def __init__(self):
        self.i = 0
        self.containers1 = Container(3, 3)
        self.containers2 = Container(1, 3)
        self.containers3 = Container(3, 1)
        self.containers4 = Container(5, 6)
        self.containers5 = Container(8, 6)
        self.containers6 = Container(10, 10)
        self.containers7 = Container(11, 11)
        self.containers8 = Container(7, 12)
        self.passo = 0

        # starting fill levels for some of the containers
        self.containers2.lv = 2
        self.containers3.lv = 4
        self.containers5.lv = 4
        self.containers6.lv = 1
        self.containers8.lv = 2
        self.shower_length = 300

        self.containers = [self.containers1, self.containers2, self.containers3, self.containers4,
                           self.containers5, self.containers6, self.containers7, self.containers8]
        self.positions = {}
        self.capacities = {}
        self.camions = []
        # roughly one truck per 10 units of total container fill
        b = sum(cont.lv for cont in self.containers)
        nCamion = 0
        while b > 6:
            b -= 10
            nCamion += 1

        for ic in range(nCamion):
            self.camions.append(Camion(1, 1, None, ic))


        for cam in self.camions:
            self.positions[cam.name] = cam.position
            self.capacities[cam.name] = 10
        
        
        self.frames = []
        self.cnt = 0

        self.mapp = Map(15, 15, self.camions, self.containers)

        self.state = (15*15)/5
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = Box(low=np.array([0]), high=np.array([51]))

    def step(self, action):
        moves = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
        done = False

        # sync every truck with its stored position and capacity (keyed by name)
        for cam in self.camions:
            cam.position = (self.positions[cam.name][0], self.positions[cam.name][1])
            cam.capacity = self.capacities[cam.name]
            self.state += -5
            

        mossa = moves[action]
        x = self.camions[self.i].position
        reward = 0
        nuovaposizione = [mossa[0] + x[0], mossa[1] + x[1]]
        self.shower_length -= 1

        if self.mapp.mapp[nuovaposizione[0], nuovaposizione[1]] == -1:
            # wall: the truck stays put and is penalised
            reward += -5
            self.state += -5
        else:
            # move the truck on the grid
            self.mapp.mapp[x[0], x[1]] = 0
            self.camions[self.i].position = nuovaposizione
            self.mapp.mapp[nuovaposizione[0], nuovaposizione[1]] = 9
            self.positions.update({self.camions[self.i].name: nuovaposizione})
            reward += -1
            self.state += -2

        # if the current truck stands on a container, empty it (capacity permitting)
        truck = self.camions[self.i]
        for contain in self.containers:
            if truck.position[0] == contain.position[0] and truck.position[1] == contain.position[1]:
                if contain.lv == 3 and truck.capacity >= 3:
                    truck.reward += 100
                    truck.capacity += -3
                    self.capacities.update({truck.name: truck.capacity})
                    reward += 20
                    self.state += 20
                    contain.lv = 0
                elif contain.lv == 2 and truck.capacity >= 2:
                    truck.reward += 50
                    truck.capacity += -2
                    self.capacities.update({truck.name: truck.capacity})
                    self.state += 10
                    reward += 50
                    contain.lv = 0
                elif contain.lv == 1 and truck.capacity >= 1:
                    reward += 10
                    truck.reward += 5
                    truck.capacity += -1
                    self.capacities.update({truck.name: truck.capacity})
                    contain.lv = 0
                    self.state += 1
                elif contain.lv == 4 and truck.capacity >= 4:
                    reward += 50
                    truck.reward += 50
                    truck.capacity += -4
                    self.capacities.update({truck.name: truck.capacity})
                    self.state += 50
                    contain.lv = 0
                elif contain.lv == 0 and truck.capacity >= 4:
                    # visiting an already empty container is penalised
                    reward += -20
                    truck.reward += -20
                    self.state += -20

            if truck.capacity <= 2:
                # truck is nearly full: send it back to the depot and empty it
                truck.position = (1, 1)
                self.positions.update({truck.name: (1, 1)})
                truck.capacity = 10
                self.capacities.update({truck.name: truck.capacity})

        # alternate control between truck 0 and truck 1
        if self.i == 1:
            self.i = 0
        elif self.i == 0:
            self.i = 1

        done = self.shower_length <= 0
        

        self.passo += 1

        info = {}
        return self.state, reward, done, info



    def render(self, mode="human"):
           
            BLACK = (0, 0, 0)
            WHITE = (200, 200, 200)
            
            WINDOW_HEIGHT = len(self.mapp.mapp) * 50
            WINDOW_WIDTH = len(self.mapp.mapp[0]) * 50
           
            # sprites: containers coloured by fill level, trucks by remaining capacity
            whiteC = pygame.transform.scale(pygame.image.load('white.jpg'), (50, 50))
            greenC = pygame.transform.scale(pygame.image.load('green.jpg'), (50, 50))
            yellowC = pygame.transform.scale(pygame.image.load('yellow.jpg'), (50, 50))
            orangeC = pygame.transform.scale(pygame.image.load('orange.jpg'), (50, 50))
            redC = pygame.transform.scale(pygame.image.load('red.jpg'), (50, 50))

            gT = pygame.transform.scale(pygame.image.load('greenCamion.jpg'), (50, 50))
            yT = pygame.transform.scale(pygame.image.load('yellowCamion.jpg'), (50, 50))
            rT = pygame.transform.scale(pygame.image.load('redCamion.jpg'), (50, 50))
            
           
            
            
            global SCREEN, CLOCK
            pygame.init()
            SCREEN = pygame.display.set_mode((WINDOW_WIDTH, WINDOW_HEIGHT))
            CLOCK = pygame.time.Clock()
            SCREEN.fill(BLACK)
            
            pygame.draw.rect(SCREEN, WHITE, pygame.Rect(10, 0, 50, 50))
            blockSize = 50  # size of one grid cell in pixels

            # draw the wall cells (-1) in white
            for i in range(len(self.mapp.mapp)):
                for j in range(len(self.mapp.mapp[0])):
                    if self.mapp.mapp[i][j] == -1:
                        pygame.draw.rect(SCREEN, WHITE, pygame.Rect(i * 50, j * 50, 50, 50))

            # trucks coloured by remaining capacity
            for c in self.camions:
                if c.capacity > 6:
                    SCREEN.blit(gT, (c.position[0]*50, c.position[1]*50))
                elif c.capacity > 3:
                    SCREEN.blit(yT, (c.position[0]*50, c.position[1]*50))
                else:
                    SCREEN.blit(rT, (c.position[0]*50, c.position[1]*50))
            
            
            # containers coloured by fill level
            for contain in self.containers:
                if contain.lv == 0:
                    SCREEN.blit(whiteC, (contain.position[0]*50, contain.position[1]*50))
                elif contain.lv == 1:
                    SCREEN.blit(greenC, (contain.position[0]*50, contain.position[1]*50))
                elif contain.lv == 2:
                    SCREEN.blit(yellowC, (contain.position[0]*50, contain.position[1]*50))
                elif contain.lv == 3:
                    SCREEN.blit(orangeC, (contain.position[0]*50, contain.position[1]*50))
                elif contain.lv == 4:
                    SCREEN.blit(redC, (contain.position[0]*50, contain.position[1]*50))
                
            
            for x in range(0, WINDOW_WIDTH, blockSize):
                for y in range(0, WINDOW_HEIGHT, blockSize):
                    rect = pygame.Rect(x, y, blockSize, blockSize)
                    pygame.draw.rect(SCREEN, WHITE, rect, 1)

            pygame.display.flip()

            view = pygame.surfarray.array3d(SCREEN)
            view = view.transpose([1, 0, 2])

            # BGR copy of the frame, e.g. for assembling a video with OpenCV
            img_bgr = cv2.cvtColor(view, cv2.COLOR_RGB2BGR)

            # save each rendered frame to disk
            pygame.image.save(SCREEN, f"screenshot{self.cnt}.png")
            self.cnt += 1
            pygame.event.get()
                    


       
    def reset(self):
        self.state = (15*15)/4
        self.shower_length = 60

        # refill the containers to their starting levels
        self.containers1.lv = 3
        self.containers2.lv = 1
        self.containers7.lv = 2
        self.containers3.lv = 4
        self.containers5.lv = 4
        self.containers6.lv = 1
        self.containers8.lv = 2
        self.passo = 0
        self.positions = {}
        self.capacities = {}
        self.camions = []

        # recompute the truck fleet, as in __init__
        b = sum(cont.lv for cont in self.containers)
        nCamion = 0
        while b > 6:
            b -= 10
            nCamion += 1

        for ic in range(nCamion):
            self.camions.append(Camion(1, 1, None, ic))

        for cam in self.camions:
            self.positions[cam.name] = cam.position
            self.capacities[cam.name] = 10

        self.cnt = 0
        self.i = 0

        # gym's API (and keras-rl's fit) expects reset to return the initial observation
        return self.state


env = Marltf()
states = env.observation_space.shape
actions = env.action_space.n
b = env.action_space.sample()

My model:

import tensorflow as tf
from tensorflow import keras

def build_model(states, actions):
    model = tf.keras.Sequential([
        keras.layers.Dense(64, input_shape=states),
        keras.layers.LeakyReLU(0.24),
        keras.layers.Dense(64),
        keras.layers.LeakyReLU(0.24),
        keras.layers.Dense(32),
        keras.layers.LeakyReLU(0.24),
        keras.layers.Dense(16),
        keras.layers.LeakyReLU(0.24),
        keras.layers.Dense(8),
        keras.layers.LeakyReLU(0.24),
        # linear output head: one Q-value per action
        keras.layers.Dense(actions, activation='linear'),
    ])
    return model


model = build_model(states, actions)
model.compile(loss='mse', metrics=['accuracy'])  # dqn.compile below compiles the model again

from rl.agents.dqn import DQNAgent
from rl.policy import GreedyQPolicy
from rl.memory import SequentialMemory

def build_agent(model, actions):
    policy = GreedyQPolicy()
    memory = SequentialMemory(limit=10000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions,
                   nb_steps_warmup=10, target_model_update=1e-2)
    return dqn


dqn = build_agent(model, actions)
dqn.compile(tf.keras.optimizers.Adadelta(learning_rate=0.1, rho=0.95, epsilon=1e-07, name='Adadelta'),
            metrics=['accuracy'])

a = dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

The loss starts at 50 and climbs to 200.

The loss doesn't matter much in RL; a very high loss is actually normal. In RL, what we care about most is the reward.
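A quick way to see why: a DQN is regressing toward a bootstrapped target that itself moves during training, so the loss is not the fixed yardstick it is in supervised learning:

    y = r + γ · max_a' Q_target(s', a')
    loss = (Q(s, a) − y)²

Every time the target network is updated (your target_model_update=1e-2 applies a soft update at each training step), y shifts, so the loss can stay high or even grow while the policy is improving.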

In reinforcement learning you usually don't care about the loss but about the return. Judging by the class name, this also looks like a multi-agent reinforcement learning problem, which is usually harder to handle than the single-agent case.

The first thing I would try changing is the number of steps: 5000 is very low. Try defining an episode, if you haven't already, then plot the cumulative reward at the end of each episode and check whether it increases as training progresses.

This is the cleanest way to check whether the reward is actually increasing and the agent is learning.
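For example, a minimal sketch, assuming the keras-rl setup from your question (fit() returns a Keras History whose history dict collects episode_reward for each finished episode):

import matplotlib.pyplot as plt

# train for much longer than 5000 steps and keep the training history
history = dqn.fit(env, nb_steps=100000, visualize=False, verbose=2)

# one entry per finished episode: the cumulative (undiscounted) reward
episode_rewards = history.history['episode_reward']

plt.plot(episode_rewards)
plt.xlabel('episode')
plt.ylabel('cumulative reward')
plt.title('Episode return during training')
plt.show()

If that curve trends upward over episodes, the agent is learning, whatever the TD loss is doing.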