RuntimeError: the derivative for 'indices' is not implemented

I am writing a DQN following this online tutorial, https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/DeepQLearning/torch_deep_q_model.py , but when I run it I hit the runtime error below and I am not sure how to debug it or what to change to prevent it. Thanks!

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-196-00975d66fd2d> in <module>
     28         agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
     29         obs= obs_
---> 30         agent.learn(batch_size)
     31         lastAction = action
     32     scores.append(score)

<ipython-input-191-f6b163cc3a8a> in learn(self, batch_size)
     72         Qtarget = Qpred.clone()
     73         print(Qnext[1])
---> 74         Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])
     75         # epsilon decay action
     76         if self.steps > 2000:

RuntimeError: the derivative for 'indices' is not implemented

These are the code blocks from my Jupyter notebook.

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DeepQNetwork(nn.Module):
    def __init__(self,Alpha):
        super(DeepQNetwork,self).__init__()
        self.conv1 = nn.Conv2d(1,32,8,stride=4, padding=1)
        self.conv2 = nn.Conv2d(32,64,4,stride=2)
        self.conv3 = nn.Conv2d(64,128,3)
        self.fc1 = nn.Linear(128* 21* 12,512)
        self.fc2 = nn.Linear(512,6)

        self.optimizer = optim.RMSprop(self.parameters(), lr = Alpha)
        self.loss = nn.MSELoss()
        self.device =  torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self,obs): 
        '''Passing in a sequence of arrays'''
        obs = torch.Tensor(obs).to(self.device) # send to the GPU
        ''' Feed forward the Network Parameters'''
        obs = obs.view(-1, 1,200,125)
        #print(obs.shape)
        obs = F.relu(self.conv1(obs))
        #print(obs.shape)
        obs = F.relu(self.conv2(obs))
        #print(obs.shape)
        obs = F.relu(self.conv3(obs))
        #print(obs.shape)
        obs = obs.view(-1,128* 21* 12)
        obs = F.relu(self.fc1(obs))
        # 4 Rows and 6 columns
        actions = self.fc2(obs)
        return actions

Here is the agent code; it contains the line that raises the error.

class DQNAgent(object):
    def __init__(self, gamma, epsilon, alpha, maxMemory, 
                 epsEnd = 0.05, replace =10000, actionSpace = [0,1,2,3,4,5]):
        '''
        Gamma -> discount factor of valuing current reward over future reward
        Epsilon -> for trade off between exploration-exploitation
        alpha -> learn rate
        maxMemory -> max size of Memory buffer
        epsEnd -> smallest value of Exploration
        replace -> how often to replace the target network
        '''
        self.GAMMA = gamma
        self.EPSILON = epsilon
        self.EPS_END = epsEnd
        self.actionSpace = actionSpace
        self.maxMemory = maxMemory
        self.steps = 0
        self.learn_step_counter = 0
        self.memory = []
        self.memCount = 0
        self.replace_tgt_count = replace
        self.Q_eval = DeepQNetwork(alpha)
        self.Q_next = DeepQNetwork(alpha)

    def storeTransition(self, state, action, reward, state_):
        '''Stores Transition states'''
        if self.memCount < self.maxMemory:
            self.memory.append([state,action,reward,state_])
        else:
            self.memory[self.memCount%self.maxMemory] = [state,action,reward,state_]
        self.memCount +=1

    def chooseAction(self,obs):
        '''
        Exploration if np.random > epsilon
        else take epsilon greedy action
        '''
        rand = np.random.random()
        # Get the value for all actions for the current set of states
        # Forward pass the stack of frames to get the value of each action given the subset of states in obs
        actions = self.Q_eval.forward(obs)
        if rand<1-self.EPSILON:
            action = torch.argmax(actions[1]).item()
        else:
            action = np.random.choice(self.actionSpace)
        self.steps += 1
        return action

    def learn(self, batch_size):
        self.Q_eval.optimizer.zero_grad()
        #0 gradient to do batch optimisation
        if self.replace_tgt_count is not None and self.learn_step_counter % self.replace_tgt_count==0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())

        # memory subsampling
        if self.memCount + batch_size < self.maxMemory:
            memStart = int(np.random.choice(range(self.memCount)))
        else:
            memStart = int(np.random.choice(range(self.maxMemory-batch_size-1)))

        miniBatch = self.memory[memStart:memStart+batch_size]
        memory = np.array(miniBatch)

        # feed forward the current state and successor state; convert to a list because memory is an array of numpy objects
        Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
        Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device)

        maxA = torch.argmax(Qnext,dim = 1).to(self.Q_eval.device)
        #calculate rewards
        rewards = torch.Tensor(list(memory[:,2])).to(self.Q_eval.device)
        # copy Qpred so the loss is 0 for every action except the max action
        Qtarget = Qpred.clone()
        print(Qnext.shape)
        Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])# PROBLEMATIC LINE
        # epsilon decay action
        if self.steps > 2000:
            if self.EPSILON-1e-4 >self.EPS_END:
                self.EPSILON-= 1e-4
            else:
                self.EPSILON = self.EPS_END
        loss = self.Q_eval.loss(Qtarget,Qpred).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()
        self.learn_step_counter +=1

env = gym.make("Invader-v0")
agent = DQNAgent(gamma=0.95,epsilon = 1.0,alpha = 0.003, maxMemory = 5000,replace = None)
while agent.memCount < agent.maxMemory:
        obs = env.reset()
        done = False
        lives = 3
        while not done:
            action = env.action_space.sample()
            obs_ , reward, done, info = env.step(action)
            if done and info['lives']<lives:
                lives = info['lives']
                reward -= 200
            agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
            obs= obs_
initialised = True

scores = []
epsHistory = []
numGames = 50
batch_size = 16

for i in range(numGames):

    print(f'starting game {i+1}, epsilon = {agent.EPSILON}')
    epsHistory.append(agent.EPSILON)
    done = False
    obs = env.reset()

    frames = [np.sum(obs)]
    score = 0
    lastAction = 0
    lives = 3
    while not done:
        if len(frames) == 4:
            action = agent.chooseAction(frames)
            frames = []
        else:
            action = lastAction
        obs_, reward, done, info = env.step(action)
        score += score-reward
        frames.append(preprocess(obs_))
        if done and info['lives'] < lives:
            reward -=200
        agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
        obs= obs_
        agent.learn(batch_size)
        lastAction = action
    scores.append(score)
    print('score: ', score)
    x = [i+1 for i in range(numGames)]

You must use .detach() on the output of the target network:

Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)

Qnext is only used to build the learning target, so it should not stay in the autograd graph. The error message refers to indices produced along that path (most likely from the max/argmax applied to Qnext): autograd has no derivative implemented for them, so the target assignment cannot be recorded while Qnext still requires grad. Once Qnext is detached, the whole target computation is treated as constant data and the error goes away. This also matches standard DQN practice: the target network only supplies bootstrap values, and gradients should flow through Qpred (the Q_eval network) alone.
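
As a minimal sketch, here is how the relevant part of your learn() method would look with the detach in place (same variable names as in your code; everything else stays as posted):

Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
# Detach the target network's output so autograd treats it as a constant
Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)

maxA = torch.argmax(Qnext, dim=1).to(self.Q_eval.device)
rewards = torch.Tensor(list(memory[:,2])).to(self.Q_eval.device)

# Qtarget is built from the now-constant Qnext values, so when
# loss.backward() runs, only Q_eval's parameters receive gradients
Qtarget = Qpred.clone()
Qtarget[:, maxA] = rewards + self.GAMMA * torch.max(Qnext[1])

loss = self.Q_eval.loss(Qtarget, Qpred).to(self.Q_eval.device)
loss.backward()
self.Q_eval.optimizer.step()

(As an aside, unrelated to the RuntimeError: torch.max(Qnext[1]) takes the maximum of a single row of Qnext; a per-sample maximum such as torch.max(Qnext, dim=1)[0] is the more usual DQN target, but the detach alone is what makes the error disappear.)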