Keras fit takes so much time

I have recently been learning deep reinforcement learning, and I wanted to apply what I learned to solving a Gym problem using Keras.

During training I noticed it was very slow, and after checking where the time goes I saw that the "fit" function takes most of it.

Each episode takes 3-4 minutes to run.

Is there anything wrong with what I am doing, or can you suggest any improvements?
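For reference, this is roughly how I measured where the time goes (the same idea as the commented-out timing code inside replay() below); the timed_fit helper is only for illustration and is not part of the agent:

import datetime

def timed_fit(model, x, y):
    # time a single fit call to see where the training loop spends its time
    start = datetime.datetime.now()
    model.fit(x, y, epochs=1, verbose=0)
    print("--fit--", datetime.datetime.now() - start)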

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import gym
import datetime

class DQN():
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=2000)

        self.gamma = 0.98
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.998
        self.learning_rate = 0.001

        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model = keras.Sequential()
        state_shape = self.env.observation_space.shape
        model.add(keras.layers.Dense(48, activation="relu", input_dim=state_shape[0]))
        model.add(keras.layers.Dense(24, activation="relu"))
        model.add(keras.layers.Dense(self.env.action_space.n, activation="relu"))
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])
    
    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        
        samples = random.sample(self.memory, batch_size)
        # states, actions, rewards, states_, dones = samples
        # targets = self.target_model.predict(states)
        # _states = [i for i in range(len(samples))]
        # targets = [[0 for j in range(self.env.action_space.n)] for i in range(len(samples))]
        _states = np.zeros((len(samples), 8))
        targets = np.zeros((len(samples), self.env.action_space.n))

        for i, sample in enumerate(samples):
            state, action, reward, new_state, done = sample
            _states[i] = state
            # target = self.target_model.predict(state)
            if done:
                targets[i][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                targets[i][action] = reward + Q_future*self.gamma

        self.model.fit(_states, targets, epochs=1, verbose=0)

         
             
        # for sample in samples:
        #     state, action, reward, new_state, done = sample
        #     target = self.target_model.predict(state)
        #     if done:
        #         target[0][action] = reward
        #     else:
        #         Q_future = max(self.target_model.predict(new_state)[0])
        #         target[0][action] = reward + Q_future*self.gamma

        #         start_time = datetime.datetime.now()
        #         self.model.fit(state, target, epochs=1, verbose=0)
        #         end_time = datetime.datetime.now()
        #         print("--fit--")
        #         print(end_time-start_time)

            

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i]
        self.target_model.set_weights(target_weights)
    
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def save_model(self, fn):
        self.model.save(fn)

    def act_eval(self, state):
        return np.argmax(self.model.predict(state)[0])

    def evaluation(self, n_eval=10):
        total_reward = 0
        for _ in range(n_eval):
            cur_state = self.env.reset().reshape(1,8)
            done = False
            while not done:
                action = self.act_eval(cur_state)
                new_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                cur_state = new_state.reshape(1,8)
        
        return total_reward / n_eval



def main():
    save_path = "policies/"
    env = gym.make("LunarLander-v2")
    
    trials = 2000
    trial_len = 500

    update_target_network = 500
    agent = DQN(env=env)
    for trial in range(trials):
        cur_state = env.reset().reshape(1,8)
        time_step_cntr = 0


        # check execution durations
        dur_replay = 0
        dur_step = 0
        dur_act = 0


        for step in range(trial_len):
            print("Trial {0}, step {1}".format(trial, step))
            action = agent.act(cur_state)



            new_state, reward, done, _ = env.step(action)

            new_state = new_state.reshape(1,8)
            agent.remember(cur_state, action, reward, new_state, done)

            # learn from experience
            agent.replay() # 

            # after "update_target_network" steps, update target network
            if time_step_cntr % update_target_network == 0:
                agent.target_train()
            time_step_cntr += 1

            cur_state = new_state
            if done:
                break
        
        # print("Duration replay {0}, duration act {1}, duration step {2}".format(dur_replay, dur_act, dur_step))
        
        # evaluate over 10 episodes after every trial
        print("Evaluation over 10 episodes", agent.evaluation())

        
        print("Trial #{0} completed.".format(trial))
        # # print the progress
        # if trial % 100 == 0:
        #     print("Trial #{0} completed.".format(trial))

        # save the model
        # if trial % 20 == 0:
        agent.save_model(save_path + str(trial) + "__.model")

    agent.save_model(save_path + "_final" + "__.model")

if __name__ == "__main__":
    main()

Your problem is not in the fit call itself, but in the loop inside your replay() method, which calls predict() on a single sample at a time. In cases like this, always try to replace Python loops with numpy operations and one batched predict() call, which is much faster.

Replace your replay method with the one below and let me know whether it is faster for you:

def replay(self):
    batch_size = 32
    if len(self.memory) >= batch_size:
        # Draw a sample
        samples = random.sample(self.memory, batch_size)

        # Prepare the batch as arrays instead of iterating over single samples
        state, action, reward, new_state, done = zip(*samples)
        state = np.concatenate(state)
        next_state = np.concatenate(new_state)
        action = np.array(action)
        reward = np.array(reward)
        done = np.array(done)

        # One predict() call on the whole batch instead of 32 separate calls
        q_future = self.target_model.predict(next_state)

        # Same targets as your loop built them, but without Python-level iteration
        targets = np.zeros((batch_size, self.env.action_space.n))
        targets[np.arange(batch_size), action] = reward + self.gamma*np.max(q_future, axis=1)*(1 - done)

        # Fit the model on the whole batch at once
        self.model.fit(state, targets, epochs=1, verbose=0)
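A note on this version: it keeps your original behaviour of leaving the targets for the non-chosen actions at zero, and the single batched predict() call is where the speed-up comes from. If the per-step predict() in act() is still a bottleneck, you could also try calling the model directly instead of going through predict(); this is only a rough sketch and assumes TF 2.x eager execution (hence the training=False flag and the .numpy() conversion):

def act_eval(self, state):
    # calling the model directly skips some of predict()'s per-call setup cost (TF 2.x)
    q_values = self.model(state, training=False).numpy()
    return np.argmax(q_values[0])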