Stable-Baselines3 log rewards

How do I add rewards to the TensorBoard logging in Stable Baselines3 when using a custom environment?

Here is my training code:

from stable_baselines3 import PPO

model = PPO(
    "MlpPolicy", env,
    learning_rate=1e-4,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./tensorboard/")

As shown in their documentation, you can log arbitrary values by creating your own callback:

import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback

model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1)


class TensorboardCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.
    """

    def __init__(self, verbose=0):
        super(TensorboardCallback, self).__init__(verbose)

    def _on_step(self) -> bool:
        # Log scalar value (here a random variable)
        value = np.random.random()
        self.logger.record('random_value', value)
        return True


model.learn(50000, callback=TensorboardCallback())

Inside the callback you can use self.locals to access the local variables of the training loop that are available to the logger. Any variable exposed by your custom environment can be accessed through that locals dict.
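For example (a minimal sketch, not from the original answer, assuming the Gymnasium-style reset()/step() API used by recent Stable-Baselines3 releases; MyCustomEnv, InfoLoggingCallback and the my_custom_reward key are placeholder names), an environment can expose a value through the info dict it returns from step(), and a callback can then read it back from self.locals['infos']:

import gymnasium as gym
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback


class MyCustomEnv(gym.Env):
    """Hypothetical environment that exposes an extra value via the info dict."""

    def __init__(self):
        self.observation_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        return self.observation_space.sample(), {}

    def step(self, action):
        obs = self.observation_space.sample()
        reward = float(-np.abs(action).sum())
        info = {"my_custom_reward": reward}  # exposed to callbacks via self.locals['infos']
        return obs, reward, False, False, info


class InfoLoggingCallback(BaseCallback):
    def _on_step(self) -> bool:
        # self.locals['infos'] holds one info dict per (sub-)environment
        for i, info in enumerate(self.locals["infos"]):
            if "my_custom_reward" in info:
                self.logger.record(f"rewards/env_{i}", info["my_custom_reward"])
        return True


if __name__ == "__main__":
    model = SAC("MlpPolicy", MyCustomEnv(), tensorboard_log="/tmp/sac/", verbose=1)
    model.learn(5_000, callback=InfoLoggingCallback())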

The example below shows how to access a key from a custom dictionary named my_custom_info_dict in a vectorized environment.

import numpy as np
import gymnasium as gym  # use `import gym` with older Stable-Baselines3 / Gym versions

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(env_id, rank, seed=0):
    """
    See https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/multiprocessing_rl.ipynb
    for more details on vectorized environments.

    Utility function for a multiprocessed env.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for the RNG
    :return: (Callable) a function that builds one environment instance
    """
    def _init():
        env = gym.make(env_id)
        env.reset(seed=seed + rank)
        return env
    return _init

class SummaryWriterCallback(BaseCallback):
    '''
    Snippet skeleton from Stable baselines3 documentation here:
    https://stable-baselines3.readthedocs.io/en/master/guide/tensorboard.html#directly-accessing-the-summary-writer
    '''

    def _on_training_start(self):
        self._log_freq = 10  # log every 10 calls

        output_formats = self.logger.output_formats
        # Save reference to tensorboard formatter object
        # note: the failure case (no formatter found) is not handled here; it should be handled with try/except.
        self.tb_formatter = next(formatter for formatter in output_formats if isinstance(formatter, TensorBoardOutputFormat))

    def _on_step(self) -> bool:
        '''
        Log my_custom_reward every self._log_freq calls, once per environment.
        '''
        if self.n_calls % self._log_freq == 0:
            rewards = self.locals['my_custom_info_dict']['my_custom_reward']
            for i in range(self.locals['env'].num_envs):
                self.tb_formatter.writer.add_scalar("rewards/env #{}".format(i+1),
                                                    rewards[i],
                                                    self.n_calls)
        return True

if __name__ == "__main__":
    env_id = "Pendulum-v1"  # SAC needs a continuous action space, so a discrete env such as CartPole-v1 will not work here
    envs = SubprocVecEnv([make_env(env_id, i) for i in range(4)])  # 4 environments
    model = SAC("MlpPolicy", envs, tensorboard_log="/tmp/sac/", verbose=1)
    model.learn(50000, callback=SummaryWriterCallback())
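
To view the curves, point TensorBoard at the log directory (e.g. tensorboard --logdir /tmp/sac/); with the snippet above, each environment then appears as its own scalar tag under rewards/.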