Stable-Baselines3 log rewards
How do I add rewards from a custom environment to the TensorBoard logging in Stable Baselines3?
I have this training code:
model = PPO(
    "MlpPolicy", env,
    learning_rate=1e-4,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./tensorboard/")
As shown in their documentation, you can log arbitrary values by creating your own callback:
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback

model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1)


class TensorboardCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.
    """

    def __init__(self, verbose=0):
        super(TensorboardCallback, self).__init__(verbose)

    def _on_step(self) -> bool:
        # Log scalar value (here a random variable)
        value = np.random.random()
        self.logger.record("random_value", value)
        return True


model.learn(50000, callback=TensorboardCallback())
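A minimal sketch of how that callback could be adapted to the original question: instead of a random value, log a reward term that the custom environment reports through its info dict. The key name "my_reward" is an assumption here, as is reusing env and policy_kwargs from the question's PPO setup:

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback


class RewardLoggingCallback(BaseCallback):
    def _on_step(self) -> bool:
        # self.locals["infos"] holds the info dicts returned by the
        # (vectorized) environment on the last step.
        for info in self.locals["infos"]:
            if "my_reward" in info:  # hypothetical key set by the custom env
                self.logger.record("custom/my_reward", info["my_reward"])
        return True


model = PPO("MlpPolicy", env, learning_rate=1e-4,
            policy_kwargs=policy_kwargs, verbose=1,
            tensorboard_log="./tensorboard/")
model.learn(50000, callback=RewardLoggingCallback())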
You can access the local variables available to the callback via self.locals. Any variable exposed by your custom environment can be reached through that locals dict. The example below shows how to access keys of a custom dictionary named my_custom_info_dict in a vectorized environment.
import gym
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.vec_env import SubprocVecEnv


def make_env(env_id, rank, seed=0):
    """
    Utility function for a multiprocessed env.
    See https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/multiprocessing_rl.ipynb
    for more details on vectorized environments.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for the RNG
    :return: (Callable) function that creates the environment
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init


class SummaryWriterCallback(BaseCallback):
    """
    Snippet skeleton from the Stable-Baselines3 documentation:
    https://stable-baselines3.readthedocs.io/en/master/guide/tensorboard.html#directly-accessing-the-summary-writer
    """

    def _on_training_start(self):
        self._log_freq = 10  # log every 10 calls
        output_formats = self.logger.output_formats
        # Save a reference to the TensorBoard formatter object.
        # Note: the failure case (no formatter found) is not handled here;
        # it should be done with try/except.
        self.tb_formatter = next(
            formatter for formatter in output_formats
            if isinstance(formatter, TensorBoardOutputFormat)
        )

    def _on_step(self) -> bool:
        """
        Log my_custom_reward to tensorboard every _log_freq calls, once per environment.
        Assumes the training loop exposes a dict named my_custom_info_dict in its locals.
        """
        if self.n_calls % self._log_freq == 0:
            rewards = self.locals['my_custom_info_dict']['my_custom_reward']
            for i in range(self.locals['env'].num_envs):
                self.tb_formatter.writer.add_scalar(
                    "rewards/env #{}".format(i + 1),
                    rewards[i],
                    self.n_calls,
                )
        return True


if __name__ == "__main__":
    # SAC needs a continuous action space, so use Pendulum-v0 here
    # (CartPole-v1 has a discrete action space and would raise an error with SAC).
    env_id = "Pendulum-v0"
    envs = SubprocVecEnv([make_env(env_id, i) for i in range(4)])  # 4 environments
    model = SAC("MlpPolicy", envs, tensorboard_log="/tmp/sac/", verbose=1)
    model.learn(50000, callback=SummaryWriterCallback())
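Whichever callback you use, the custom value has to be produced by the environment in the first place. A common route, sketched below under the assumption that you control the environment code (the snippet above does not enforce this), is to return the extra term in the info dict from step(), where callbacks can read it via self.locals["infos"]:

import gym
import numpy as np
from gym import spaces


class MyCustomEnv(gym.Env):
    """Toy environment sketch that exposes an extra reward term via `info`."""

    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Box(-1.0, 1.0, shape=(3,), dtype=np.float32)
        self.action_space = spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32)
        self._steps = 0

    def reset(self):
        self._steps = 0
        return self.observation_space.sample()

    def step(self, action):
        self._steps += 1
        obs = self.observation_space.sample()
        shaping_bonus = float(-np.abs(action).sum())  # toy custom reward term
        reward = 1.0 + shaping_bonus
        done = self._steps >= 200
        # Anything put into `info` is visible to callbacks via self.locals["infos"]
        info = {"my_custom_reward": shaping_bonus}
        return obs, reward, done, info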