py_environment 'time_step' 不匹配 'time_step_spec'
py_environment 'time_step' doesn't match 'time_step_spec'
我已经通过 tf agents 创建了一个自定义的 pyenvironment。但是我无法使用 py_policy.action 验证环境或在其中采取步骤
我对 time_step_specs
中排除的内容感到困惑
我已经尝试通过 tf_py_environment.TFPyEnvironment 转换为 tf_py_environment 并成功地使用 tf_policy 采取行动,但我仍然对其中的区别感到困惑。
import abc
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import random_tf_policy
import tensorflow as tf
import tf_agents
class TicTacToe(py_environment.PyEnvironment):
def __init__(self,n):
super(TicTacToe,self).__init__()
self.n = n
self.winner = None
self._episode_ended = False
self.inital_state = np.zeros((n,n))
self._state = self.inital_state
self._observation_spec = array_spec.BoundedArraySpec(
shape = (n,n),dtype='int32',minimum = -1,maximum = 1,name =
'TicTacToe board state spec')
self._action_spec = array_spec.BoundedArraySpec(
shape = (),dtype = 'int32', minimum = 0,maximum = 8, name =
'TicTacToe action spec')
def observation_spec(self):
return self._observation_spec
def action_spec(self):
return self._action_spec
def _reset(self):
return ts.restart(self.inital_state)
def check_game_over(self):
for i in range(self.n):
if (sum(self._state[i,:])==self.n) or
(sum(self._state[:,i])==self.n):
self.winner = 1
return True
elif (sum(self._state[i,:])==-self.n) or
(sum(self._state[:,i])==-self.n):
self.winner = -1
return True
if (self._state.trace()==self.n) or
(self._state[::-1].trace()==self.n):
self.winner = 1
return True
elif (self._state.trace()==-self.n) or (self._state[::-1].trace()==-
self.n):
self.winner = -1
return True
if not (0 in self._state):
return True
def _step(self,action):
self._state[action//3,action%3]=1
self._episode_ended = self.check_game_over
if self._episode_ended==True:
if self.winner == 1:
reward = 1
elif self.winner == None:
reward = 0
else:
reward = -1
return ts.termination(self._state,dtype = 'int32',reward=reward)
else:
return ts.transition(self._state,dtype = 'int32',reward =
0.0,discount = 0.9)
env = TicTacToe(3)
utils.validate_py_environment(env, episodes=5)
这是我得到的错误:
ValueError Traceback(最后一次调用)
在
----> 1 utils.validate_py_environment(环境,剧集=5)
C:\Users\bzhang\AppData\Local\Continuum\anaconda3\lib\site-packages\tf_agents\environments\utils.py in validate_py_environment(环境,剧集)
58 提高 ValueError(
59'Given time_step
: %r does not match expected time_step_spec
: %r'%
---> 60 (time_step, time_step_spec))
61
62 行动 = random_policy.action(time_step).行动
ValueError: 给出 time_step
: TimeStep(step_type=array(0), reward=array(0., dtype=float32), discount=array(1., dtype=float32) , 观察=数组([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])) 与预期的 time_step_spec
不匹配:TimeStep(step_type=ArraySpec(shape=(), dtype=dtype('int32'), name ='step_type'), reward=ArraySpec(shape=(), dtype=dtype('float32'), name='reward'), discount=BoundedArraySpec(shape=(), dtype=dtype( 'float32'), name='discount', minimum=0.0, maximum=1.0), observation=BoundedArraySpec(shape=(3, 3), dtype=dtype('int32'), name='TicTacToe board state spec', 最小值=-1, 最大值=1))
您的观察结果与规范不符,您需要将 dtype=np.int32
传递给 np 数组以确保类型匹配。
我已经通过 tf agents 创建了一个自定义的 pyenvironment。但是我无法使用 py_policy.action 验证环境或在其中采取步骤 我对 time_step_specs
中排除的内容感到困惑我已经尝试通过 tf_py_environment.TFPyEnvironment 转换为 tf_py_environment 并成功地使用 tf_policy 采取行动,但我仍然对其中的区别感到困惑。
import abc
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import random_tf_policy
import tensorflow as tf
import tf_agents
class TicTacToe(py_environment.PyEnvironment):
def __init__(self,n):
super(TicTacToe,self).__init__()
self.n = n
self.winner = None
self._episode_ended = False
self.inital_state = np.zeros((n,n))
self._state = self.inital_state
self._observation_spec = array_spec.BoundedArraySpec(
shape = (n,n),dtype='int32',minimum = -1,maximum = 1,name =
'TicTacToe board state spec')
self._action_spec = array_spec.BoundedArraySpec(
shape = (),dtype = 'int32', minimum = 0,maximum = 8, name =
'TicTacToe action spec')
def observation_spec(self):
return self._observation_spec
def action_spec(self):
return self._action_spec
def _reset(self):
return ts.restart(self.inital_state)
def check_game_over(self):
for i in range(self.n):
if (sum(self._state[i,:])==self.n) or
(sum(self._state[:,i])==self.n):
self.winner = 1
return True
elif (sum(self._state[i,:])==-self.n) or
(sum(self._state[:,i])==-self.n):
self.winner = -1
return True
if (self._state.trace()==self.n) or
(self._state[::-1].trace()==self.n):
self.winner = 1
return True
elif (self._state.trace()==-self.n) or (self._state[::-1].trace()==-
self.n):
self.winner = -1
return True
if not (0 in self._state):
return True
def _step(self,action):
self._state[action//3,action%3]=1
self._episode_ended = self.check_game_over
if self._episode_ended==True:
if self.winner == 1:
reward = 1
elif self.winner == None:
reward = 0
else:
reward = -1
return ts.termination(self._state,dtype = 'int32',reward=reward)
else:
return ts.transition(self._state,dtype = 'int32',reward =
0.0,discount = 0.9)
env = TicTacToe(3)
utils.validate_py_environment(env, episodes=5)
这是我得到的错误:
ValueError Traceback(最后一次调用) 在 ----> 1 utils.validate_py_environment(环境,剧集=5)
C:\Users\bzhang\AppData\Local\Continuum\anaconda3\lib\site-packages\tf_agents\environments\utils.py in validate_py_environment(环境,剧集)
58 提高 ValueError(
59'Given time_step
: %r does not match expected time_step_spec
: %r'%
---> 60 (time_step, time_step_spec))
61
62 行动 = random_policy.action(time_step).行动
ValueError: 给出 time_step
: TimeStep(step_type=array(0), reward=array(0., dtype=float32), discount=array(1., dtype=float32) , 观察=数组([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])) 与预期的 time_step_spec
不匹配:TimeStep(step_type=ArraySpec(shape=(), dtype=dtype('int32'), name ='step_type'), reward=ArraySpec(shape=(), dtype=dtype('float32'), name='reward'), discount=BoundedArraySpec(shape=(), dtype=dtype( 'float32'), name='discount', minimum=0.0, maximum=1.0), observation=BoundedArraySpec(shape=(3, 3), dtype=dtype('int32'), name='TicTacToe board state spec', 最小值=-1, 最大值=1))
您的观察结果与规范不符,您需要将 dtype=np.int32
传递给 np 数组以确保类型匹配。