python – PY_environment ‘time_step’ doesn’t match ‘time_step_spec’ – I can’t spot the difference

I’m trying to create a custom tf-agents environment for trading. When I try to validate it by calling utils.validate_py_environment(environment, episodes=1), I get a ValueError: 'time_step' does not match 'time_step_spec'. I’ve been trying to spot the difference for a while now, but I can’t seem to find it. Am I missing something?
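
For reference, this is roughly how the environment is built and validated (a sketch; df is my daily price DataFrame with a 'Close' column, and the arguments match the call in the traceback below):

from tf_agents.environments import utils

# df is the price DataFrame; head(10) and lkb=5 match the call in the traceback
env = TradingEnv(df=df.head(10), lkb=5, normalize=False, render=True)

utils.validate_py_environment(env, episodes=1)   # raises the ValueError shown below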

My Environment

import numpy as np

from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class TradingEnv(py_environment.PyEnvironment):

    def __init__(self, df, lkb, normalize=False, render=False, stp=1):

        # spaces
        # Actions: we have 3 actions.
        #   Action 0: skip
        #   Action 1: open a position
        #   Action 2: close the open position
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=2, name="_action")

        # representation of the environment: price + open position state
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.float32, minimum=0, name="_observation")

        self.lkb = lkb       # days of history available from the beginning
        self.df = df
        self.prices = self._process_data()
        self.normalize = normalize
        self.stp = stp
        self._episode_ended = False

        self._maxDayIndex = len(self.prices) - 1   # maximum day index of the trading period
        self._currentDayIndex = lkb                # current day index -> starts at lkb
        self._state = None
        self._initialize_state()
        self.positions = []
        self.temp_position = []
        self.reward = 0
        self.current_price = None
        self.position = 0    # 0 if no open position, 1 if a position is open


    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec



    def _reset(self):
        print('reset')
        self.prices = self._process_data()
        self._currentDayIndex = self.lkb
        self._maxDayIndex = len(self.prices) - 1
        self._state = None
        self._episode_ended = False
        self.positions = []
        self.position = 0
        self._initialize_state()

        return ts.restart(np.array(self._state, dtype=np.float32))


    def _step(self, action):
        print(self._currentDayIndex)

        # if the episode ended on the previous step
        if self._episode_ended:
            print('Episode ended')
            # The last action ended the episode. Ignore the current action
            # and start a new episode.
            return self._reset()

        # go to the next day
        self._update_state()

        # select the action
        if action == 0:  # skip
            if self.position == 1:  # position is open
                reward = 0
            else:
                reward = -10  # no open position: negative reward to push the agent to act
            print('skip')

        elif action == 1:  # open position
            if self.position == 0:  # only if there is no open position
                print('opened position')
                self.temp_position.append(self._currentDayIndex)
                self.position = 1
                reward = 0
            else:  # there is an open position already
                reward = -10  # punish the illegal action: cannot open a position while one is open

        elif action == 2:  # close position
            # reward, actionPnl, action = self.__open_position(-1, action, _idx)
            # a position can only be closed if one is open
            if self.position == 1:
                print('close position')
                self.temp_position.append(self._currentDayIndex)
                self.positions.append(self.temp_position)
                reward = self._calculate_reward()
                self.temp_position = []
                self.position = 0
            else:
                reward = -10  # punish the illegal action: cannot close a position if none is open

        if self._currentDayIndex == self._maxDayIndex:  # last observation day
            self._episode_ended = True
            print('Episode ended')

        if self._episode_ended:
            return ts.termination(np.array(self._state, dtype=np.float32), reward)
        else:
            return ts.transition(np.array(self._state, dtype=np.float32), reward=0, discount=1.0)

    def _calculate_reward(self):
        # get the price series covered by the trade
        trade_series = self.prices[self.temp_position[0]:self.temp_position[1]]
        max_ = max(trade_series)
        min_ = min(trade_series)
        min_diff = abs((trade_series[0] - min_) / trade_series[0] * 100)
        max_diff = abs((trade_series[0] - max_) / trade_series[0] * 100)
        against = max(min_diff, max_diff) / min(min_diff, max_diff)
        against = round(against, 1)
        nr_against = against / self.stp
        reward = 100 - (100 * nr_against)

        return reward

    def _initialize_state(self):
        self._state = self.prices[(self._currentDayIndex - self.lkb):self._currentDayIndex + 1]
        self.current_price = self.prices[self._currentDayIndex]
        print(self._state)

        # normalize series
        if self.normalize:
            self._state = (self._state - min(self._state)) / (max(self._state) - min(self._state))

    def _update_state(self):
        self._currentDayIndex += 1
        self._state = self.prices[(self._currentDayIndex - self.lkb):self._currentDayIndex + 1]
        self.current_price = self.prices[self._currentDayIndex]

        # normalize series
        if self.normalize:
            self._state = (self._state - min(self._state)) / (max(self._state) - min(self._state))

    def render(self, seed=None):
        pass

    def _process_data(self):
        prices = self.df.loc[:, 'Close'].to_numpy()
        return prices

Observation and Action Specs

self._action_spec = array_spec.BoundedArraySpec(
    shape=(), dtype=np.int32, minimum=0, maximum=2, name="_action")

# representation of the environment: price + open position state
self._observation_spec = array_spec.BoundedArraySpec(
    shape=(), dtype=np.float32, minimum=0, name="_observation")
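
To compare the two sides directly, I can run a quick check like this (just a sketch; env is the instance from the traceback) that prints the spec's shape next to the shape of an actual observation:

obs_spec = env.observation_spec()
first_obs = env.reset().observation

print('spec shape:', obs_spec.shape, 'dtype:', obs_spec.dtype)
print('obs shape: ', first_obs.shape, 'dtype:', first_obs.dtype)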

The Error Message

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-299-ac96dfe7d041> in <module>()
      1 env = TradingEnv(df=df.head(10), lkb=5, normalize=False, render=True)
----> 2 utils.validate_py_environment(env, episodes=1)

/usr/local/lib/python3.7/dist-packages/tf_agents/environments/utils.py in validate_py_environment(environment, episodes, observation_and_action_constraint_splitter)
     78       raise ValueError(
     79           'Given `time_step`: %r does not match expected '
---> 80           '`time_step_spec`: %r' % (time_step, batched_time_step_spec))
     81 
     82     action = random_policy.action(time_step).action

ValueError:

Given `time_step`: TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([584.95, 582.3 , 581.7 , 582.6 , 582.9 , 584.65], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

does not match expected

`time_step_spec`: TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name="discount", minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(), dtype=dtype('float32'), name="_observation", minimum=0.0, maximum=3.4028234663852886e+38),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name="reward"),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name="step_type")})
