PyDriver.run ValueError: Only supports batched time steps with a single batch dimension


I am trying to train a TF-Agents agent in my custom environment, but I run into a problem when calling PyDriver.run.

The code is below.

Environment spec code:

self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float64, minimum=0, maximum=1, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
        shape=(120,), dtype=np.float64, minimum=0, maximum=1, name='observation')
print(time_step)

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 120), dtype=float64, numpy=
array([[0.17898333, 0.73180428, 0.90454012, 0.68073447, 0.69538306,
        0.        , 0.        , 0.        , 0.59775008, 0.69606781,
        0.41753748, 0.45289883, 0.99998333, 0.56987768, 0.97186544,
        0.49617737, 0.73455657, 0.64861079, 0.80041813, 0.71556155,
        0.64580592, 0.88711717, 0.43394742, 0.59295095, 0.99998333,
        0.42415902, 0.84189602, 0.3824159 , 0.56972477, 0.41574846,
        0.25581329, 0.50284396, 0.5894032 , 0.84072555, 0.34313801,
        0.70594989, 0.99998333, 0.39449541, 0.69908257, 0.33302752,
        0.42186544, 0.46702841, 0.57208744, 0.49632389, 0.5086307 ,
        0.93144464, 0.37292618, 0.7363057 , 0.99998333, 0.43746177,
        0.91727829, 0.25152905, 0.3793578 , 0.51416658, 0.52656553,
        0.5631547 , 0.5403153 , 1.        , 0.24251103, 0.27722734,
        0.99998333, 0.31238532, 1.        , 0.29816514, 0.43761468,
        0.75773855, 1.        , 0.753052  , 0.43961091, 0.6729494 ,
        0.        , 0.23295147, 0.99998333, 0.46100917, 0.46100917,
        0.31238532, 0.31238532, 0.2425697 , 0.24285381, 0.37733244,
        0.55076502, 0.60620017, 0.32501226, 0.41949841, 0.99998333,
        0.16819572, 0.54174312, 0.16819572, 0.46100917, 0.64363478,
        0.66984831, 0.70089137, 0.4827128 , 0.94337797, 0.09306706,
        0.70081534, 0.99998333, 0.38272171, 0.41284404, 0.        ,
        0.16819572, 0.82348638, 0.64830812, 0.80847267, 0.31156952,
        0.50762443, 0.14433522, 0.49824548, 0.99998333, 0.55642202,
        0.55642202, 0.17599388, 0.38287462, 1.        , 0.74092326,
        1.        , 0.45567252, 0.65255309, 0.30949248, 0.33390097]])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})
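
(The full environment class is not included here; it is a py_environment.PyEnvironment subclass built around the two specs above. A simplified skeleton, with placeholder _reset/_step bodies rather than the real logic, looks like this.)

import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class myEnv(py_environment.PyEnvironment):

  def __init__(self):
    super().__init__()
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float64, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(120,), dtype=np.float64, minimum=0, maximum=1, name='observation')

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    # Placeholder: the real environment builds a 120-feature observation.
    return ts.restart(np.zeros((120,), dtype=np.float64))

  def _step(self, action):
    # Placeholder: the real environment computes the reward and termination.
    return ts.transition(np.zeros((120,), dtype=np.float64), reward=0.0)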

Code executed:

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

env = myEnv()
train_env = tf_py_environment.TFPyEnvironment(myEnv())
eval_env = tf_py_environment.TFPyEnvironment(myEnv())
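# Note: env is the raw (unbatched) Python environment, while train_env and
# eval_env wrap fresh copies in TFPyEnvironment, which presents batched
# tensors with a batch dimension of 1.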

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.

action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
# print(f'num_actions: {num_actions}')

def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.

dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])


optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

# Reset the environment.
time_step = train_env.reset()

# Create a driver to collect experience.
collect_driver = py_driver.PyDriver(
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
      agent.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=collect_steps_per_iteration)

for _ in range(num_iterations):

  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
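
(Not shown above: fc_layer_params, learning_rate, batch_size, num_iterations, collect_steps_per_iteration, num_eval_episodes, log_interval, eval_interval, rb_observer, iterator and compute_avg_return. The rest of the setup follows the TF-Agents DQN tutorial; the sketch below is a rough reconstruction with tutorial-style placeholder values, assuming the tutorial's Reverb-based replay buffer, not my exact code.)

import reverb
from tf_agents.replay_buffers import reverb_replay_buffer, reverb_utils

# Hyperparameters (tutorial-style placeholder values).
num_iterations = 20000
collect_steps_per_iteration = 1
replay_buffer_max_length = 100000
batch_size = 64
learning_rate = 1e-3
log_interval = 200
num_eval_episodes = 10
eval_interval = 1000
fc_layer_params = (100, 50)

def compute_avg_return(environment, policy, num_episodes=10):
  # Average undiscounted return of `policy` over `num_episodes` episodes.
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

# Reverb table, replay buffer, observer and dataset iterator (tutorial-style).
table_name = 'uniform_table'
replay_buffer_signature = tensor_spec.from_spec(agent.collect_data_spec)
replay_buffer_signature = tensor_spec.add_outer_dim(replay_buffer_signature)

table = reverb.Table(
    table_name,
    max_size=replay_buffer_max_length,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature)
reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    table_name=table_name,
    sequence_length=2,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
    replay_buffer.py_client, table_name, sequence_length=2)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
iterator = iter(dataset)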

But when I run this code, an error occurs on the first iteration of the loop, at

time_step, _ = collect_driver.run(time_step)

The error:

ValueError                                Traceback (most recent call last)
<ipython-input-13-82e17be2955c> in <cell line: 61>()
     62   print(f'do while')
     63   # Collect a few steps and save to the replay buffer.
---> 64   time_step, _ = collect_driver.run(time_step)
     65 
     66   # Sample a batch of data from the buffer and update the agent's network.

6 frames
/usr/local/lib/python3.10/dist-packages/tf_agents/drivers/py_driver.py in run(self, time_step, policy_state)
    116         policy_state = self._policy.get_initial_state(self.env.batch_size or 1)
    117 
--> 118       action_step = self.policy.action(time_step, policy_state)
    119       next_time_step = self.env.step(action_step.action)
    120 

/usr/local/lib/python3.10/dist-packages/tf_agents/policies/py_policy.py in action(self, time_step, policy_state, seed)
    159       return self._action(time_step, policy_state, seed=seed)
    160     else:
--> 161       return self._action(time_step, policy_state)
    162 
    163   @property

/usr/local/lib/python3.10/dist-packages/tf_agents/policies/py_tf_eager_policy.py in _action(self, time_step, policy_state, seed)
    102       policy_step = self._policy_action_fn(time_step, policy_state, seed=seed)
    103     else:
--> 104       policy_step = self._policy_action_fn(time_step, policy_state)
    105     if not self._batch_time_steps:
    106       return policy_step

/usr/local/lib/python3.10/dist-packages/tensorflow/python/util/traceback_utils.py in error_handler(*args, **kwargs)
    151     except Exception as e:
    152       filtered_tb = _process_traceback_frames(e.__traceback__)
--> 153       raise e.with_traceback(filtered_tb) from None
    154     finally:
    155       del filtered_tb

/usr/local/lib/python3.10/dist-packages/tf_agents/policies/tf_policy.py in action(self, time_step, policy_state, seed)
    322     if self._automatic_state_reset:
    323       policy_state = self._maybe_reset_state(time_step, policy_state)
--> 324     step = action_fn(time_step=time_step, policy_state=policy_state, seed=seed)
    325 
    326     def clip_action(action, action_spec):

/usr/local/lib/python3.10/dist-packages/tf_agents/utils/common.py in with_check_resource_vars(*fn_args, **fn_kwargs)
    186         # We're either in eager mode or in tf.function mode (no in-between); so
    187         # autodep-like behavior is already expected of fn.
--> 188         return fn(*fn_args, **fn_kwargs)
    189       if not resource_variables_enabled():
    190         raise RuntimeError(MISSING_RESOURCE_VARIABLES_ERROR)

/usr/local/lib/python3.10/dist-packages/tf_agents/policies/epsilon_greedy_policy.py in _action(self, time_step, policy_state, seed)
    128     outer_ndims = int(outer_shape.shape[0])
    129     if outer_ndims >= 2:
--> 130       raise ValueError(
    131           'Only supports batched time steps with a single batch dimension')
    132     action = tf.nest.map_structure(lambda g, r: tf.compat.v1.where(cond, g, r),

ValueError: Only supports batched time steps with a single batch dimension