I am trying to train a TF-Agents agent in my custom environment, but I run into a problem when calling PyDriver.run.
The relevant code is below.
Environment spec code:
self._action_spec = array_spec.BoundedArraySpec(
    shape=(), dtype=np.float64, minimum=0, maximum=1, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
    shape=(120,), dtype=np.float64, minimum=0, maximum=1, name='observation')
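These specs live in my custom environment class myEnv, a py_environment.PyEnvironment subclass. A minimal skeleton of that class is shown below; the state, reward, and transition logic are placeholders, not the real implementation:
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

class myEnv(py_environment.PyEnvironment):

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float64, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(120,), dtype=np.float64, minimum=0, maximum=1, name='observation')
    self._state = np.zeros(120, dtype=np.float64)  # placeholder observation

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    # placeholder reset; the real environment fills self._state here
    return ts.restart(self._state)

  def _step(self, action):
    # placeholder transition; the real environment updates the state and reward here
    return ts.transition(self._state, reward=0.0)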
print(time_step)
> TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
'observation': <tf.Tensor: shape=(1, 120), dtype=float64, numpy=
array([[0.17898333, 0.73180428, 0.90454012, 0.68073447, 0.69538306,
0. , 0. , 0. , 0.59775008, 0.69606781,
0.41753748, 0.45289883, 0.99998333, 0.56987768, 0.97186544,
0.49617737, 0.73455657, 0.64861079, 0.80041813, 0.71556155,
0.64580592, 0.88711717, 0.43394742, 0.59295095, 0.99998333,
0.42415902, 0.84189602, 0.3824159 , 0.56972477, 0.41574846,
0.25581329, 0.50284396, 0.5894032 , 0.84072555, 0.34313801,
0.70594989, 0.99998333, 0.39449541, 0.69908257, 0.33302752,
0.42186544, 0.46702841, 0.57208744, 0.49632389, 0.5086307 ,
0.93144464, 0.37292618, 0.7363057 , 0.99998333, 0.43746177,
0.91727829, 0.25152905, 0.3793578 , 0.51416658, 0.52656553,
0.5631547 , 0.5403153 , 1. , 0.24251103, 0.27722734,
0.99998333, 0.31238532, 1. , 0.29816514, 0.43761468,
0.75773855, 1. , 0.753052 , 0.43961091, 0.6729494 ,
0. , 0.23295147, 0.99998333, 0.46100917, 0.46100917,
0.31238532, 0.31238532, 0.2425697 , 0.24285381, 0.37733244,
0.55076502, 0.60620017, 0.32501226, 0.41949841, 0.99998333,
0.16819572, 0.54174312, 0.16819572, 0.46100917, 0.64363478,
0.66984831, 0.70089137, 0.4827128 , 0.94337797, 0.09306706,
0.70081534, 0.99998333, 0.38272171, 0.41284404, 0. ,
0.16819572, 0.82348638, 0.64830812, 0.80847267, 0.31156952,
0.50762443, 0.14433522, 0.49824548, 0.99998333, 0.55642202,
0.55642202, 0.17599388, 0.38287462, 1. , 0.74092326,
1. , 0.45567252, 0.65255309, 0.30949248, 0.33390097]])>,
'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})
Code executed:
import numpy as np
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

env = myEnv()
train_env = tf_py_environment.TFPyEnvironment(myEnv())
eval_env = tf_py_environment.TFPyEnvironment(myEnv())
# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
# print(f'num_actions: {num_actions}')
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))
# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
train_step_counter = tf.Variable(0)
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)
# Reset the train step.
agent.train_step_counter.assign(0)
# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
# Reset the environment.
time_step = train_env.reset()
# Create a driver to collect experience.
collect_driver = py_driver.PyDriver(
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
        agent.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=collect_steps_per_iteration)
for _ in range(num_iterations):
  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
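The code above also references fc_layer_params, learning_rate, rb_observer, iterator and a few loop constants that are not shown; the sketch below (values and Reverb setup assumed, following the standard TF-Agents DQN tutorial) shows one way they can be defined:
# Assumed values; the real ones are not shown above.
num_iterations = 20000
collect_steps_per_iteration = 1
replay_buffer_max_length = 100000
batch_size = 64
learning_rate = 1e-3
log_interval = 200
num_eval_episodes = 10
eval_interval = 1000
fc_layer_params = (100, 50)

# Replay buffer, observer and dataset iterator (assumed, per the TF-Agents DQN tutorial).
import reverb
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils

table_name = 'uniform_table'
replay_buffer_signature = tensor_spec.from_spec(agent.collect_data_spec)
replay_buffer_signature = tensor_spec.add_outer_dim(replay_buffer_signature)

table = reverb.Table(
    table_name,
    max_size=replay_buffer_max_length,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature)

reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    table_name=table_name,
    sequence_length=2,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
    replay_buffer.py_client,
    table_name,
    sequence_length=2)

# Dataset and iterator used in the training loop.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)
iterator = iter(dataset)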
But when I run this code, an error occurs on the first iteration of the loop, at
time_step, _ = collect_driver.run(time_step)
The error is:
ValueError Traceback (most recent call last)
<ipython-input-13-82e17be2955c> in <cell line: 61>()
62 print(f'do while')
63 # Collect a few steps and save to the replay buffer.
---> 64 time_step, _ = collect_driver.run(time_step)
65
66 # Sample a batch of data from the buffer and update the agent's network.
6 frames
/usr/local/lib/python3.10/dist-packages/tf_agents/drivers/py_driver.py in run(self, time_step, policy_state)
116 policy_state = self._policy.get_initial_state(self.env.batch_size or 1)
117
--> 118 action_step = self.policy.action(time_step, policy_state)
119 next_time_step = self.env.step(action_step.action)
120
/usr/local/lib/python3.10/dist-packages/tf_agents/policies/py_policy.py in action(self, time_step, policy_state, seed)
159 return self._action(time_step, policy_state, seed=seed)
160 else:
--> 161 return self._action(time_step, policy_state)
162
163 @property
/usr/local/lib/python3.10/dist-packages/tf_agents/policies/py_tf_eager_policy.py in _action(self, time_step, policy_state, seed)
102 policy_step = self._policy_action_fn(time_step, policy_state, seed=seed)
103 else:
--> 104 policy_step = self._policy_action_fn(time_step, policy_state)
105 if not self._batch_time_steps:
106 return policy_step
/usr/local/lib/python3.10/dist-packages/tensorflow/python/util/traceback_utils.py in error_handler(*args, **kwargs)
151 except Exception as e:
152 filtered_tb = _process_traceback_frames(e.__traceback__)
--> 153 raise e.with_traceback(filtered_tb) from None
154 finally:
155 del filtered_tb
/usr/local/lib/python3.10/dist-packages/tf_agents/policies/tf_policy.py in action(self, time_step, policy_state, seed)
322 if self._automatic_state_reset:
323 policy_state = self._maybe_reset_state(time_step, policy_state)
--> 324 step = action_fn(time_step=time_step, policy_state=policy_state, seed=seed)
325
326 def clip_action(action, action_spec):
/usr/local/lib/python3.10/dist-packages/tf_agents/utils/common.py in with_check_resource_vars(*fn_args, **fn_kwargs)
186 # We're either in eager mode or in tf.function mode (no in-between); so
187 # autodep-like behavior is already expected of fn.
--> 188 return fn(*fn_args, **fn_kwargs)
189 if not resource_variables_enabled():
190 raise RuntimeError(MISSING_RESOURCE_VARIABLES_ERROR)
/usr/local/lib/python3.10/dist-packages/tf_agents/policies/epsilon_greedy_policy.py in _action(self, time_step, policy_state, seed)
128 outer_ndims = int(outer_shape.shape[0])
129 if outer_ndims >= 2:
--> 130 raise ValueError(
131 'Only supports batched time steps with a single batch dimension')
132 action = tf.nest.map_structure(lambda g, r: tf.compat.v1.where(cond, g, r),
ValueError: Only supports batched time steps with a single batch dimension