During training, the algorithm produces NaN values inside the neural network's output. I found several fixes proposed in related issues and tried all of them, but I still get the error:

- Changing np.float64 to np.float32, which didn't help.
- Setting use_expln=True, which MaskablePPO doesn't support.
- Changing model hyperparameters such as gamma, which produced the same error.
- Decreasing the learning rate, which again hit the error.
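For example, one of the lower learning-rate runs I tried looked like this (the value shown is just one of several I tested; env is the wrapped environment from the code below):

# Illustrative variant of the model construction from my script below.
model = MaskablePPO(MaskableActorCriticPolicy, env, learning_rate=1e-5, gamma=0.001, verbose=0)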
My code:

from typing import List

import gym
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3.common.callbacks import CheckpointCallback


class custom(gym.Env):
    ...  # environment implementation omitted


env = custom()


def mask_fn(env: gym.Env) -> List[bool]:
    # Delegate to the environment's own mask of currently valid actions.
    return env.valid_action_mask()


env = ActionMasker(env, mask_fn)

model = MaskablePPO(MaskableActorCriticPolicy, env, gamma=0.001, verbose=0)
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='logs',
                                         name_prefix='rl_model')
model.learn(500000, callback=checkpoint_callback)
model.save("JOM")
The error is:
ValueError Traceback (most recent call last)
<ipython-input-9-abee064644f3> in <cell line: 3>()
1 checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='logs',
2 name_prefix='rl_model')
----> 3 model.learn(500000, callback=checkpoint_callback)
4 model.save("JOM")
8 frames
/usr/local/lib/python3.10/dist-packages/sb3_contrib/ppo_mask/ppo_mask.py in learn(self, total_timesteps, callback, log_interval, tb_log_name, reset_num_timesteps, use_masking, progress_bar)
545 self.logger.dump(step=self.num_timesteps)
546
--> 547 self.train()
548
549 callback.on_training_end()
/usr/local/lib/python3.10/dist-packages/sb3_contrib/ppo_mask/ppo_mask.py in train(self)
410 actions = rollout_data.actions.long().flatten()
411
--> 412 values, log_prob, entropy = self.policy.evaluate_actions(
413 rollout_data.observations,
414 actions,
/usr/local/lib/python3.10/dist-packages/sb3_contrib/common/maskable/policies.py in evaluate_actions(self, obs, actions, action_masks)
331 latent_vf = self.mlp_extractor.forward_critic(vf_features)
332
--> 333 distribution = self._get_action_dist_from_latent(latent_pi)
334 if action_masks is not None:
335 distribution.apply_masking(action_masks)
/usr/local/lib/python3.10/dist-packages/sb3_contrib/common/maskable/policies.py in _get_action_dist_from_latent(self, latent_pi)
244 """
245 action_logits = self.action_net(latent_pi)
--> 246 return self.action_dist.proba_distribution(action_logits=action_logits)
247
248 def _predict(
/usr/local/lib/python3.10/dist-packages/sb3_contrib/common/maskable/distributions.py in proba_distribution(self, action_logits)
192 reshaped_logits = action_logits.view(-1, sum(self.action_dims))
193
--> 194 self.distributions = [
195 MaskableCategorical(logits=split) for split in th.split(reshaped_logits, tuple(self.action_dims), dim=1)
196 ]
/usr/local/lib/python3.10/dist-packages/sb3_contrib/common/maskable/distributions.py in <listcomp>(.0)
193
194 self.distributions = [
--> 195 MaskableCategorical(logits=split) for split in th.split(reshaped_logits, tuple(self.action_dims), dim=1)
196 ]
197 return self
/usr/local/lib/python3.10/dist-packages/sb3_contrib/common/maskable/distributions.py in __init__(self, probs, logits, validate_args, masks)
40 ):
41 self.masks: Optional[th.Tensor] = None
---> 42 super().__init__(probs, logits, validate_args)
43 self._original_logits = self.logits
44 self.apply_masking(masks)
/usr/local/lib/python3.10/dist-packages/torch/distributions/categorical.py in __init__(self, probs, logits, validate_args)
68 self._param.size()[:-1] if self._param.ndimension() > 1 else torch.Size()
69 )
---> 70 super().__init__(batch_shape, validate_args=validate_args)
71
72 def expand(self, batch_shape, _instance=None):
/usr/local/lib/python3.10/dist-packages/torch/distributions/distribution.py in __init__(self, batch_shape, event_shape, validate_args)
66 valid = constraint.check(value)
67 if not valid.all():
---> 68 raise ValueError(
69 f"Expected parameter {param} "
70 f"({type(value).__name__} of shape {tuple(value.shape)}) "
ValueError: Expected parameter logits (Tensor of shape (64, 2)) of distribution MaskableCategorical(logits: torch.Size([64, 2])) to satisfy the constraint IndependentConstraint(Real(), 1), but found invalid values:
tensor([[nan, nan],
        [nan, nan],
        ...
        [nan, nan]], grad_fn=<SubBackward0>)

(the tensor contains 64 identical [nan, nan] rows; truncated here for brevity)
Try to normalize your data inputs and outputs; a sketch of one way to do this is below. Also, can you give more details about the data you are using and the preprocessing you did before training?
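For example, a minimal sketch using stable-baselines3's VecNormalize to keep observations and rewards in a reasonable range (this assumes your observation space is a Box; the clip values and timestep count are illustrative):

from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Wrap the masked environment in a vectorized env so VecNormalize can track
# running statistics (`custom` and `mask_fn` are from the question above).
venv = DummyVecEnv([lambda: ActionMasker(custom(), mask_fn)])
venv = VecNormalize(venv, norm_obs=True, norm_reward=True,
                    clip_obs=10.0, clip_reward=10.0)

model = MaskablePPO(MaskableActorCriticPolicy, venv, verbose=0)
model.learn(100000)

# The normalization statistics live in the wrapper, so save them with the model.
model.save("JOM")
venv.save("vecnormalize.pkl")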