I am trying to train a custom environment with stable-baselines3 inside a Singularity image, but training is always interrupted by SIGTERM. Here is the output:
Loguru caught a signal: SIGTERM
Loguru caught a signal:
SIGTERM
Loguru caught a signal:
Loguru caught a signal: Loguru caught a signal: SIGTERMLoguru caught a signal:
SIGTERM
SIGTERMSIGTERMLoguru caught a signal:
SIGTERM
Stack trace:
35 0x56026b2de245 _start + 37
34 0x7fc941e27e40 __libc_start_main + 128
33 0x7fc941e27d90 /lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x7fc941e27d90]
32 0x56026b2de34d Py_BytesMain + 45
31 0x56026b308165 Py_RunMain + 885
30 0x56026b309241 PyRun_SimpleStringFlags + 65
29 0x56026b309391 PyRun_StringFlags + 129
28 0x56026b310d5b /usr/bin/python3(+0x25ed5b) [0x56026b310d5b]
27 0x56026b317f08 /usr/bin/python3(+0x265f08) [0x56026b317f08]
26 0x56026b2eb456 PyEval_EvalCode + 134
25 0x56026b1f3766 /usr/bin/python3(+0x141766) [0x56026b1f3766]
24 0x56026b1f93b0 _PyEval_EvalFrameDefault + 10864
23 0x56026b21d032 PyObject_Call + 290
22 0x56026b20e3ac _PyFunction_Vectorcall + 124
21 0x56026b1f7005 _PyEval_EvalFrameDefault + 1733
20 0x56026b20e3ac _PyFunction_Vectorcall + 124
19 0x56026b1fca72 _PyEval_EvalFrameDefault + 24882
18 0x56026b20e3ac _PyFunction_Vectorcall + 124
17 0x56026b1f714a _PyEval_EvalFrameDefault + 2058
16 0x56026b20e3ac _PyFunction_Vectorcall + 124
15 0x56026b1f714a _PyEval_EvalFrameDefault + 2058
14 0x56026b20e3ac _PyFunction_Vectorcall + 124
13 0x56026b1f93b0 _PyEval_EvalFrameDefault + 10864
12 0x56026b20e3ac _PyFunction_Vectorcall + 124
11 0x56026b1f714a _PyEval_EvalFrameDefault + 2058
10 0x56026b20e3ac _PyFunction_Vectorcall + 124
9 0x56026b1f714a _PyEval_EvalFrameDefault + 2058
8 0x56026b20e3ac _PyFunction_Vectorcall + 124
7 0x56026b1f714a _PyEval_EvalFrameDefault + 2058
6 0x56026b20e3ac _PyFunction_Vectorcall + 124
5 0x56026b1f7005 _PyEval_EvalFrameDefault + 1733
4 0x56026b20e609 /usr/bin/python3(+0x15c609) [0x56026b20e609]
3 0x56026b33c9e0 /usr/bin/python3(+0x28a9e0) [0x56026b33c9e0]
2 0x56026b240bb8 _Py_read + 216
1 0x7fc941f129cc read + 76
0 0x7fc941e40520 /lib/x86_64-linux-gnu/libc.so.6(+0x42520) [0x7fc941e40520]
There are 40 CPUs available on my system, and 36 of them are used to train my algorithm. In fact, my custom environment is based on finite element methods, which means it needs a lot of computing resources. I have tried reducing the number of CPUs and the learning rate, but neither helped. Here is train.py:
import sys
from envs.active_gym import ActiveControl
from envs.passive_gym import PassiveControl
from stable_baselines3 import SAC, PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList, BaseCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.env_util import make_vec_env
import os
# Output locations used by the training script.
models_dir = "models/PPO_ACTIVE"
model_path = f"{models_dir}/1000.zip"
logdir = "logs"

# Create the output directories up front. ``exist_ok=True`` replaces the
# check-then-create pattern, which can race when several processes import
# this module at the same time (e.g. vectorized-environment workers).
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)
class MyCallback(BaseCallback):
    """Print training progress after every vectorized environment step."""

    def __init__(self, verbose=1):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """Print ``progress:<done>/<budget>`` and let training continue.

        Returns:
            Always ``True``; this callback never requests early stopping.
        """
        current_step = self.model.num_timesteps
        # The algorithm object has no public ``total_timesteps`` attribute, so
        # reading ``self.model.total_timesteps`` raises AttributeError and
        # aborts training. The budget passed to ``learn()`` is available in
        # the callback's ``locals``; fall back to the private attribute.
        total_steps = self.locals.get(
            "total_timesteps", getattr(self.model, "_total_timesteps", None)
        )
        print("progress:{}/{}".format(current_step, total_steps))
        return True


def main():
    """Train PPO on ``ActiveControl`` with checkpoints, evaluation and progress output.

    The vectorized environments are always closed on exit, including when
    training raises, so the worker subprocesses are shut down cleanly.
    """
    env = make_vec_env(ActiveControl, n_envs=36, vec_env_cls=SubprocVecEnv)
    eval_env = None
    try:
        # Evaluation gets its own environment: evaluating on the training
        # env resets it in the middle of a rollout and corrupts the data
        # collected for the next update.
        eval_env = make_vec_env(ActiveControl, n_envs=1, vec_env_cls=SubprocVecEnv)

        # NOTE(review): one rollout collects n_steps * n_envs = 1024 * 36 =
        # 36864 transitions, which already exceeds total_timesteps below.
        model = PPO(
            "MlpPolicy",
            env,
            verbose=1,
            tensorboard_log=logdir,
            n_steps=1024,
            batch_size=32,
            learning_rate=1e-4,
        )

        # NOTE(review): save_freq / eval_freq are counted in callback calls
        # (one per vectorized step), so with 36 envs 500 calls correspond to
        # 18000 timesteps. Confirm this is the intended cadence.
        checkpoint_callback = CheckpointCallback(
            save_freq=500,
            save_path="./logs/checkpoints",
            name_prefix="PPO_model",
        )
        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path="./logs/best_model",
            log_path="./logs/evaluations",
            eval_freq=500,
            deterministic=True,
            render=False,
        )
        print_callback = MyCallback()
        callback_list = CallbackList(
            [checkpoint_callback, eval_callback, print_callback]
        )

        # No explicit env.reset() here: learn() resets the environment itself
        # before collecting the first rollout.
        model.learn(
            total_timesteps=30720,
            reset_num_timesteps=False,
            tb_log_name="PPO_ACTIVE",
            callback=callback_list,
        )
    finally:
        env.close()
        if eval_env is not None:
            eval_env.close()


if __name__ == "__main__":
    main()