I have a script that runs a simulation on a cluster via the LSF scheduler. In my setup, 'main.bash' submits the LSF job script 'simulator.bash'. If a thread/index fails for some reason, the job is requeued via brequeue until all the threads complete their tasks successfully.
With a scheduler like Slurm, we have scontrol, which can be used to modify attributes such as RUNTIMELIMIT, the number of cores, the number of nodes, etc. before submitting/requeueing a finished job.
My question is: is there a mechanism in LSF that lets us modify the attributes of a completed/finished job and then requeue it?
Contents of main.bash
#!/usr/bin/env bash
#######################################
# Report whether any element of an LSF job is in the EXIT state.
# Arguments: $1 - LSF job id
# Outputs:   stderr - per-element detail table from bjobs (informational)
#            stdout - exactly one line: "1" if any element's state contains
#                     EXIT, otherwise "0"
# Returns:   0 normally; 2 when no job id is given
#######################################
collect_job_status() {
  local jid=$1
  local status

  if [[ -z "$jid" ]]; then
    echo >&2 "collect_job_status: missing job id"
    return 2
  fi

  # The detail table goes to stderr so that callers capturing stdout with
  # $(...) receive only the 0/1 flag and can compare it numerically.
  bjobs -noheader -o 'jobid:-10 jobindex:-10 exit_code:-10 stat:-5 start_time:-15 finish_time:-15 runtimelimit:-15 resume_reason:15 exit_reason:20' "$jid" >&2

  # Third column of the default wide listing is the job state; a job array
  # yields one line (and therefore one state) per element.
  status=$(bjobs -noheader -w -a "$jid" | awk '{print $3}')
  if [[ "$status" == *EXIT* ]]; then
    echo 1
  else
    echo 0
  fi
}
#######################################
# Block until an LSF job has ended by submitting a short placeholder job
# that depends on it and waiting (bsub -K) for that placeholder to finish.
# Arguments: $1 - id of the LSF job to wait for
# Outputs:   progress message on stdout, plus whatever bsub prints
#######################################
_wait_for_job_end() {
  local jid=$1
  echo "Submitting dummy job for waiting."
  bsub -K -P myproject -w "ended($jid)" -W '00:10' \
    -J "DUMM_JOB_$((RANDOM % 1000))" -N 1 "sleep 5; exit 0;"
}

#######################################
# Submit ./simulator.bash to LSF, wait for it to end, and requeue its
# exited elements with brequeue until collect_job_status reports success.
# Arguments: none
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 when the job finally reports no exited element;
#            1 if submission fails, the job id cannot be parsed, or the
#            status flag is neither 0 nor 1
#######################################
main() {
  local output jid jname report failed runtime_limit
  local -r jobid_re='<([0-9]+)>'
  local -r rule='-----------------------------------------------------------------------------------------------'

  if ! output=$(bsub <./simulator.bash); then
    echo >&2 "Submission of ./simulator.bash failed."
    return 1
  fi
  # bsub reports the new job as "Job <id> ..."; take the first bracketed number.
  if ! [[ $output =~ $jobid_re ]]; then
    echo >&2 "Could not find a job id in bsub output: $output"
    return 1
  fi
  jid=${BASH_REMATCH[1]}
  echo "Submitted JobId = $jid"
  _wait_for_job_end "$jid"

  jname=$(bjobs -noheader -o 'job_name jobindex' "$jid" \
    | tr -s ' ' | sort -n -k2 | tail -1 | cut -d ' ' -f1)
  echo "Job NAME :: $jname"

  report=$(collect_job_status "$jid")
  echo "JOB EXIT STATUS for $jid :: $report"
  # Only the last line of the report is the 0/1 flag; any lines before it
  # are the informational table and must not reach the comparison.
  failed=${report##*$'\n'}

  while [[ "$failed" == 1 ]]; do
    printf '%s\n' "$rule"
    echo -e "Job $jname $jid Failed.\nREQUEUING JOB..."
    # Query the job that actually failed (not a fixed job id) and ask for the
    # runtimelimit column alone so no positional field counting is needed.
    runtime_limit=$(bjobs -noheader -o 'runtimelimit' "$jid" | sort -u -rn | head -n 1)
    echo "Current run time limit :: $runtime_limit"
    # NOTE(review): once requeued the job is pending again; `bmod -W` may be
    # usable at this point to raise the limit before it restarts -- confirm
    # against the LSF bmod documentation.
    brequeue -J "$jname" -e "$jid"
    _wait_for_job_end "$jid"
    report=$(collect_job_status "$jid")
    failed=${report##*$'\n'}
  done
  printf '%s\n' "$rule"

  if [[ "$failed" != 0 ]]; then
    echo >&2 "Unexpected status flag '$failed' for job $jid; giving up."
    return 1
  fi
}
# Script entry point: run the submit / wait / requeue workflow defined in main.
main
Contents of simulator.bash
#!/usr/bin/env bash
# LSF job script submitted as a 10-element job array. Each element prints the
# model directory and simulation file that correspond to its array index.
#
# Embedded bsub options (the #BSUB lines below must stay as they are):
#   -W    run time limit for the job
#   -P    project the job is charged to
#   -n    number of job slots/tasks requested
#   -o    file for standard output (%J = job id, %I = array index)
#   -e    file for standard error  (%J = job id, %I = array index)
#   -cwd  working directory for the job
#   -J    job name; the [1-10] suffix makes this a job array with indices 1..10
#BSUB -W 00:10
#BSUB -P myproject
#BSUB -n 1
#BSUB -o test_reque_%J_%I.out
#BSUB -e test_reque_%J_%I.err
#BSUB -cwd /opt
#BSUB -J TESTING_REQUE[1-10]
# Ask LSF not to send a job report mail.
# NOTE(review): this is exported inside the running job; confirm it has any
# effect here rather than needing to be set in the submission environment.
export LSB_JOB_REPORT_MAIL=N
# Model directories, one per job-array element. Element i of the job array
# (1-based) reads entry i-1 of this list, so the order matters.
MODEL_DIR_FOLDER=(
  "/path/to/model/1"
  "/path/to/model/2"
  "/path/to/model/3"
  "/path/to/model/4"
  "/path/to/model/5"
  "/path/to/model/6"
  "/path/to/model/7"
  "/path/to/model/8"
  "/path/to/model/9"
  "/path/to/model/10"
)
# Simulation file names, one per job-array element. Element i of the job
# array (1-based) reads entry i-1 of this list, so the order matters.
SIM_FILE_NAME=(
  "sim_file_1"
  "sim_file_2"
  "sim_file_3"
  "sim_file_4"
  "sim_file_5"
  "sim_file_6"
  "sim_file_7"
  "sim_file_8"
  "sim_file_9"
  "sim_file_10"
)
# Validate the array index before using it. Without this check an unset or
# zero LSB_JOBINDEX turns into array subscript -1, which silently selects the
# LAST entry (or errors on older bash), and an index past the end of either
# list produces empty strings.
job_index=${LSB_JOBINDEX:-}
if ! [[ $job_index =~ ^[0-9]+$ ]]; then
  echo >&2 "LSB_JOBINDEX='${job_index}' is not a number; run this script as an LSF job array element."
  exit 2
fi
job_index=$((10#$job_index)) # force base 10 so a value like "08" is not read as octal
if ((job_index < 1 || job_index > ${#MODEL_DIR_FOLDER[@]} || job_index > ${#SIM_FILE_NAME[@]})); then
  echo >&2 "LSB_JOBINDEX=${job_index} is outside the configured model/simulation lists."
  exit 2
fi

# Deliberately fail roughly every second run (RANDOM is even half the time)
# so that the requeue path can be exercised.
if ((RANDOM % 2 == 0)); then
  echo >&2 "FAILING THIS JOB $LSB_JOBINDEX"
  exit 127
fi

# Job indices are 1-based, bash arrays are 0-based.
echo "JOB@@_OUTPUT $LSB_JOBINDEX :: MODEL_DIR_FOLDER :: ${MODEL_DIR_FOLDER[job_index - 1]}"
echo "JOB@@_OUTPUT $LSB_JOBINDEX :: SIM_FILE_NAME :: ${SIM_FILE_NAME[job_index - 1]}"
sleep 10
exit 0