I have code that I would like to be executed by several different threads, and each thread should make use of a GPU in my cluster. What I do is call Slurm with the script below, which runs daemon.py:
#!/bin/bash
#SBATCH --account XXX
#SBATCH --nodelist=node5
#SBATCH -p XXX
#SBATCH -o ./results/out.%t.%j.%N.txt
#SBATCH -e ./results/out.%t.%j.%N.err
#SBATCH -J JobA
#SBATCH --get-user-env
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-core=1
#SBATCH --gres=gpu:a16:3
#SBATCH --hint=multithread
export CUDA_VISIBLE_DEVICES=all
srun python -u daemon.py
daemon.py then takes care of launching a number of threads equal to the number of GPUs available, and of making sure that when one thread finishes, another one is launched with a different set of parameters taken from a "pool" of parameters (the global list at the beginning).
import logging
import threading
import subprocess
import time
import sys
import torch
print("GPUS",torch.cuda.device_count())
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
parameters = [
    (0,),
    (1,),
    (2,),
    (3,),
    (4,),
    (5,),
    (6,),
    (7,)
]
N_GPUS = 5 # Number of GPUs available
def launch_script(param):
    logging.info(f"Launching with: {param}")
    subprocess.run(["python3", "test.py"] + [str(arg) for arg in param])
    logging.info(f"The script using parameters {param} is ended")

def end_daemon():
    for thread in threads:
        thread.join()
    logging.info("Closing daemon")
    sys.exit()
if __name__ == "__main__":
    param_index = 0
    threads = list()
    # start one worker thread per GPU, each taking the next parameter set
    for i in range(N_GPUS):
        if param_index >= len(parameters):
            end_daemon()
        arg = (i,) + parameters[param_index]
        x = threading.Thread(target=launch_script, args=(arg,))
        threads.append(x)
        param_index += 1
        x.start()
    # whenever a worker finishes, replace it with a new one on the same slot
    while True:
        for thread in threads:
            if not thread.is_alive():
                if param_index >= len(parameters):
                    end_daemon()
                i = threads.index(thread)
                arg = (i,) + parameters[param_index]
                new_thread = threading.Thread(target=launch_script, args=(arg,))
                threads[i] = new_thread
                new_thread.start()
                param_index += 1
        time.sleep(1)
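As an aside, the first element of each argument tuple is meant to select the GPU that the child process should use. Below is a minimal sketch of the variant I have been considering, where the child is pinned to a single GPU through CUDA_VISIBLE_DEVICES in its environment instead of selecting a device index inside test.py (launch_script_pinned and gpu_id are just illustrative names, this is not what I currently run):
import os
import subprocess

def launch_script_pinned(param):
    # param = (gpu_id, *script_args); restrict the child process to one GPU
    # by overriding CUDA_VISIBLE_DEVICES, so that inside test.py the only
    # visible device is cuda:0
    gpu_id, *script_args = param
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    subprocess.run(["python3", "test.py"] + [str(arg) for arg in script_args], env=env)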
test.py:
import sys
import time
import torch
print("Script:", sys.argv[0])
print("Args:", sys.argv[1:])
device = torch.device(f'cuda:{sys.argv[1]}')
print("dev:",device)
print("cuda available:",torch.cuda.is_available())
print("n dev:",torch.cuda.device_count())
time.sleep(10)
output:
3
2024-02-16 15:49:28,760 [INFO] Launching with: (0, 0)
2024-02-16 15:49:28,760 [INFO] Launching with: (1, 1)
2024-02-16 15:49:28,761 [INFO] Launching with: (2, 2)
2024-02-16 15:49:28,762 [INFO] Launching with: (3, 3)
2024-02-16 15:49:28,763 [INFO] Launching with: (4, 4)
Script: test.py
Args: ['0', '0']
dev: cuda:0
cuda available: True
n dev: 3
Script: test.py
Args: ['4', '4']
dev: cuda:4
cuda available: True
n dev: 3
Script: test.py
Args: ['3', '3']
dev: cuda:3
cuda available: True
n dev: 3
Script: test.py
Args: ['1', '1']
dev: cuda:1
cuda available: True
n dev: 3
Script: test.py
Args: ['2', '2']
dev: cuda:2
cuda available: True
n dev: 3
2024-02-16 15:49:45,783 [INFO] The script using parameters (3, 3) is ended
2024-02-16 15:49:45,824 [INFO] The script using parameters (1, 1) is ended
2024-02-16 15:49:45,845 [INFO] The script using parameters (0, 0) is ended
2024-02-16 15:49:45,865 [INFO] The script using parameters (4, 4) is ended
2024-02-16 15:49:45,943 [INFO] The script using parameters (2, 2) is ended
2024-02-16 15:49:46,765 [INFO] Launching with: (0, 5)
2024-02-16 15:49:46,766 [INFO] Launching with: (1, 6)
2024-02-16 15:49:46,766 [INFO] Launching with: (2, 7)
Script: test.py
Args: ['1', '6']
dev: cuda:1
cuda available: True
n dev: 3
Script: test.py
Args: ['0', '5']
dev: cuda:0
cuda available: True
n dev: 3
Script: test.py
Args: ['2', '7']
dev: cuda:2
cuda available: True
n dev: 3
2024-02-16 15:50:00,646 [INFO] The script using parameters (1, 6) is ended
2024-02-16 15:50:00,707 [INFO] The script using parameters (0, 5) is ended
2024-02-16 15:50:00,919 [INFO] The script using parameters (2, 7) is ended
2024-02-16 15:50:00,919 [INFO] Closing daemon
node configuration:
NodeName=node5 Arch=x86_64 CoresPerSocket=16
CPUAlloc=2 CPUTot=64 CPULoad=0.09
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:a16:3
NodeAddr=node5 NodeHostName=node5 Version=20.11.4
OS=Linux 5.10.0-20-amd64 #1 SMP Debian 5.10.158-2 (2022-12-13)
RealMemory=773432 AllocMem=773432 FreeMem=599855 Sockets=2 Boards=1
State=MIXED ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=X
BootTime=2023-05-05T13:01:56 SlurmdStartTime=2023-10-03T17:10:05
CfgTRES=cpu=64,mem=773432M,billing=64,gres/gpu=3
AllocTRES=cpu=2,mem=773432M,gres/gpu=1
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
The problem is that those A16 GPUs are supposed to be virtualized, so I would expect to see 12 (4x3), but I keep seeing only 3. What am I doing wrong?
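For reference, this is roughly how I check what the job actually sees, reusing only the torch calls already shown above (a quick sanity check, not part of the pipeline):
import os
import torch

# quick sanity check of what the job sees
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("device count:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))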