added slurm no process warning

This commit is contained in:
William Falcon 2019-07-18 16:47:46 -04:00
parent 53a1a6d462
commit ad44d9168b
1 changed files with 17 additions and 3 deletions

View File

@ -374,9 +374,23 @@ class Trainer(TrainerIO):
if self.use_ddp: if self.use_ddp:
# must copy only the meta of the exp so it survives pickle/unpickle when going to new process # must copy only the meta of the exp so it survives pickle/unpickle when going to new process
self.experiment = self.experiment.get_meta_copy() self.experiment = self.experiment.get_meta_copy()
task = int(os.environ['SLURM_LOCALID'])
self.ddp_train(task, model) # whenever we have the correct number of tasks, we let slurm manage processes
# mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, )) # otherwise we launch the required number of processes
nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
nb_requested_gpus = len(self.data_parallel_device_ids)
is_slurm_managing_tasks = nb_slurm_tasks == nb_requested_gpus
if is_slurm_managing_tasks:
task = int(os.environ['SLURM_LOCALID'])
self.ddp_train(task, model)
else:
msg = f"""
You requested {nb_requested_gpus} GPUs but launched {nb_slurm_tasks} slurm tasks.
We will launch {nb_requested_gpus} processes for you.
We recommend you let slurm manage the processes by setting: --ntasks-per-node={nb_requested_gpus}
"""
warnings.warn(msg)
mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
# 1 gpu or dp option triggers training using DP module # 1 gpu or dp option triggers training using DP module
# easier to avoid NCCL issues # easier to avoid NCCL issues