enable single gpu per node (#218)

William Falcon 2019-09-09 07:37:20 -04:00 committed by GitHub
parent a6fe6f0917
commit 506d5da68b
3 changed files with 62 additions and 25 deletions
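
In user-facing terms the change means a single-GPU run no longer needs an explicit distributed backend, while a multi-GPU run must name one. A minimal sketch of both paths, assuming the Trainer API of this era (gpus, distributed_backend, max_nb_epochs) and a hypothetical LightningModule subclass MyModel:

    from pytorch_lightning import Trainer

    model = MyModel()  # hypothetical LightningModule subclass

    # single GPU: no backend needed after this change;
    # the Trainer flags self.single_gpu = True internally
    Trainer(gpus=1, max_nb_epochs=1).fit(model)

    # multiple GPUs: a backend must be named explicitly, otherwise
    # __set_distributed_mode raises MisconfigurationException
    Trainer(gpus=2, distributed_backend='ddp', max_nb_epochs=1).fit(model)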


@@ -181,7 +181,7 @@ class Trainer(TrainerIO):
         self.proc_rank = 0
         self.world_size = 1
         self.node_rank = 0
-        self.__configure_slurm_ddp(self.data_parallel_device_ids, nb_gpu_nodes)
+        self.__configure_slurm_ddp(nb_gpu_nodes)

         # nvidia setup
         self.__set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)
@@ -284,51 +284,59 @@ class Trainer(TrainerIO):
             raise MisconfigurationException(m)

     def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
-        # make DP and DDP mutually exclusive
-        # single GPU will also use DP with devices=[0]
-        requested_gpus = self.data_parallel_device_ids is not None
+        # skip for CPU
+        if self.num_gpus == 0:
+            return

-        num_gpus = self.num_gpus
-        if num_gpus > 0:
-            # single GPU case
-            if num_gpus == 1:
-                self.single_gpu = True
+        # single GPU case
+        if self.num_gpus == 1:
+            self.single_gpu = True

-            elif num_gpus > 1 and distributed_backend is not None:
-                # DP, DDP case
-                if distributed_backend is not None:
-                    self.use_dp = distributed_backend == 'dp'
-                    self.use_ddp = distributed_backend == 'ddp'
-
-                # use ddp automatically if nb_gpu_nodes > 1
-                if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
-                    self.use_ddp = True
-                    self.use_dp = False
-                    w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
-                        'Switching to DistributedDataParallel for you. ' \
-                        'To silence this warning set distributed_backend=ddp'
-                    warnings.warn(w)
+        # multiple GPU case
+        elif self.num_gpus > 1:
+            if distributed_backend is not None:
+                # DP, DDP case
+                self.use_dp = distributed_backend == 'dp'
+                self.use_ddp = distributed_backend == 'ddp'
+            elif distributed_backend is None:
+                m = 'When using multiple GPUs set ' \
+                    'Trainer(distributed_backend=dp) (or ddp)'
+                raise MisconfigurationException(m)
+
+            # use ddp automatically if nb_gpu_nodes > 1
+            if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
+                self.use_ddp = True
+                self.use_dp = False
+                w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
+                    'Switching to DistributedDataParallel for you. ' \
+                    'To silence this warning set distributed_backend=ddp'
+                warnings.warn(w)

         print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))

-    def __configure_slurm_ddp(self, gpu_ids, nb_gpu_nodes):
+    def __configure_slurm_ddp(self, nb_gpu_nodes):
         self.is_slurm_managing_tasks = False

-        nb_gpus = len(gpu_ids) if type(gpu_ids) is list else gpu_ids
-
         # extract SLURM flag vars
         # whenever we have the correct number of tasks, we let slurm manage processes
         # otherwise we launch the required number of processes
         if self.use_ddp:
-            self.nb_requested_gpus = nb_gpus * nb_gpu_nodes
+            self.nb_requested_gpus = self.num_gpus * nb_gpu_nodes
             self.nb_slurm_tasks = 0
             try:
                 self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
                 self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus

                 # in interactive mode we don't manage tasks
                 job_name = os.environ['SLURM_JOB_NAME']
                 if job_name == 'bash':
                     self.is_slurm_managing_tasks = False

             except Exception:
                 # likely not on slurm, so set the slurm managed flag to false
                 self.is_slurm_managing_tasks = False
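
For reference, the SLURM branch above boils down to one comparison: let SLURM own process creation only when it already launched exactly one task per requested GPU across all nodes, and never inside an interactive (bash) job. A standalone sketch of that decision, with the name slurm_manages_tasks chosen here purely for illustration:

    import os

    def slurm_manages_tasks(num_gpus, nb_gpu_nodes):
        # mirrors the check in __configure_slurm_ddp above
        requested_gpus = num_gpus * nb_gpu_nodes
        try:
            nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
            if os.environ['SLURM_JOB_NAME'] == 'bash':
                # interactive session: launch the processes ourselves
                return False
            return nb_slurm_tasks == requested_gpus
        except (KeyError, ValueError):
            # not running under SLURM
            return False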


@@ -586,13 +586,42 @@ def test_amp_single_gpu():
         show_progress_bar=True,
         max_nb_epochs=1,
         gpus=1,
-        distributed_backend='dp',
+        distributed_backend='ddp',
         use_amp=True
     )

     run_gpu_model_test(trainer_options, model, hparams)


+def test_no_amp_single_gpu():
+    """
+    Make sure DDP + AMP work
+    :return:
+    """
+    if not torch.cuda.is_available():
+        warnings.warn('test_amp_gpu_ddp cannot run.'
+                      'Rerun on a GPU node to run this test')
+        return
+    if not torch.cuda.device_count() > 1:
+        warnings.warn('test_amp_gpu_ddp cannot run.'
+                      'Rerun on a node with 2+ GPUs to run this test')
+        return
+
+    hparams = get_hparams()
+    model = LightningTestModel(hparams)
+
+    trainer_options = dict(
+        show_progress_bar=True,
+        max_nb_epochs=1,
+        gpus=1,
+        distributed_backend='dp',
+        use_amp=True
+    )
+
+    with pytest.raises((MisconfigurationException, ModuleNotFoundError)):
+        run_gpu_model_test(trainer_options, model, hparams)
+
+
 def test_cpu_restore_training():
     """
     Verify continue training session on CPU

@@ -34,8 +34,8 @@ deps =
 commands =
     check-manifest --ignore tox.ini
     python setup.py check -m -s
-    coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules
     flake8 .
+    coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules

 [flake8]
 exclude = .tox,*.egg,build,temp,examples/*