From 506d5da68b0bc9a65c8eab61f92e7cb89c1d7288 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 9 Sep 2019 07:37:20 -0400
Subject: [PATCH] enable single gpu per node (#218)

* enable single gpu per node

* enable single gpu per node

* enable single gpu per node

* enable single gpu per node

* enable single gpu per node

* enable single gpu per node
---
 pytorch_lightning/trainer/trainer.py | 54 ++++++++++++++++------------
 tests/test_models.py                 | 31 +++++++++++++++-
 tox.ini                              |  2 +-
 3 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 8b9a20c8cf..9ea5c0a919 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -181,7 +181,7 @@ class Trainer(TrainerIO):
         self.proc_rank = 0
         self.world_size = 1
         self.node_rank = 0
-        self.__configure_slurm_ddp(self.data_parallel_device_ids, nb_gpu_nodes)
+        self.__configure_slurm_ddp(nb_gpu_nodes)
 
         # nvidia setup
         self.__set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)
@@ -284,51 +284,59 @@ class Trainer(TrainerIO):
             raise MisconfigurationException(m)
 
     def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
-        # make DP and DDP mutually exclusive
-        # single GPU will also use DP with devices=[0]
-        requested_gpus = self.data_parallel_device_ids is not None
+        # skip for CPU
+        if self.num_gpus == 0:
+            return
 
-        num_gpus = self.num_gpus
-        if num_gpus > 0:
-            # single GPU case
-            if num_gpus == 1:
-                self.single_gpu = True
+        # single GPU case
+        if self.num_gpus == 1:
+            self.single_gpu = True
 
-            elif num_gpus > 1 and distributed_backend is not None:
-                # DP, DDP case
+            if distributed_backend is not None:
                 self.use_dp = distributed_backend == 'dp'
                 self.use_ddp = distributed_backend == 'ddp'
 
-                # use ddp automatically if nb_gpu_nodes > 1
-                if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
-                    self.use_ddp = True
-                    self.use_dp = False
-                    w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
-                        'Switching to DistributedDataParallel for you. ' \
-                        'To silence this warning set distributed_backend=ddp'
-                    warnings.warn(w)
+        # multiple GPU case
+        elif self.num_gpus > 1:
+            if distributed_backend is not None:
+                # DP, DDP case
+                self.use_dp = distributed_backend == 'dp'
+                self.use_ddp = distributed_backend == 'ddp'
 
             elif distributed_backend is None:
                 m = 'When using multiple GPUs set ' \
                     'Trainer(distributed_backend=dp) (or ddp)'
                 raise MisconfigurationException(m)
 
+        # use ddp automatically if nb_gpu_nodes > 1
+        if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
+            self.use_ddp = True
+            self.use_dp = False
+            w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
+                'Switching to DistributedDataParallel for you. ' \
+                'To silence this warning set distributed_backend=ddp'
+            warnings.warn(w)
+
         print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))
 
-    def __configure_slurm_ddp(self, gpu_ids, nb_gpu_nodes):
+    def __configure_slurm_ddp(self, nb_gpu_nodes):
         self.is_slurm_managing_tasks = False
-        nb_gpus = len(gpu_ids) if type(gpu_ids) is list else gpu_ids
 
         # extract SLURM flag vars
         # whenever we have the correct number of tasks, we let slurm manage processes
         # otherwise we launch the required number of processes
         if self.use_ddp:
-            self.nb_requested_gpus = nb_gpus * nb_gpu_nodes
+            self.nb_requested_gpus = self.num_gpus * nb_gpu_nodes
             self.nb_slurm_tasks = 0
             try:
                 self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
                 self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus
+
+                # in interactive mode we don't manage tasks
+                job_name = os.environ['SLURM_JOB_NAME']
+                if job_name == 'bash':
+                    self.is_slurm_managing_tasks = False
+
             except Exception:
                 # likely not on slurm, so set the slurm managed flag to false
                 self.is_slurm_managing_tasks = False
diff --git a/tests/test_models.py b/tests/test_models.py
index eefc70ba39..d0214ea321 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -586,13 +586,42 @@ def test_amp_single_gpu():
         show_progress_bar=True,
         max_nb_epochs=1,
         gpus=1,
-        distributed_backend='dp',
+        distributed_backend='ddp',
         use_amp=True
     )
 
     run_gpu_model_test(trainer_options, model, hparams)
 
 
+def test_no_amp_single_gpu():
+    """
+    Make sure DDP + AMP work
+    :return:
+    """
+    if not torch.cuda.is_available():
+        warnings.warn('test_amp_gpu_ddp cannot run.'
+                      'Rerun on a GPU node to run this test')
+        return
+    if not torch.cuda.device_count() > 1:
+        warnings.warn('test_amp_gpu_ddp cannot run.'
+                      'Rerun on a node with 2+ GPUs to run this test')
+        return
+
+    hparams = get_hparams()
+    model = LightningTestModel(hparams)
+
+    trainer_options = dict(
+        show_progress_bar=True,
+        max_nb_epochs=1,
+        gpus=1,
+        distributed_backend='dp',
+        use_amp=True
+    )
+
+    with pytest.raises((MisconfigurationException, ModuleNotFoundError)):
+        run_gpu_model_test(trainer_options, model, hparams)
+
+
 def test_cpu_restore_training():
     """
     Verify continue training session on CPU
diff --git a/tox.ini b/tox.ini
index f0ae2f5456..007b0a4783 100644
--- a/tox.ini
+++ b/tox.ini
@@ -34,8 +34,8 @@ deps =
 commands =
     check-manifest --ignore tox.ini
     python setup.py check -m -s
-    coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules
     flake8 .
+    coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules
 
 [flake8]
 exclude = .tox,*.egg,build,temp,examples/*
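
For context, a minimal usage sketch of what this patch enables, not part of the commit: with gpus=1 the Trainer now takes the single_gpu path in __set_distributed_mode, so no distributed_backend ('dp'/'ddp') has to be passed. The sketch assumes the 0.5.x-era keyword arguments used in the tests above (gpus, max_nb_epochs, show_progress_bar) and reuses the LightningTestModel/get_hparams helpers from tests/test_models.py; the import path for those helpers is an assumption.

    # Sketch only (not part of the patch): single-GPU training after this change.
    # gpus=1 yields num_gpus == 1, which sets self.single_gpu = True, so no
    # distributed_backend is required.
    from pytorch_lightning import Trainer
    from tests.test_models import LightningTestModel, get_hparams  # assumed import path

    model = LightningTestModel(get_hparams())
    trainer = Trainer(gpus=1, max_nb_epochs=1, show_progress_bar=True)
    trainer.fit(model)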