added test for no dist sampler

William Falcon 2019-07-24 16:57:21 -04:00
parent 5c21683566
commit 8064a77aa7
5 changed files with 61 additions and 28 deletions

View File

@@ -19,7 +19,7 @@ import tqdm
from pytorch_lightning.root_module.memory import get_gpu_memory_map
from pytorch_lightning.root_module.model_saving import TrainerIO
from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
from pytorch_lightning.utils.debugging import IncompatibleArgumentsException
from pytorch_lightning.utils.debugging import MisconfigurationException
try:
from apex import amp
@@ -392,7 +392,7 @@ class Trainer(TrainerIO):
dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=dist_sampler)
'''
raise Exception(msg)
raise MisconfigurationException(msg)
# -----------------------------
# MODEL TRAINING
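The message raised above points users at wrapping their dataset in a DistributedSampler before handing it to a DataLoader. A minimal standalone sketch of that pattern in plain PyTorch (the TensorDataset, batch size, and explicit num_replicas/rank are placeholders for illustration, not part of this commit):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# placeholder dataset standing in for the user's real training set
dataset = TensorDataset(torch.randn(128, 10), torch.randint(0, 2, (128,)))

# shard the dataset across processes; in a real ddp run you would omit
# num_replicas/rank so they are read from the initialized process group
dist_sampler = DistributedSampler(dataset, num_replicas=2, rank=0)

# pass the sampler (instead of shuffle=True) so each ddp process sees its own shard
dataloader = DataLoader(dataset, batch_size=32, sampler=dist_sampler)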
@@ -467,7 +467,7 @@ class Trainer(TrainerIO):
m = f'amp level {self.amp_level} with DataParallel is not supported. ' \
f'See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. ' \
f'We recommend you switch to ddp if you want to use amp'
raise IncompatibleArgumentsException(m)
raise MisconfigurationException(m)
# run through amp wrapper
if self.use_amp:
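The second guard above now raises the same exception type; per the message, the supported way to use amp across multiple GPUs is ddp rather than dp. A hedged sketch of that configuration, mirroring the options exercised by test_amp_gpu_ddp later in this commit (the trainer_options dicts in the tests are presumably unpacked into Trainer like this):

from pytorch_lightning import Trainer

# supported combination per the message above: ddp + amp
trainer = Trainer(
    max_nb_epochs=1,
    gpus=[0, 1],
    distributed_backend='ddp',
    use_amp=True,
)
# trainer.fit(model)  # model would be a LightningModule such as LightningTemplateModel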

View File

@@ -1,5 +1,5 @@
import pdb
import sys
class IncompatibleArgumentsException(Exception):
class MisconfigurationException(Exception):
pass
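The rename is behavior-preserving, so the only follow-up work is updating imports and except clauses that referenced the old class name. A small sketch of the catching side (the raise here is only to make the example self-contained):

from pytorch_lightning.utils.debugging import MisconfigurationException

try:
    # stand-in for a Trainer call that detects an invalid configuration
    raise MisconfigurationException('ddp requires a DistributedSampler on every dataloader')
except MisconfigurationException as err:
    print(f'bad trainer configuration: {err}')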

View File

@@ -1,10 +1,11 @@
import pytest
from pytorch_lightning import Trainer
from pytorch_lightning.examples.new_project_templates.lightning_module_template import LightningTemplateModel
from pytorch_lightning.testing_models.lm_test_module import LightningTestModel
from argparse import Namespace
from test_tube import Experiment
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.utils.debugging import IncompatibleArgumentsException
from pytorch_lightning.utils.debugging import MisconfigurationException
import numpy as np
import warnings
import torch
@@ -33,7 +34,8 @@ def test_cpu_model():
val_percent_check=0.4
)
run_gpu_model_test(trainer_options, on_gpu=False)
model, hparams = get_model()
run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
def test_all_features_cpu_model():
@@ -54,7 +56,8 @@ def test_all_features_cpu_model():
val_percent_check=0.4
)
run_gpu_model_test(trainer_options, on_gpu=False)
model, hparams = get_model()
run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
def test_early_stopping_cpu_model():
@@ -77,7 +80,8 @@ def test_early_stopping_cpu_model():
val_percent_check=0.4
)
run_gpu_model_test(trainer_options, on_gpu=False)
model, hparams = get_model()
run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
def test_single_gpu_model():
@@ -88,6 +92,7 @@ def test_single_gpu_model():
if not torch.cuda.is_available():
warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
return
model, hparams = get_model()
trainer_options = dict(
progress_bar=False,
@@ -97,7 +102,7 @@ def test_single_gpu_model():
gpus=[0]
)
run_gpu_model_test(trainer_options)
run_gpu_model_test(trainer_options, model, hparams)
def test_multi_gpu_model_dp():
@@ -111,7 +116,7 @@ def test_multi_gpu_model_dp():
if not torch.cuda.device_count() > 1:
warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
return
model, hparams = get_model()
trainer_options = dict(
progress_bar=False,
max_nb_epochs=1,
@@ -120,7 +125,7 @@ def test_multi_gpu_model_dp():
gpus=[0, 1]
)
run_gpu_model_test(trainer_options)
run_gpu_model_test(trainer_options, model, hparams)
def test_amp_gpu_dp():
@@ -134,15 +139,15 @@ def test_amp_gpu_dp():
if not torch.cuda.device_count() > 1:
warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
return
model, hparams = get_model()
trainer_options = dict(
max_nb_epochs=1,
gpus='0, 1', # test init with gpu string
distributed_backend='dp',
use_amp=True
)
with pytest.raises(IncompatibleArgumentsException):
run_gpu_model_test(trainer_options)
with pytest.raises(MisconfigurationException):
run_gpu_model_test(trainer_options, model, hparams)
def test_multi_gpu_model_ddp():
@@ -158,7 +163,7 @@ def test_multi_gpu_model_ddp():
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
model, hparams = get_model()
trainer_options = dict(
progress_bar=False,
max_nb_epochs=1,
@@ -168,7 +173,7 @@ def test_multi_gpu_model_ddp():
distributed_backend='ddp'
)
run_gpu_model_test(trainer_options)
run_gpu_model_test(trainer_options, model, hparams)
def test_amp_gpu_ddp():
@@ -185,6 +190,7 @@ def test_amp_gpu_ddp():
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
model, hparams = get_model()
trainer_options = dict(
progress_bar=True,
max_nb_epochs=1,
@@ -193,18 +199,14 @@ def test_amp_gpu_ddp():
use_amp=True
)
run_gpu_model_test(trainer_options)
run_gpu_model_test(trainer_options, model, hparams)
# ------------------------------------------------------------------------
# UTILS
# ------------------------------------------------------------------------
def run_gpu_model_test(trainer_options, on_gpu=True):
def test_ddp_sampler_error():
"""
Make sure ddp without a DistributedSampler raises MisconfigurationException
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
return
@@ -212,8 +214,34 @@ def run_gpu_model_test(trainer_options, on_gpu=True):
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
hparams = get_hparams()
model = LightningTestModel(hparams, force_remove_distributed_sampler=True)
trainer_options = dict(
progress_bar=True,
max_nb_epochs=1,
gpus=[0, 1],
distributed_backend='ddp',
use_amp=True
)
with pytest.raises(MisconfigurationException):
run_gpu_model_test(trainer_options, model, hparams)
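The new test leans on a force_remove_distributed_sampler flag on LightningTestModel; the flag's implementation is not shown in this diff. A hypothetical sketch of the mechanism it exercises (the class, hook name, and helpers below are illustrative assumptions, not the repository's actual test model):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

class ToyTestModel:
    # illustration only: when the flag is set, the dataloader is built without a
    # DistributedSampler, which is exactly what the Trainer's ddp check should reject
    def __init__(self, hparams, force_remove_distributed_sampler=False):
        self.hparams = hparams
        self.force_remove_distributed_sampler = force_remove_distributed_sampler
        self._dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))

    def tng_dataloader(self):
        if self.force_remove_distributed_sampler:
            # plain loader: under ddp the Trainer should raise MisconfigurationException
            return DataLoader(self._dataset, batch_size=self.hparams.batch_size)
        sampler = DistributedSampler(self._dataset, num_replicas=2, rank=0)
        return DataLoader(self._dataset, batch_size=self.hparams.batch_size, sampler=sampler)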
# ------------------------------------------------------------------------
# UTILS
# ------------------------------------------------------------------------
def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True):
"""
Make sure DDP + AMP work
:return:
"""
save_dir = init_save_dir()
model, hparams = get_model()
# exp file to get meta
exp = get_exp(False)
@@ -243,8 +271,7 @@ def run_gpu_model_test(trainer_options, on_gpu=True):
clear_save_dir()
def get_model():
# set up model with these hyperparams
def get_hparams():
root_dir = os.path.dirname(os.path.realpath(__file__))
hparams = Namespace(**{'drop_prob': 0.2,
'batch_size': 32,
@@ -254,6 +281,12 @@ def get_model():
'data_root': os.path.join(root_dir, 'mnist'),
'out_features': 10,
'hidden_dim': 1000})
return hparams
def get_model():
# set up model with these hyperparams
hparams = get_hparams()
model = LightningTemplateModel(hparams)
return model, hparams
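Splitting get_model into get_hparams plus get_model is what lets the new test construct LightningTestModel with a non-default constructor argument while every other test keeps its one-liner. Illustrative usage of the two helpers side by side:

model, hparams = get_model()        # standard template model for the existing tests
hparams = get_hparams()             # just the Namespace, no model attached
test_model = LightningTestModel(hparams, force_remove_distributed_sampler=True)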