fixed correct module on hpc save

This commit is contained in:
William Falcon 2019-07-24 18:10:30 -04:00
parent 549a158ec0
commit 10330f1991
1 changed file with 185 additions and 276 deletions


@@ -21,53 +21,93 @@ np.random.seed(SEED)
# ------------------------------------------------------------------------
# TESTS
# ------------------------------------------------------------------------
def test_hpc_save_load_cpu_models():
def test_cpu_model():
"""
Make sure DP works
Make sure model trains on CPU
:return:
"""
trainer_options = dict(
progress_bar=False,
experiment=get_exp(),
max_nb_epochs=1,
train_percent_check=0.4,
val_percent_check=0.4
)
model, hparams = get_model()
run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
def test_all_features_cpu_model():
"""
Test each of the trainer options
:return:
"""
trainer_options = dict(
gradient_clip=1.0,
overfit_pct=0.20,
track_grad_norm=2,
print_nan_grads=True,
progress_bar=False,
experiment=get_exp(),
max_nb_epochs=1,
train_percent_check=0.4,
val_percent_check=0.4
)
model, hparams = get_model()
run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
def test_early_stopping_cpu_model():
"""
Test early stopping on a CPU model
:return:
"""
stopping = EarlyStopping()
trainer_options = dict(
early_stop_callback=stopping,
gradient_clip=1.0,
overfit_pct=0.20,
track_grad_norm=2,
print_nan_grads=True,
progress_bar=False,
experiment=get_exp(),
max_nb_epochs=1,
train_percent_check=0.4,
val_percent_check=0.4
)
model, hparams = get_model()
run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
def test_single_gpu_model():
"""
Make sure single GPU works (DP mode)
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
return
model, hparams = get_model()
trainer_options = dict(
progress_bar=False,
max_nb_epochs=1,
train_percent_check=0.1,
val_percent_check=0.1,
gpus=[0]
)
save_dir = init_save_dir()
# exp file to get meta
exp = get_exp(False)
exp.argparse(hparams)
exp.save()
# exp file to get weights
checkpoint = ModelCheckpoint(save_dir)
# add these to the trainer options
trainer_options['checkpoint_callback'] = checkpoint
trainer_options['experiment'] = exp
# fit model
trainer = Trainer(**trainer_options)
result = trainer.fit(model)
# correct result and ok accuracy
assert result == 1, 'amp + ddp model failed to complete'
trainer.hpc_save(save_dir, exp)
trainer.hpc_load(save_dir, on_gpu=True)
clear_save_dir()
run_gpu_model_test(trainer_options, model, hparams)
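Because the removed and added lines of this hunk are interleaved, the HPC-save wiring is easier to follow pulled out on its own. The following is a sketch only, not part of the diff, and assumes the test helpers (get_model, get_exp, init_save_dir, ModelCheckpoint, Trainer) behave as they do elsewhere in this file:

    # sketch: the experiment file records the hyperparameters (meta),
    # the ModelCheckpoint callback records the weights
    model, hparams = get_model()
    save_dir = init_save_dir()

    exp = get_exp(False)
    exp.argparse(hparams)
    exp.save()
    checkpoint = ModelCheckpoint(save_dir)

    trainer_options = dict(
        progress_bar=False,
        max_nb_epochs=1,
        checkpoint_callback=checkpoint,
        experiment=exp,
    )

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1, 'model failed to complete'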
def test_hpc_save_load_gpu_models():
def test_multi_gpu_model_dp():
"""
Make sure DP works
:return:
@@ -87,257 +127,122 @@ def test_hpc_save_load_gpu_models():
gpus=[0, 1]
)
save_dir = init_save_dir()
run_gpu_model_test(trainer_options, model, hparams)
# exp file to get meta
exp = get_exp(False)
exp.argparse(hparams)
# test memory helper functions
memory.get_gpu_memory_map()
def test_amp_gpu_dp():
"""
Make sure DP + AMP work
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
return
model, hparams = get_model()
trainer_options = dict(
max_nb_epochs=1,
gpus='0, 1', # test init with gpu string
distributed_backend='dp',
use_amp=True
)
with pytest.raises(MisconfigurationException):
run_gpu_model_test(trainer_options, model, hparams)
def test_multi_gpu_model_ddp():
"""
Make sure DDP works
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
model, hparams = get_model()
trainer_options = dict(
progress_bar=False,
max_nb_epochs=1,
train_percent_check=0.1,
val_percent_check=0.1,
gpus=[0, 1],
distributed_backend='ddp'
)
run_gpu_model_test(trainer_options, model, hparams)
def test_amp_gpu_ddp():
"""
Make sure DDP + AMP work
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
hparams = get_hparams()
model = LightningTestModel(hparams)
trainer_options = dict(
progress_bar=True,
max_nb_epochs=1,
gpus=[0, 1],
distributed_backend='ddp',
use_amp=True
)
run_gpu_model_test(trainer_options, model, hparams)
def test_ddp_sampler_error():
"""
Make sure removing the DistributedSampler under DDP raises a MisconfigurationException
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_ddp_sampler_error cannot run. Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_ddp_sampler_error cannot run. Rerun on a node with 2+ GPUs to run this test')
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
hparams = get_hparams()
model = LightningTestModel(hparams, force_remove_distributed_sampler=True)
exp = get_exp(True)
exp.save()
# exp file to get weights
checkpoint = ModelCheckpoint(save_dir)
trainer = Trainer(
experiment=exp,
progress_bar=False,
max_nb_epochs=1,
gpus=[0, 1],
distributed_backend='ddp',
use_amp=True
)
# add these to the trainer options
trainer_options['checkpoint_callback'] = checkpoint
trainer_options['experiment'] = exp
# fit model
trainer = Trainer(**trainer_options)
result = trainer.fit(model)
# correct result and ok accuracy
assert result == 1, 'amp + ddp model failed to complete'
trainer.hpc_save(save_dir, exp)
trainer.hpc_load(save_dir, on_gpu=True)
with pytest.raises(MisconfigurationException):
trainer.get_dataloaders(model)
clear_save_dir()
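The os.environ['MASTER_PORT'] line that recurs in the DDP tests above exists so that consecutive DDP test runs in the same session do not collide on the distributed rendezvous address. A minimal standalone sketch of the idea (the 12000-19000 band is simply the range these tests use):

    import os
    import numpy as np

    # pick a fresh random port for the torch.distributed master so that
    # back-to-back DDP tests don't fail with an "address already in use" error
    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])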
#
# def test_cpu_model():
# """
# Make sure model trains on CPU
# :return:
# """
#
# trainer_options = dict(
# progress_bar=False,
# experiment=get_exp(),
# max_nb_epochs=1,
# train_percent_check=0.4,
# val_percent_check=0.4
# )
#
# model, hparams = get_model()
#
# run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
#
#
# def test_all_features_cpu_model():
# """
# Test each of the trainer options
# :return:
# """
#
# trainer_options = dict(
# gradient_clip=1.0,
# overfit_pct=0.20,
# track_grad_norm=2,
# print_nan_grads=True,
# progress_bar=False,
# experiment=get_exp(),
# max_nb_epochs=1,
# train_percent_check=0.4,
# val_percent_check=0.4
# )
#
# model, hparams = get_model()
# run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
#
#
# def test_early_stopping_cpu_model():
# """
# Test each of the trainer options
# :return:
# """
#
# stopping = EarlyStopping()
# trainer_options = dict(
# early_stop_callback=stopping,
# gradient_clip=1.0,
# overfit_pct=0.20,
# track_grad_norm=2,
# print_nan_grads=True,
# progress_bar=False,
# experiment=get_exp(),
# max_nb_epochs=1,
# train_percent_check=0.4,
# val_percent_check=0.4
# )
#
# model, hparams = get_model()
# run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)
#
#
# def test_single_gpu_model():
# """
# Make sure single GPU works (DP mode)
# :return:
# """
# if not torch.cuda.is_available():
# warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
# return
# model, hparams = get_model()
#
# trainer_options = dict(
# progress_bar=False,
# max_nb_epochs=1,
# train_percent_check=0.1,
# val_percent_check=0.1,
# gpus=[0]
# )
#
# run_gpu_model_test(trainer_options, model, hparams)
#
#
#
#
# def test_multi_gpu_model_dp():
# """
# Make sure DP works
# :return:
# """
# if not torch.cuda.is_available():
# warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test')
# return
# if not torch.cuda.device_count() > 1:
# warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
# return
# model, hparams = get_model()
# trainer_options = dict(
# progress_bar=False,
# max_nb_epochs=1,
# train_percent_check=0.1,
# val_percent_check=0.1,
# gpus=[0, 1]
# )
#
# run_gpu_model_test(trainer_options, model, hparams)
#
# # test memory helper functions
# memory.get_gpu_memory_map()
#
#
# def test_amp_gpu_dp():
# """
# Make sure DP + AMP work
# :return:
# """
# if not torch.cuda.is_available():
# warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test')
# return
# if not torch.cuda.device_count() > 1:
# warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
# return
# model, hparams = get_model()
# trainer_options = dict(
# max_nb_epochs=1,
# gpus='0, 1', # test init with gpu string
# distributed_backend='dp',
# use_amp=True
# )
# with pytest.raises(MisconfigurationException):
# run_gpu_model_test(trainer_options, model, hparams)
#
#
# def test_multi_gpu_model_ddp():
# """
# Make sure DDP works
# :return:
# """
# if not torch.cuda.is_available():
# warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test')
# return
# if not torch.cuda.device_count() > 1:
# warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
# return
#
# os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
# model, hparams = get_model()
# trainer_options = dict(
# progress_bar=False,
# max_nb_epochs=1,
# train_percent_check=0.1,
# val_percent_check=0.1,
# gpus=[0, 1],
# distributed_backend='ddp'
# )
#
# run_gpu_model_test(trainer_options, model, hparams)
#
#
# def test_amp_gpu_ddp():
# """
# Make sure DDP + AMP work
# :return:
# """
# if not torch.cuda.is_available():
# warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
# return
# if not torch.cuda.device_count() > 1:
# warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
# return
#
# os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
#
# hparams = get_hparams()
# model = LightningTestModel(hparams)
#
# trainer_options = dict(
# progress_bar=True,
# max_nb_epochs=1,
# gpus=[0, 1],
# distributed_backend='ddp',
# use_amp=True
# )
#
# run_gpu_model_test(trainer_options, model, hparams)
#
#
# def test_ddp_sampler_error():
# """
# Make sure DDP + AMP work
# :return:
# """
# if not torch.cuda.is_available():
# warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
# return
# if not torch.cuda.device_count() > 1:
# warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
# return
#
# os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
#
# hparams = get_hparams()
# model = LightningTestModel(hparams, force_remove_distributed_sampler=True)
#
# exp = get_exp(True)
# exp.save()
#
# trainer = Trainer(
# experiment=exp,
# progress_bar=False,
# max_nb_epochs=1,
# gpus=[0, 1],
# distributed_backend='ddp',
# use_amp=True
# )
#
# with pytest.raises(MisconfigurationException):
# trainer.get_dataloaders(model)
#
# clear_save_dir()
# ------------------------------------------------------------------------
# UTILS
@@ -370,6 +275,10 @@ def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True):
# test model preds
run_prediction(model.test_dataloader, pretrained_model)
# test HPC loading / saving
trainer.hpc_save(save_dir, exp)
trainer.hpc_load(save_dir, on_gpu=True)
clear_save_dir()
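Together with the prediction check just above, the new tail of run_gpu_model_test exercises a full HPC round trip. A minimal sketch of that flow, using the hpc_save / hpc_load calls exactly as they appear in this file (re-running the prediction check after the reload is an illustrative extra step, not something the diff adds):

    # write weights + experiment meta into save_dir, then restore from it
    trainer.hpc_save(save_dir, exp)
    trainer.hpc_load(save_dir, on_gpu=True)

    # optionally re-check predictions after the reload, then clean up
    run_prediction(model.test_dataloader, pretrained_model)
    clear_save_dir()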