From 5b645d713e9ef28b615dd3b05be1316550b4a036 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sun, 11 Oct 2020 10:21:53 -0400
Subject: [PATCH] Covv1 (#4072)

* temporarily drop metrics tests while speeding them up

* cov

* cov

* docs
---
 setup.cfg                               |  6 ++
 tests/backends/test_dp.py               | 85 +++++++++++++++++++++++++
 tests/models/test_gpu.py                | 41 ------------
 tests/trainer/test_trainer_test_loop.py | 32 ----------
 4 files changed, 91 insertions(+), 73 deletions(-)
 create mode 100644 tests/backends/test_dp.py

diff --git a/setup.cfg b/setup.cfg
index 93acd1e0c0..b47f349256 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,12 +38,18 @@ exclude_lines =
     rank_zero_warn
 # TODO: figure out how to get codecov to pick up the test results on these backends
 # The actual coverage for each is 90%+
+# *metrics (94%+) are temporarily removed from testing while tests speed up
 omit =
     pytorch_lightning/accelerators/ddp_*.py
     pytorch_lightning/accelerators/ddp2_*.py
     pytorch_lightning/accelerators/dp_*.py
     pytorch_lightning/accelerators/tpu_*.py
     pytorch_lightning/cluster_environments/*.py
+    pytorch_lightning/overrides/data_parallel.py
+    pytorch_lightning/metrics
+    pytorch_lightning/utilities/xla_device_utils.py
+    pytorch_lightning/utilities/distributed.py
+    pytorch_lightning/tuner/auto_gpu_select.py
 
 [flake8]
 # TODO: this should be 88 or 100 according PEP8
diff --git a/tests/backends/test_dp.py b/tests/backends/test_dp.py
new file mode 100644
index 0000000000..a5dae873fb
--- /dev/null
+++ b/tests/backends/test_dp.py
@@ -0,0 +1,85 @@
+import pytest
+import torch
+
+import tests.base.develop_pipelines as tpipes
+import tests.base.develop_utils as tutils
+from pytorch_lightning.callbacks import EarlyStopping
+from pytorch_lightning.core import memory
+from tests.base import EvalModelTemplate
+import pytorch_lightning as pl
+
+
+PRETEND_N_OF_GPUS = 16
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_multi_gpu_early_stop_dp(tmpdir):
+    """Make sure DP works with early stopping."""
+    tutils.set_random_master_port()
+
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        callbacks=[EarlyStopping()],
+        max_epochs=50,
+        limit_train_batches=10,
+        limit_val_batches=10,
+        gpus=[0, 1],
+        distributed_backend='dp',
+    )
+
+    model = EvalModelTemplate()
+    tpipes.run_model_test(trainer_options, model)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_multi_gpu_model_dp(tmpdir):
+    tutils.set_random_master_port()
+
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_train_batches=10,
+        limit_val_batches=10,
+        gpus=[0, 1],
+        distributed_backend='dp',
+        progress_bar_refresh_rate=0
+    )
+
+    model = EvalModelTemplate()
+
+    tpipes.run_model_test(trainer_options, model)
+
+    # test memory helper functions
+    memory.get_memory_profile('min_max')
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_dp_test(tmpdir):
+    tutils.set_random_master_port()
+
+    import os
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
+
+    model = EvalModelTemplate()
+    trainer = pl.Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=2,
+        limit_train_batches=10,
+        limit_val_batches=10,
+        gpus=[0, 1],
+        distributed_backend='dp',
+    )
+    trainer.fit(model)
+    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
+    results = trainer.test()
+    assert 'test_acc' in results[0]
+
+    old_weights = model.c_d1.weight.clone().detach().cpu()
+
+    results = trainer.test(model)
+    assert 'test_acc' in results[0]
+
+    # make sure weights didn't change
+    new_weights = model.c_d1.weight.clone().detach().cpu()
+
+    assert torch.all(torch.eq(old_weights, new_weights))
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index b91bc1c892..f96b2ece07 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -19,25 +19,6 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator
 PRETEND_N_OF_GPUS = 16
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_early_stop_dp(tmpdir):
-    """Make sure DDP works. with early stopping"""
-    tutils.set_random_master_port()
-
-    trainer_options = dict(
-        default_root_dir=tmpdir,
-        callbacks=[EarlyStopping()],
-        max_epochs=50,
-        limit_train_batches=10,
-        limit_val_batches=10,
-        gpus=[0, 1],
-        distributed_backend='dp',
-    )
-
-    model = EvalModelTemplate()
-    tpipes.run_model_test(trainer_options, model)
-
-
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_none_backend(tmpdir):
     """Make sure when using multiple GPUs the user can't use `distributed_backend = None`."""
@@ -56,28 +37,6 @@ def test_multi_gpu_none_backend(tmpdir):
     tpipes.run_model_test(trainer_options, model)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_dp(tmpdir):
-    tutils.set_random_master_port()
-
-    trainer_options = dict(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_train_batches=10,
-        limit_val_batches=10,
-        gpus=[0, 1],
-        distributed_backend='dp',
-        progress_bar_refresh_rate=0
-    )
-
-    model = EvalModelTemplate()
-
-    tpipes.run_model_test(trainer_options, model)
-
-    # test memory helper functions
-    memory.get_memory_profile('min_max')
-
-
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 @pytest.mark.parametrize('gpus', [1, [0], [1]])
 def test_single_gpu_model(tmpdir, gpus):
diff --git a/tests/trainer/test_trainer_test_loop.py b/tests/trainer/test_trainer_test_loop.py
index e8151c2c5b..79fdab8db1 100644
--- a/tests/trainer/test_trainer_test_loop.py
+++ b/tests/trainer/test_trainer_test_loop.py
@@ -34,38 +34,6 @@ def test_single_gpu_test(tmpdir):
     assert torch.all(torch.eq(old_weights, new_weights))
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_dp_test(tmpdir):
-    tutils.set_random_master_port()
-
-    import os
-    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
-
-    model = EvalModelTemplate()
-    trainer = pl.Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=2,
-        limit_train_batches=10,
-        limit_val_batches=10,
-        gpus=[0, 1],
-        distributed_backend='dp',
-    )
-    trainer.fit(model)
-    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
-    results = trainer.test()
-    assert 'test_acc' in results[0]
-
-    old_weights = model.c_d1.weight.clone().detach().cpu()
-
-    results = trainer.test(model)
-    assert 'test_acc' in results[0]
-
-    # make sure weights didn't change
-    new_weights = model.c_d1.weight.clone().detach().cpu()
-
-    assert torch.all(torch.eq(old_weights, new_weights))
-
-
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_ddp_spawn_test(tmpdir):
     tutils.set_random_master_port()