Fix tests failing on a single GPU (#11753)

Co-authored-by: akihiro@grid.ai <akihiro@grid.ai> Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com>
2022-04-10 13:23:16 +09:00 · 2022-04-10 13:23:16 +09:00 · 568710f2f2
parent c233731b7c
commit 568710f2f2
8 changed files with 41 additions and 16 deletions
--- a/tests/callbacks/test_quantization.py
+++ b/tests/callbacks/test_quantization.py
@ -80,7 +80,7 @@ def test_quantization(tmpdir, observe: str, fuse: bool, convert: bool):
    # todo: make it work also with strict loading
    qmodel2 = RegressionModel.load_from_checkpoint(model_path, strict=False)
    quant2_score = torch.mean(torch.tensor([mape(qmodel2(x), y) for x, y in dm.test_dataloader()]))
-    assert torch.allclose(org_score, quant2_score, atol=0.45)
+    assert torch.allclose(org_score, quant2_score, atol=0.47)

    # test without and with QAT callback
    trainer_args.update(max_epochs=curr_epoch + 1)
--- a/tests/helpers/runif.py
+++ b/tests/helpers/runif.py
@ -31,6 +31,7 @@ from pytorch_lightning.utilities import (
    _IPU_AVAILABLE,
    _OMEGACONF_AVAILABLE,
    _RICH_AVAILABLE,
+    _TORCH_GREATER_EQUAL_1_10,
    _TORCH_QUANTIZE_AVAILABLE,
    _TPU_AVAILABLE,
 )
@ -67,6 +68,7 @@ class RunIf:
        min_python: Optional[str] = None,
        quantization: bool = False,
        amp_apex: bool = False,
+        bf16_cuda: bool = False,
        tpu: bool = False,
        ipu: bool = False,
        hpu: bool = False,
@ -93,6 +95,7 @@ class RunIf:
            min_python: Require that Python is greater or equal than this version.
            quantization: Require that `torch.quantization` is available.
            amp_apex: Require that NVIDIA/apex is installed.
+            bf16_cuda: Require that CUDA device supports bf16.
            tpu: Require that TPU is available.
            ipu: Require that IPU is available.
            hpu: Require that HPU is available.
@ -141,6 +144,20 @@ class RunIf:
            conditions.append(not _APEX_AVAILABLE)
            reasons.append("NVIDIA Apex")

+        if bf16_cuda:
+            try:
+                cond = not (torch.cuda.is_available() and _TORCH_GREATER_EQUAL_1_10 and torch.cuda.is_bf16_supported())
+            except (AssertionError, RuntimeError) as e:
+                # Expected when CUDA is unusable: AssertionError "Torch not compiled with CUDA enabled" or
+                # RuntimeError "Found no NVIDIA driver on your system." -- re-raise only if it is neither.
+                is_unrelated = "Found no NVIDIA driver" not in str(e) and "Torch not compiled with CUDA" not in str(e)
+                if is_unrelated:
+                    raise e
+                cond = True  # CUDA cannot be queried at all, so treat bf16 as unsupported and skip
+
+            conditions.append(cond)
+            reasons.append("CUDA device bf16")
+
        if skip_windows:
            conditions.append(sys.platform == "win32")
            reasons.append("unimplemented on Windows")
--- a/tests/lite/test_parity.py
+++ b/tests/lite/test_parity.py
@ -111,7 +111,7 @@ def precision_context(precision, accelerator) -> Generator[None, None, None]:
        pytest.param(32, None, 1, "cpu"),
        pytest.param(32, None, 1, "gpu", marks=RunIf(min_gpus=1)),
        pytest.param(16, None, 1, "gpu", marks=RunIf(min_gpus=1)),
-        pytest.param("bf16", None, 1, "gpu", marks=RunIf(min_gpus=1, min_torch="1.10")),
+        pytest.param("bf16", None, 1, "gpu", marks=RunIf(min_gpus=1, min_torch="1.10", bf16_cuda=True)),
    ],
 )
 def test_boring_lite_model_single_device(precision, strategy, devices, accelerator, tmpdir):
--- a/tests/lite/test_wrappers.py
+++ b/tests/lite/test_wrappers.py
@ -45,9 +45,9 @@ def test_lite_module_wraps():
        (16, torch.float32, torch.float16),
        (16, torch.float64, torch.float16),
        (16, torch.long, torch.long),
-        pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10")),
-        pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10")),
-        pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10")),
+        pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
+        pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
+        pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
    ],
 )
 def test_lite_module_forward_conversion(precision, input_type, expected_type):
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@ -262,7 +262,7 @@ def test_horovod_gather(tmpdir):
    _run_horovod(trainer_options)


-@RunIf(min_gpus=1, horovod_nccl=True, skip_windows=True)
+@RunIf(min_gpus=1, skip_windows=True, horovod=True, horovod_nccl=True)
 def test_horovod_transfer_batch_to_gpu(tmpdir):
    class TestTrainingStepModel(BoringModel):
        def training_step(self, batch, *args, **kwargs):
--- a/tests/trainer/logging_/test_train_loop_logging.py
+++ b/tests/trainer/logging_/test_train_loop_logging.py
@ -395,7 +395,7 @@ class LoggingSyncDistModel(BoringModel):
        return super().validation_step(batch, batch_idx)


-@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True))])
+@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(min_gpus=2, skip_windows=True))])
 def test_logging_sync_dist_true(tmpdir, devices):
    """Tests to ensure that the sync_dist flag works (should just return the original value)"""
    fake_result = 1
--- a/tests/trainer/test_supporters.py
+++ b/tests/trainer/test_supporters.py
@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import math
 import os
 from typing import Sequence
 from unittest import mock
@ -36,6 +37,7 @@ from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForw
 from pytorch_lightning.utilities.data import get_len
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel, RandomDataset
+from tests.helpers.runif import RunIf


 def test_tensor_running_accum_reset():
@ -381,11 +383,12 @@ def test_combined_data_loader_validation_test(
    apply_to_collection(dataloader.loaders, DataLoader, _assert_dataset)


+@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_gpus=2))])
@pytest.mark.parametrize("replace_sampler_ddp", [False, True])
-def test_combined_data_loader_with_max_size_cycle_and_ddp(replace_sampler_ddp):
+def test_combined_data_loader_with_max_size_cycle_and_ddp(accelerator, replace_sampler_ddp):
    """This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader
    with ddp and `max_size_cycle` mode."""
-    trainer = Trainer(strategy="ddp", accelerator="auto", devices=2, replace_sampler_ddp=replace_sampler_ddp)
+    trainer = Trainer(strategy="ddp", accelerator=accelerator, devices=2, replace_sampler_ddp=replace_sampler_ddp)

    dataloader = CombinedLoader(
        {"a": DataLoader(RandomDataset(32, 8), batch_size=1), "b": DataLoader(RandomDataset(32, 8), batch_size=1)},
@ -452,19 +455,23 @@ def test_combined_dataloader_for_training_with_ddp(
    }
    if use_combined_loader:
        dataloader = CombinedLoader(dataloader, mode=mode)
-    expected_length_before_ddp = min(n1, n2) if is_min_size_mode else max(n1, n2)
-    expected_length_after_ddp = expected_length_before_ddp // 2 if replace_sampler_ddp else expected_length_before_ddp
    model = BoringModel()
    trainer = Trainer(
        strategy="ddp",
        accelerator="auto",
-        devices=2,
+        devices="auto",
        replace_sampler_ddp=replace_sampler_ddp,
        multiple_trainloader_mode="max_size_cycle" if use_combined_loader else mode,
    )
    trainer._data_connector.attach_data(
        model=model, train_dataloaders=dataloader, val_dataloaders=None, datamodule=None
    )
+    expected_length_before_ddp = min(n1, n2) if is_min_size_mode else max(n1, n2)
+    expected_length_after_ddp = (
+        math.ceil(expected_length_before_ddp / trainer.num_devices)
+        if replace_sampler_ddp
+        else expected_length_before_ddp
+    )
    trainer.reset_train_dataloader(model=model)
    assert trainer.train_dataloader is not None
    assert isinstance(trainer.train_dataloader, CombinedLoader)
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@ -987,7 +987,7 @@ def test_gradient_clipping_by_norm(tmpdir, precision):
            # test that gradient is clipped correctly
            parameters = self.parameters()
            grad_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), 2) for p in parameters]), 2)
-            torch.testing.assert_allclose(grad_norm, torch.tensor(0.05))
+            torch.testing.assert_allclose(grad_norm, torch.tensor(0.05, device=self.device))
            self.assertion_called = True

    model = TestModel()
@ -1018,7 +1018,7 @@ def test_gradient_clipping_by_value(tmpdir, precision):
            parameters = self.parameters()
            grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters]
            grad_max = torch.max(torch.stack(grad_max_list))
-            torch.testing.assert_allclose(grad_max.abs(), torch.tensor(1e-10))
+            torch.testing.assert_allclose(grad_max.abs(), torch.tensor(1e-10, device=self.device))
            self.assertion_called = True

    model = TestModel()
@ -1406,8 +1406,9 @@ def test_trainer_predict_1_gpu(tmpdir):


@RunIf(skip_windows=True)
-def test_trainer_predict_ddp_spawn(tmpdir):
-    predict(tmpdir, strategy="ddp_spawn", accelerator="auto", devices=2)
+@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_gpus=2))])
+def test_trainer_predict_ddp_spawn(tmpdir, accelerator):
+    predict(tmpdir, strategy="ddp_spawn", accelerator=accelerator, devices=2)


@pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset])