diff --git a/setup.cfg b/setup.cfg
index 79ab35616e..9f908742c0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,7 +32,7 @@ filterwarnings =
     # TODO: remove in 1.7
     ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.decorators
     ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.memory
-
+xfail_strict = true
 junit_duration_report = call
 
 
diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py
index 8460a9339f..b81a1b1fdf 100644
--- a/tests/helpers/runif.py
+++ b/tests/helpers/runif.py
@@ -34,13 +34,18 @@ from pytorch_lightning.utilities import (
     _TPU_AVAILABLE,
 )
 
-try:
-    from horovod.common.util import nccl_built
+_HOROVOD_NCCL_AVAILABLE = False
+if _HOROVOD_AVAILABLE:
+    import horovod
 
-    nccl_built()
-    _HOROVOD_NCCL_AVAILABLE = True
-except (ImportError, ModuleNotFoundError, AttributeError):
-    _HOROVOD_NCCL_AVAILABLE = False
+    try:
+
+        # `nccl_built` returns an integer
+        _HOROVOD_NCCL_AVAILABLE = bool(horovod.torch.nccl_built())
+    except AttributeError:
+        # AttributeError can be raised if MPI is not available:
+        # https://github.com/horovod/horovod/blob/v0.23.0/horovod/torch/__init__.py#L33-L34
+        pass
 
 
 class RunIf:
@@ -152,8 +157,7 @@ class RunIf:
             reasons.append("Horovod")
 
         if horovod_nccl:
-            # FIXME(@jirka): nccl is not available in ci
-            conditions.append(True)  # not _HOROVOD_NCCL_AVAILABLE
+            conditions.append(not _HOROVOD_NCCL_AVAILABLE)
             reasons.append("Horovod with NCCL")
 
         if standalone:
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index c4d364ad1f..6f87b66b0c 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -39,6 +39,16 @@ if _HOROVOD_AVAILABLE:
     import horovod
     import horovod.torch as hvd
 
+
+@RunIf(min_gpus=1, horovod=True)
+@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
+def test_nccl_is_available_on_gpu_environment():
+    from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
+
+    # the GPU environment should always install Horovod NCCL
+    assert _HOROVOD_NCCL_AVAILABLE
+
+
 # This script will run the actual test model training in parallel
 TEST_SCRIPT = os.path.join(os.path.dirname(__file__), "data", "horovod", "train_default_model.py")
 