Refactor Horovod NCCL check (#11948)

This commit is contained in:
Carlos Mocholí 2022-02-28 11:45:32 +01:00 committed by GitHub
parent 54b9a85227
commit 5f920dc088
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 9 deletions

View File

@ -32,7 +32,7 @@ filterwarnings =
# TODO: remove in 1.7
ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.decorators
ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.memory
xfail_strict = true
junit_duration_report = call

View File

@ -34,13 +34,18 @@ from pytorch_lightning.utilities import (
_TPU_AVAILABLE,
)
try:
from horovod.common.util import nccl_built
_HOROVOD_NCCL_AVAILABLE = False
if _HOROVOD_AVAILABLE:
import horovod
nccl_built()
_HOROVOD_NCCL_AVAILABLE = True
except (ImportError, ModuleNotFoundError, AttributeError):
_HOROVOD_NCCL_AVAILABLE = False
try:
# `nccl_built` returns an integer
_HOROVOD_NCCL_AVAILABLE = bool(horovod.torch.nccl_built())
except AttributeError:
# AttributeError can be raised if MPI is not available:
# https://github.com/horovod/horovod/blob/v0.23.0/horovod/torch/__init__.py#L33-L34
pass
class RunIf:
@ -152,8 +157,7 @@ class RunIf:
reasons.append("Horovod")
if horovod_nccl:
# FIXME(@jirka): nccl is not available in ci
conditions.append(True) # not _HOROVOD_NCCL_AVAILABLE
conditions.append(not _HOROVOD_NCCL_AVAILABLE)
reasons.append("Horovod with NCCL")
if standalone:

View File

@ -39,6 +39,16 @@ if _HOROVOD_AVAILABLE:
import horovod
import horovod.torch as hvd
@RunIf(min_gpus=1, horovod=True)
@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
def test_nccl_is_available_on_gpu_environment():
    """Assert that the ``_HOROVOD_NCCL_AVAILABLE`` flag of the test helpers is truthy.

    The test is gated by ``RunIf(min_gpus=1, horovod=True)`` and is currently
    marked as an expected failure; see the ``xfail`` reason for the open issue.
    """
    # Imported inside the test so the flag is looked up when the test runs.
    from tests.helpers import runif as runif_helpers

    # the GPU environment should always install Horovod NCCL
    assert runif_helpers._HOROVOD_NCCL_AVAILABLE
# This script will run the actual test model training in parallel
TEST_SCRIPT = os.path.join(os.path.dirname(__file__), "data", "horovod", "train_default_model.py")