Refactor Horovod NCCL check (#11948)
This commit is contained in:
parent
54b9a85227
commit
5f920dc088
|
@ -32,7 +32,7 @@ filterwarnings =
|
|||
# TODO: remove in 1.7
|
||||
ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.decorators
|
||||
ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.memory
|
||||
|
||||
xfail_strict = true
|
||||
junit_duration_report = call
|
||||
|
||||
|
||||
|
|
|
@ -34,13 +34,18 @@ from pytorch_lightning.utilities import (
|
|||
_TPU_AVAILABLE,
|
||||
)
|
||||
|
||||
try:
|
||||
from horovod.common.util import nccl_built
|
||||
_HOROVOD_NCCL_AVAILABLE = False
|
||||
if _HOROVOD_AVAILABLE:
|
||||
import horovod
|
||||
|
||||
nccl_built()
|
||||
_HOROVOD_NCCL_AVAILABLE = True
|
||||
except (ImportError, ModuleNotFoundError, AttributeError):
|
||||
_HOROVOD_NCCL_AVAILABLE = False
|
||||
try:
|
||||
|
||||
# `nccl_built` returns an integer, so convert it to a bool
|
||||
_HOROVOD_NCCL_AVAILABLE = bool(horovod.torch.nccl_built())
|
||||
except AttributeError:
|
||||
# AttributeError can be raised if MPI is not available:
|
||||
# https://github.com/horovod/horovod/blob/v0.23.0/horovod/torch/__init__.py#L33-L34
|
||||
pass
|
||||
|
||||
|
||||
class RunIf:
|
||||
|
@ -152,8 +157,7 @@ class RunIf:
|
|||
reasons.append("Horovod")
|
||||
|
||||
if horovod_nccl:
|
||||
# FIXME(@jirka): NCCL is not available in CI
|
||||
conditions.append(True) # not _HOROVOD_NCCL_AVAILABLE
|
||||
conditions.append(not _HOROVOD_NCCL_AVAILABLE)
|
||||
reasons.append("Horovod with NCCL")
|
||||
|
||||
if standalone:
|
||||
|
|
|
@ -39,6 +39,16 @@ if _HOROVOD_AVAILABLE:
|
|||
import horovod
|
||||
import horovod.torch as hvd
|
||||
|
||||
|
||||
@RunIf(min_gpus=1, horovod=True)
|
||||
@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
|
||||
def test_nccl_is_available_on_gpu_environment():
|
||||
from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
|
||||
|
||||
# the GPU environment should always have Horovod built with NCCL support
|
||||
assert _HOROVOD_NCCL_AVAILABLE
|
||||
|
||||
|
||||
# This script will run the actual test model training in parallel
|
||||
TEST_SCRIPT = os.path.join(os.path.dirname(__file__), "data", "horovod", "train_default_model.py")
|
||||
|
||||
|
|
Loading…
Reference in New Issue