From abe795e2858b2d785a1ebce0ba66661f357330da Mon Sep 17 00:00:00 2001
From: Joost van Doorn
Date: Mon, 28 Mar 2022 18:00:45 +0200
Subject: [PATCH] Fix _module_available to detect horovod.torch properly (#12377)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli
Co-authored-by: Jirka Borovec
Co-authored-by: Jirka
---
 CHANGELOG.md                                   |  1 +
 .../connectors/accelerator_connector.py        |  2 +-
 pytorch_lightning/utilities/imports.py         |  8 +--
 .../data/horovod/train_default_model.py        |  5 +-
 tests/models/test_horovod.py                   |  7 ++-
 tests/utilities/test_imports.py                | 62 ++++++++++++++++++-
 6 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 263f381a4d..f57f792b97 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -923,6 +923,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/PyTorchLightning/pytorch-lightning/pull/12267))
 
+- Fixed check for horovod module ([#12377](https://github.com/PyTorchLightning/pytorch-lightning/pull/12377))
 
 - Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/PyTorchLightning/pytorch-lightning/pull/12454))
 
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index ba74a13c85..be02a80e3c 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -616,7 +616,7 @@ class AcceleratorConnector:
             hvd.init()
             if isinstance(self.accelerator, GPUAccelerator):
                 # Horovod assigns one local GPU per process
-                self._parallel_devices = list(range(hvd.local_size()))
+                self._parallel_devices = [torch.device(f"cuda:{i}") for i in range(hvd.local_size())]
             else:
                 self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
 
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index be3c0596e1..aadc419b77 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -53,13 +53,9 @@ def _module_available(module_path: str) -> bool:
     if not _package_available(module_names[0]):
         return False
     try:
-        module = importlib.import_module(module_names[0])
-    except ImportError:
+        importlib.import_module(module_path)
+    except ModuleNotFoundError:
         return False
-    for name in module_names[1:]:
-        if not hasattr(module, name):
-            return False
-        module = getattr(module, name)
     return True
 
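Note on the imports.py hunk above: a minimal, self-contained sketch of the failure mode it fixes, using the stdlib pair logging / logging.handlers purely as a stand-in for horovod / horovod.torch. A submodule is not an attribute of its parent package until something imports it, so the old hasattr() walk could report horovod.torch as unavailable even though importing it would succeed. Importing the full dotted path, as the fixed code does, sidesteps that; narrowing the except clause to ModuleNotFoundError also lets genuine import-time errors inside an installed module propagate instead of being mistaken for "not installed".

import importlib

# `logging.handlers` behaves like `horovod.torch`: the parent package's
# __init__ does not import the submodule, so it is not an attribute yet.
parent = importlib.import_module("logging")
print(hasattr(parent, "handlers"))  # False on a fresh interpreter

# What the fixed _module_available does: import the dotted path directly.
importlib.import_module("logging.handlers")
print(hasattr(parent, "handlers"))  # True: the import bound the attribute
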
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 4b8974a5ce..ab692d379a 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -42,9 +42,10 @@ from tests.helpers.utils import reset_seed, set_random_main_port  # noqa: E402
 parser = argparse.ArgumentParser()
 parser.add_argument("--trainer-options", required=True)
 parser.add_argument("--on-gpu", action="store_true", default=False)
+parser.add_argument("--check-size", action="store_true", default=False)
 
 
-def run_test_from_config(trainer_options, on_gpu, check_size=True):
+def run_test_from_config(trainer_options, on_gpu, check_size):
     """Trains the default model with the given config."""
     set_random_main_port()
     reset_seed()
@@ -107,4 +108,4 @@ def run_test_from_config(trainer_options, on_gpu, check_size):
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)
+    run_test_from_config(json.loads(args.trainer_options), args.on_gpu, args.check_size)
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 26f0d60385..f8d973d5cf 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -41,7 +41,6 @@ if _HOROVOD_AVAILABLE:
 
 
 @RunIf(min_gpus=1, horovod=True)
-@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
 def test_nccl_is_available_on_gpu_environment():
     from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
 
@@ -71,6 +70,8 @@ def _run_horovod(trainer_options):
     ]
     if trainer_options.get("accelerator", "cpu") == "gpu":
         cmdline += ["--on-gpu"]
+    if devices == 2:
+        cmdline += ["--check-size"]
     exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
 
@@ -93,7 +94,7 @@ def test_horovod_cpu(tmpdir):
 @RunIf(horovod=True, skip_windows=True)
 def test_horovod_cpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
@@ -154,7 +155,7 @@ def test_horovod_multi_gpu(tmpdir):
 @RunIf(min_gpus=2, horovod_nccl=True, skip_windows=True)
 def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py
index 75bcb51ffb..aa40f71da4 100644
--- a/tests/utilities/test_imports.py
+++ b/tests/utilities/test_imports.py
@@ -13,7 +13,16 @@
 # limitations under the License.
 import operator
 
-from pytorch_lightning.utilities import _module_available
+from pytorch_lightning.utilities import (
+    _APEX_AVAILABLE,
+    _BAGUA_AVAILABLE,
+    _DEEPSPEED_AVAILABLE,
+    _FAIRSCALE_AVAILABLE,
+    _HOROVOD_AVAILABLE,
+    _module_available,
+    _OMEGACONF_AVAILABLE,
+    _POPTORCH_AVAILABLE,
+)
 from pytorch_lightning.utilities.imports import _compare_version
 
 
@@ -45,3 +54,54 @@ def test_compare_version(monkeypatch):
     assert not _compare_version("torch", operator.ge, "1.10.0.rc0")
     assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True)
     assert not _compare_version("torch", operator.ge, "1.10.0")
+
+
+def test_imports():
+    try:
+        import apex  # noqa
+    except ModuleNotFoundError:
+        assert not _APEX_AVAILABLE
+    else:
+        assert _APEX_AVAILABLE
+
+    try:
+        import bagua  # noqa
+    except ModuleNotFoundError:
+        assert not _BAGUA_AVAILABLE
+    else:
+        assert _BAGUA_AVAILABLE
+
+    try:
+        import deepspeed  # noqa
+    except ModuleNotFoundError:
+        assert not _DEEPSPEED_AVAILABLE
+    else:
+        assert _DEEPSPEED_AVAILABLE
+
+    try:
+        import fairscale.nn  # noqa
+    except ModuleNotFoundError:
+        assert not _FAIRSCALE_AVAILABLE
+    else:
+        assert _FAIRSCALE_AVAILABLE
+
+    try:
+        import horovod.torch  # noqa
+    except ModuleNotFoundError:
+        assert not _HOROVOD_AVAILABLE
+    else:
+        assert _HOROVOD_AVAILABLE
+
+    try:
+        import omegaconf  # noqa
+    except ModuleNotFoundError:
+        assert not _OMEGACONF_AVAILABLE
+    else:
+        assert _OMEGACONF_AVAILABLE
+
+    try:
+        import poptorch  # noqa
+    except ModuleNotFoundError:
+        assert not _POPTORCH_AVAILABLE
+    else:
+        assert _POPTORCH_AVAILABLE
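
For reference, a small standalone sketch (not part of the patch) of the accelerator_connector change earlier in this patch: the Horovod GPU branch now builds torch.device entries rather than bare integer indices, matching the torch.device("cpu") list built in the else branch. local_size below is a hypothetical stand-in for hvd.local_size(), so neither Horovod nor a GPU is needed to run it.

import torch

local_size = 2  # hypothetical stand-in for hvd.local_size()

old_style = list(range(local_size))  # [0, 1] -- plain ints
new_style = [torch.device(f"cuda:{i}") for i in range(local_size)]

# Constructing torch.device objects does not require CUDA to be available.
assert all(isinstance(d, torch.device) for d in new_style)
print(old_style, new_style)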