From abe795e2858b2d785a1ebce0ba66661f357330da Mon Sep 17 00:00:00 2001
From: Joost van Doorn
Date: Mon, 28 Mar 2022 18:00:45 +0200
Subject: [PATCH] Fix _module_available to detect horovod.torch properly (#12377)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli
Co-authored-by: Jirka Borovec
Co-authored-by: Jirka
---
 CHANGELOG.md                                   |  1 +
 .../connectors/accelerator_connector.py        |  2 +-
 pytorch_lightning/utilities/imports.py         |  8 +--
 .../data/horovod/train_default_model.py        |  5 +-
 tests/models/test_horovod.py                   |  7 ++-
 tests/utilities/test_imports.py                | 62 ++++++++++++++++++-
 6 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 263f381a4d..f57f792b97 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -923,6 +923,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/PyTorchLightning/pytorch-lightning/pull/12267))
 
+- Fixed check for horovod module ([#12377](https://github.com/PyTorchLightning/pytorch-lightning/pull/12377))
 
 - Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/PyTorchLightning/pytorch-lightning/pull/12454))
 
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index ba74a13c85..be02a80e3c 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -616,7 +616,7 @@ class AcceleratorConnector:
             hvd.init()
             if isinstance(self.accelerator, GPUAccelerator):
                 # Horovod assigns one local GPU per process
-                self._parallel_devices = list(range(hvd.local_size()))
+                self._parallel_devices = [torch.device(f"cuda:{i}") for i in range(hvd.local_size())]
             else:
                 self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
 
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index be3c0596e1..aadc419b77 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -53,13 +53,9 @@ def _module_available(module_path: str) -> bool:
     if not _package_available(module_names[0]):
         return False
     try:
-        module = importlib.import_module(module_names[0])
-    except ImportError:
+        importlib.import_module(module_path)
+    except ModuleNotFoundError:
         return False
-    for name in module_names[1:]:
-        if not hasattr(module, name):
-            return False
-        module = getattr(module, name)
     return True
 
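Note on the imports.py hunk above: a minimal, self-contained sketch of the failure mode it fixes, using the stdlib pair logging / logging.handlers purely as a stand-in for horovod / horovod.torch. A submodule is not an attribute of its parent package until something imports it, so the old hasattr() walk could report horovod.torch as unavailable even though importing it would succeed. Importing the full dotted path, as the fixed code does, sidesteps that; narrowing the except clause to ModuleNotFoundError also lets genuine import-time errors inside an installed module propagate instead of being mistaken for "not installed".

import importlib

# `logging.handlers` behaves like `horovod.torch`: the parent package's
# __init__ does not import the submodule, so it is not an attribute yet.
parent = importlib.import_module("logging")
print(hasattr(parent, "handlers"))  # False on a fresh interpreter

# What the fixed _module_available does: import the dotted path directly.
importlib.import_module("logging.handlers")
print(hasattr(parent, "handlers"))  # True: the import bound the attribute
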
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 4b8974a5ce..ab692d379a 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -42,9 +42,10 @@ from tests.helpers.utils import reset_seed, set_random_main_port  # noqa: E402
 parser = argparse.ArgumentParser()
 parser.add_argument("--trainer-options", required=True)
 parser.add_argument("--on-gpu", action="store_true", default=False)
+parser.add_argument("--check-size", action="store_true", default=False)
 
 
-def run_test_from_config(trainer_options, on_gpu, check_size=True):
+def run_test_from_config(trainer_options, on_gpu, check_size):
     """Trains the default model with the given config."""
     set_random_main_port()
     reset_seed()
@@ -107,4 +108,4 @@ def run_test_from_config(trainer_options, on_gpu, check_size):
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)
+    run_test_from_config(json.loads(args.trainer_options), args.on_gpu, args.check_size)
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 26f0d60385..f8d973d5cf 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -41,7 +41,6 @@ if _HOROVOD_AVAILABLE:
 
 
 @RunIf(min_gpus=1, horovod=True)
-@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
 def test_nccl_is_available_on_gpu_environment():
     from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
 
@@ -71,6 +70,8 @@ def _run_horovod(trainer_options):
     ]
     if trainer_options.get("accelerator", "cpu") == "gpu":
         cmdline += ["--on-gpu"]
+    if devices == 2:
+        cmdline += ["--check-size"]
     exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
 
@@ -93,7 +94,7 @@ def test_horovod_cpu(tmpdir):
 @RunIf(horovod=True, skip_windows=True)
 def test_horovod_cpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
@@ -154,7 +155,7 @@ def test_horovod_multi_gpu(tmpdir):
 @RunIf(min_gpus=2, horovod_nccl=True, skip_windows=True)
 def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py
index 75bcb51ffb..aa40f71da4 100644
--- a/tests/utilities/test_imports.py
+++ b/tests/utilities/test_imports.py
@@ -13,7 +13,16 @@
 # limitations under the License.
 import operator
 
-from pytorch_lightning.utilities import _module_available
+from pytorch_lightning.utilities import (
+    _APEX_AVAILABLE,
+    _BAGUA_AVAILABLE,
+    _DEEPSPEED_AVAILABLE,
+    _FAIRSCALE_AVAILABLE,
+    _HOROVOD_AVAILABLE,
+    _module_available,
+    _OMEGACONF_AVAILABLE,
+    _POPTORCH_AVAILABLE,
+)
 from pytorch_lightning.utilities.imports import _compare_version
 
 
@@ -45,3 +54,54 @@ def test_compare_version(monkeypatch):
     assert not _compare_version("torch", operator.ge, "1.10.0.rc0")
     assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True)
     assert not _compare_version("torch", operator.ge, "1.10.0")
+
+
+def test_imports():
+    try:
+        import apex  # noqa
+    except ModuleNotFoundError:
+        assert not _APEX_AVAILABLE
+    else:
+        assert _APEX_AVAILABLE
+
+    try:
+        import bagua  # noqa
+    except ModuleNotFoundError:
+        assert not _BAGUA_AVAILABLE
+    else:
+        assert _BAGUA_AVAILABLE
+
+    try:
+        import deepspeed  # noqa
+    except ModuleNotFoundError:
+        assert not _DEEPSPEED_AVAILABLE
+    else:
+        assert _DEEPSPEED_AVAILABLE
+
+    try:
+        import fairscale.nn  # noqa
+    except ModuleNotFoundError:
+        assert not _FAIRSCALE_AVAILABLE
+    else:
+        assert _FAIRSCALE_AVAILABLE
+
+    try:
+        import horovod.torch  # noqa
+    except ModuleNotFoundError:
+        assert not _HOROVOD_AVAILABLE
+    else:
+        assert _HOROVOD_AVAILABLE
+
+    try:
+        import omegaconf  # noqa
+    except ModuleNotFoundError:
+        assert not _OMEGACONF_AVAILABLE
+    else:
+        assert _OMEGACONF_AVAILABLE
+
+    try:
+        import poptorch  # noqa
+    except ModuleNotFoundError:
+        assert not _POPTORCH_AVAILABLE
+    else:
+        assert _POPTORCH_AVAILABLE
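
For reference, a small standalone sketch (not part of the patch) of the accelerator_connector change earlier in this patch: the Horovod GPU branch now builds torch.device entries rather than bare integer indices, matching the torch.device("cpu") list built in the else branch. local_size below is a hypothetical stand-in for hvd.local_size(), so neither Horovod nor a GPU is needed to run it.

import torch

local_size = 2  # hypothetical stand-in for hvd.local_size()

old_style = list(range(local_size))  # [0, 1] -- plain ints
new_style = [torch.device(f"cuda:{i}") for i in range(local_size)]

# Constructing torch.device objects does not require CUDA to be available.
assert all(isinstance(d, torch.device) for d in new_style)
print(old_style, new_style)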