Fix _module_available to detect horovod.torch properly (#12377)
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Jirka <jirka.borovec@seznam.cz>
parent 31be799a95
commit abe795e285
@@ -923,6 +923,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/PyTorchLightning/pytorch-lightning/pull/12267))
+- Fixed check for horovod module ([#12377](https://github.com/PyTorchLightning/pytorch-lightning/pull/12377))
 - Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/PyTorchLightning/pytorch-lightning/pull/12454))
@@ -616,7 +616,7 @@ class AcceleratorConnector:
         hvd.init()
         if isinstance(self.accelerator, GPUAccelerator):
             # Horovod assigns one local GPU per process
-            self._parallel_devices = list(range(hvd.local_size()))
+            self._parallel_devices = [torch.device(f"cuda:{i}") for i in range(hvd.local_size())]
         else:
             self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
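For context, the connector change above swaps plain GPU indices for `torch.device` objects. A minimal sketch of the difference, assuming two local Horovod processes (the `local_size` value here is a hypothetical stand-in for `hvd.local_size()`):

import torch

local_size = 2  # hypothetical stand-in for hvd.local_size()

# old behaviour: bare integer indices
old_parallel_devices = list(range(local_size))  # [0, 1]

# new behaviour: explicit CUDA device objects, one per local Horovod process
new_parallel_devices = [torch.device(f"cuda:{i}") for i in range(local_size)]
# [device(type='cuda', index=0), device(type='cuda', index=1)]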
@@ -53,13 +53,9 @@ def _module_available(module_path: str) -> bool:
     if not _package_available(module_names[0]):
         return False
     try:
-        module = importlib.import_module(module_names[0])
-    except ImportError:
+        importlib.import_module(module_path)
+    except ModuleNotFoundError:
         return False
-    for name in module_names[1:]:
-        if not hasattr(module, name):
-            return False
-        module = getattr(module, name)
     return True
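The hunk above is the core of the fix: the old helper imported only the top-level package and then walked attributes, so a dotted path like `horovod.torch` was reported missing whenever the submodule had not yet been imported as an attribute of `horovod`. A minimal sketch of the two behaviours, reconstructed from the diff above (function names are illustrative, and the `_package_available` pre-check is omitted):

import importlib


def _module_available_old(module_path: str) -> bool:
    # Old logic: import only the root package, then rely on hasattr() for submodules.
    # "horovod.torch" fails here because `import horovod` does not attach a `torch`
    # attribute until the submodule itself has been imported.
    module_names = module_path.split(".")
    try:
        module = importlib.import_module(module_names[0])
    except ImportError:
        return False
    for name in module_names[1:]:
        if not hasattr(module, name):
            return False
        module = getattr(module, name)
    return True


def _module_available_new(module_path: str) -> bool:
    # New logic: import the full dotted path directly, so submodules are resolved
    # by the import system instead of attribute lookup.
    try:
        importlib.import_module(module_path)
    except ModuleNotFoundError:
        return False
    return True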
@@ -42,9 +42,10 @@ from tests.helpers.utils import reset_seed, set_random_main_port  # noqa: E402
 parser = argparse.ArgumentParser()
 parser.add_argument("--trainer-options", required=True)
 parser.add_argument("--on-gpu", action="store_true", default=False)
+parser.add_argument("--check-size", action="store_true", default=False)


-def run_test_from_config(trainer_options, on_gpu, check_size=True):
+def run_test_from_config(trainer_options, on_gpu, check_size):
     """Trains the default model with the given config."""
     set_random_main_port()
     reset_seed()
@@ -107,4 +108,4 @@ def run_test_from_config(trainer_options, on_gpu, check_size=True):

 if __name__ == "__main__":
     args = parser.parse_args()
-    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)
+    run_test_from_config(json.loads(args.trainer_options), args.on_gpu, args.check_size)
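Taken together, these two hunks make the size check an explicit opt-in: the helper script gains a `--check-size` store_true flag, and the call site forwards it instead of relying on a default argument. A small standalone sketch of how such a flag parses (argument name copied from the diff, the sample argv values are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--check-size", action="store_true", default=False)

# The flag is False unless passed explicitly; argparse maps "--check-size" to args.check_size.
assert parser.parse_args([]).check_size is False
assert parser.parse_args(["--check-size"]).check_size is True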
@@ -41,7 +41,6 @@ if _HOROVOD_AVAILABLE:


 @RunIf(min_gpus=1, horovod=True)
-@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
 def test_nccl_is_available_on_gpu_environment():
     from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
@@ -71,6 +70,8 @@ def _run_horovod(trainer_options):
     ]
     if trainer_options.get("accelerator", "cpu") == "gpu":
         cmdline += ["--on-gpu"]
+    if devices == 2:
+        cmdline += ["--check-size"]
     exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
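As a quick illustration of how the `_run_horovod` helper now assembles its subprocess call, here is a stripped-down sketch; the base command list is hypothetical, and only the two conditional flags come from the diff above:

devices = 2
trainer_options = {"accelerator": "gpu"}

# Hypothetical base command; the real helper builds a horovodrun invocation of the
# default-model training script with serialized trainer options.
cmdline = ["horovodrun", "-np", str(devices), "python", "train_default_model.py"]

if trainer_options.get("accelerator", "cpu") == "gpu":
    cmdline += ["--on-gpu"]
if devices == 2:
    cmdline += ["--check-size"]

print(" ".join(cmdline))
# horovodrun -np 2 python train_default_model.py --on-gpu --check-size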
@@ -93,7 +94,7 @@ def test_horovod_cpu(tmpdir):
 @RunIf(horovod=True, skip_windows=True)
 def test_horovod_cpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
@@ -154,7 +155,7 @@ def test_horovod_multi_gpu(tmpdir):
 @RunIf(min_gpus=2, horovod_nccl=True, skip_windows=True)
 def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
@@ -13,7 +13,16 @@
 # limitations under the License.
 import operator

-from pytorch_lightning.utilities import _module_available
+from pytorch_lightning.utilities import (
+    _APEX_AVAILABLE,
+    _BAGUA_AVAILABLE,
+    _DEEPSPEED_AVAILABLE,
+    _FAIRSCALE_AVAILABLE,
+    _HOROVOD_AVAILABLE,
+    _module_available,
+    _OMEGACONF_AVAILABLE,
+    _POPTORCH_AVAILABLE,
+)
 from pytorch_lightning.utilities.imports import _compare_version
@@ -45,3 +54,54 @@ def test_compare_version(monkeypatch):
     assert not _compare_version("torch", operator.ge, "1.10.0.rc0")
     assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True)
     assert not _compare_version("torch", operator.ge, "1.10.0")
+
+
+def test_imports():
+    try:
+        import apex  # noqa
+    except ModuleNotFoundError:
+        assert not _APEX_AVAILABLE
+    else:
+        assert _APEX_AVAILABLE
+
+    try:
+        import bagua  # noqa
+    except ModuleNotFoundError:
+        assert not _BAGUA_AVAILABLE
+    else:
+        assert _BAGUA_AVAILABLE
+
+    try:
+        import deepspeed  # noqa
+    except ModuleNotFoundError:
+        assert not _DEEPSPEED_AVAILABLE
+    else:
+        assert _DEEPSPEED_AVAILABLE
+
+    try:
+        import fairscale.nn  # noqa
+    except ModuleNotFoundError:
+        assert not _FAIRSCALE_AVAILABLE
+    else:
+        assert _FAIRSCALE_AVAILABLE
+
+    try:
+        import horovod.torch  # noqa
+    except ModuleNotFoundError:
+        assert not _HOROVOD_AVAILABLE
+    else:
+        assert _HOROVOD_AVAILABLE
+
+    try:
+        import omegaconf  # noqa
+    except ModuleNotFoundError:
+        assert not _OMEGACONF_AVAILABLE
+    else:
+        assert _OMEGACONF_AVAILABLE
+
+    try:
+        import poptorch  # noqa
+    except ModuleNotFoundError:
+        assert not _POPTORCH_AVAILABLE
+    else:
+        assert _POPTORCH_AVAILABLE