# Copyright The PyTorch Lightning team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import os import shlex import subprocess import sys from unittest.mock import patch import numpy as np import pytest import torch from torch import optim from torchmetrics.classification.accuracy import Accuracy import tests_pytorch.helpers.pipelines as tpipes from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.advanced_models import BasicGAN from tests_pytorch.helpers.runif import RunIf if _HOROVOD_AVAILABLE: import horovod import horovod.torch as hvd @RunIf(min_cuda_gpus=1, horovod=True) def test_nccl_is_available_on_gpu_environment(): from tests_pytorch.helpers.runif import _HOROVOD_NCCL_AVAILABLE # the GPU environment should always install Horovod NCCL assert _HOROVOD_NCCL_AVAILABLE # This script will run the actual test model training in parallel TEST_SCRIPT = os.path.join(os.path.dirname(__file__), "data", "horovod", "train_default_model.py") def _run_horovod(trainer_options): """Execute the training script across multiple workers in parallel.""" devices = trainer_options.get("devices", 1) # TODO: Find out why coverage breaks CI. # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append, cmdline = [ "horovodrun", "-np", str(devices), sys.executable, TEST_SCRIPT, "--trainer-options", shlex.quote(json.dumps(trainer_options)), ] if trainer_options.get("accelerator", "cpu") == "gpu": cmdline += ["--on-gpu"] if devices == 2: cmdline += ["--check-size"] exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy()) assert exit_code == 0 @RunIf(horovod=True, skip_windows=True) def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, strategy="horovod", ) _run_horovod(trainer_options) @RunIf(horovod=True, skip_windows=True) def test_horovod_cpu_accumulate_grad_batches(tmpdir): trainer_options = dict( default_root_dir=str(tmpdir), enable_progress_bar=False, max_epochs=1, limit_train_batches=4, limit_val_batches=0, accumulate_grad_batches=2, strategy="horovod", ) _run_horovod(trainer_options) @RunIf(horovod=True, skip_windows=True) def test_horovod_cpu_clip_grad_by_value(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, gradient_clip_algorithm="value", enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, strategy="horovod", ) _run_horovod(trainer_options) @RunIf(horovod=True, skip_windows=True) def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, ) _run_horovod(trainer_options) @RunIf(min_cuda_gpus=2, horovod_nccl=True, skip_windows=True) def test_horovod_multi_gpu(tmpdir): """Test Horovod with multi-GPU support.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, accelerator="gpu", devices=2, strategy="horovod", ) _run_horovod(trainer_options) @RunIf(min_cuda_gpus=2, horovod_nccl=True, skip_windows=True) def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir): trainer_options = dict( default_root_dir=str(tmpdir), enable_progress_bar=False, max_epochs=1, limit_train_batches=4, limit_val_batches=0, accumulate_grad_batches=2, accelerator="gpu", devices=2, strategy="horovod", ) _run_horovod(trainer_options) @RunIf(horovod=True, skip_windows=True, min_cuda_gpus=2) def test_horovod_raises_unsupported_accumulate_grad_batches(tmpdir): """Ensure MisConfigurationException for different `accumulate_grad_batches` at different epochs for Horovod Strategy on multi-gpus.""" model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, enable_progress_bar=False, accumulate_grad_batches={0: 4, 2: 2}, accelerator="auto", devices=2, strategy="horovod", ) with pytest.raises(MisconfigurationException, match="Horovod.*does not support.*accumulate_grad_batches"): trainer.fit(model) @RunIf(min_cuda_gpus=2, horovod_nccl=True, skip_windows=True) def test_horovod_multi_gpu_grad_by_value(tmpdir): """Test Horovod with multi-GPU support.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, gradient_clip_algorithm="value", enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, accelerator="gpu", devices=2, strategy="horovod", ) _run_horovod(trainer_options) # todo: need to be fixed :] # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994 # Check with (tgaddair) on Horovod issues if this feature is needed @pytest.mark.skip(reason="TODO: Horovod currently doesn't work with Apex") @RunIf(min_cuda_gpus=2, amp_apex=True, horovod_nccl=True, skip_windows=True) def test_horovod_apex(tmpdir): """Test Horovod with multi-GPU support using apex amp.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, accelerator="gpu", devices=2, strategy="horovod", amp_backend="apex", precision=16, ) _run_horovod(trainer_options) @RunIf(min_cuda_gpus=2, horovod_nccl=True, skip_windows=True) def test_horovod_amp(tmpdir): """Test Horovod with multi-GPU support using native amp.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, accelerator="gpu", devices=2, strategy="horovod", amp_backend="native", precision=16, ) _run_horovod(trainer_options) @RunIf(min_cuda_gpus=2, horovod_nccl=True, skip_windows=True) def test_horovod_gather(tmpdir): """Test Horovod with multi-GPU support using native amp.""" trainer_options = dict( default_root_dir=str(tmpdir), gradient_clip_val=1.0, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, accelerator="gpu", devices=2, strategy="horovod", ) _run_horovod(trainer_options) @RunIf(min_cuda_gpus=2, skip_windows=True, horovod=True, horovod_nccl=True) def test_horovod_transfer_batch_to_gpu(tmpdir): class TestTrainingStepModel(BoringModel): def training_step(self, batch, *args, **kwargs): assert str(batch.device) != "cpu" return super().training_step(batch, *args, **kwargs) def validation_step(self, batch, *args, **kwargs): assert str(batch.device) != "cpu" return super().validation_step(batch, *args, **kwargs) model = TestTrainingStepModel() trainer_options = dict( default_root_dir=str(tmpdir), enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, accelerator="gpu", devices=2, strategy="horovod", ) tpipes.run_model_test_without_loggers(trainer_options, model) @RunIf(horovod=True, skip_windows=True) def test_horovod_multi_optimizer(tmpdir): model = BasicGAN() # fit model trainer = Trainer( default_root_dir=str(tmpdir), enable_progress_bar=False, max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, strategy="horovod", ) trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" assert len(trainer.optimizers) == 2 for i, optimizer in enumerate(trainer.optimizers): assert hasattr(optimizer, "synchronize"), "optimizer has not been wrapped into DistributedOptimizer" def get_model_params(model): return set(list(model.parameters())) def get_optimizer_params(optimizer): return {p for group in optimizer.param_groups for p in group.get("params", [])} assert get_model_params(model.generator) != get_model_params(model.discriminator) assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0]) assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) # todo: need to be fixed :] @pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied") @RunIf(horovod=True, skip_windows=True) def test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. This test mirrors tests/core/test_results.py::_ddp_test_fn """ def hvd_test_fn(): path_here = os.path.abspath(os.path.dirname(__file__)) path_root = os.path.abspath(os.path.join(path_here, "..", "..")) sys.path.insert(0, os.path.abspath(path_root)) class TestModel(BoringModel): def training_step(self, batch, batch_idx): self.training_step_called = True tensor = torch.tensor([1.0]) self.log("test_tensor", tensor, sync_dist=True, reduce_fx="sum", on_step=True, on_epoch=True) res = self._results # Check that `tensor` is summed across all ranks automatically assert ( res["test_tensor"].item() == hvd.size() ), "Result-Log does not work properly with Horovod and Tensors" def training_epoch_end(self, outputs) -> None: assert len(outputs) == 0 model = TestModel() model.val_dataloader = None trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=1, log_every_n_steps=1, enable_model_summary=False, logger=False, ) trainer.fit(model) horovod.run(hvd_test_fn, np=2) # todo: need to be fixed :] @pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied") @RunIf(horovod=True, skip_windows=True, num_gpus=2, sklearn=True) def test_accuracy_metric_horovod(): from sklearn.metrics import accuracy_score num_batches = 10 batch_size = 16 threshold = 0.5 def sk_metric(preds, target): sk_preds = (preds.view(-1).numpy() >= threshold).astype(np.uint8) sk_target = target.view(-1).numpy() return accuracy_score(y_true=sk_target, y_pred=sk_preds) preds = torch.rand(num_batches, batch_size) target = torch.randint(high=2, size=(num_batches, batch_size)) def _compute_batch(): trainer = Trainer(fast_dev_run=True, strategy="horovod", logger=False) assert isinstance(trainer.accelerator, CPUAccelerator) # TODO: test that we selected the correct strategy based on horovod flags metric = Accuracy( compute_on_step=True, dist_sync_on_step=True, dist_sync_fn=trainer.strategy.all_gather, threshold=threshold, ) for i in range(hvd.rank(), num_batches, hvd.size()): batch_result = metric(preds[i], target[i]) if hvd.rank() == 0: dist_preds = torch.stack([preds[i + r] for r in range(hvd.size())]) dist_target = torch.stack([target[i + r] for r in range(hvd.size())]) sk_batch_result = sk_metric(dist_preds, dist_target) assert np.allclose(batch_result.numpy(), sk_batch_result) # check on all batches on all ranks result = metric.compute() assert isinstance(result, torch.Tensor) total_preds = torch.stack([preds[i] for i in range(num_batches)]) total_target = torch.stack([target[i] for i in range(num_batches)]) sk_result = sk_metric(total_preds, total_target) assert np.allclose(result.numpy(), sk_result) horovod.run(_compute_batch, np=2) @RunIf(horovod=True, skip_windows=True) def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx, optimizer_idx): return super().training_step(batch, batch_idx) def configure_optimizers(self): optimizer1 = optim.Adam(self.parameters(), lr=0.1) optimizer2 = optim.Adam(self.parameters(), lr=0.1) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] model = TestModel() model.training_epoch_end = None num_workers = 8 init_lr = 0.1 * num_workers with patch("horovod.torch.size", return_value=8): # fit model trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5, limit_train_batches=0.2, strategy="horovod" ) trainer.fit(model) adjusted_lr1 = [pg["lr"] for pg in trainer.optimizers[0].param_groups][0] adjusted_lr2 = [pg["lr"] for pg in trainer.optimizers[1].param_groups][0] # Called ones after end of epoch with gamma=0.1 assert pytest.approx(init_lr * 0.1) == adjusted_lr1 # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times with gamma=0.1 assert pytest.approx(init_lr * 0.1) == adjusted_lr2