import multiprocessing as mp
import os
import time
from typing import Any
from unittest import mock
from unittest.mock import PropertyMock

import pytest
import torch
from torch.optim import Optimizer

import pytorch_lightning as pl
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.strategies import HivemindStrategy
from pytorch_lightning.strategies.hivemind import HiveMindScheduler
from pytorch_lightning.utilities import _HIVEMIND_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.types import STEP_OUTPUT
from tests_pytorch.helpers.runif import RunIf

if _HIVEMIND_AVAILABLE:
    import hivemind


@mock.patch("pytorch_lightning.strategies.hivemind._HIVEMIND_AVAILABLE", False)
def test_raise_exception_if_hivemind_unavailable():
    """Test that we raise an exception when Hivemind is not available."""
    with pytest.raises(MisconfigurationException, match="you must have Hivemind installed"):
        HivemindStrategy(target_batch_size=1)


@RunIf(hivemind=True)
@mock.patch("hivemind.DHT", autospec=True)
def test_strategy(mock_dht):
    """Test that the strategy passed to the Trainer is set as the Trainer's strategy."""
    strategy = HivemindStrategy(target_batch_size=1)
    trainer = pl.Trainer(strategy=strategy)
    assert trainer.strategy == strategy


@RunIf(hivemind=True)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
def test_optimizer_wrapped():
    """Test that the optimizer is wrapped with `hivemind.Optimizer`."""

    class TestModel(BoringModel):
        def on_before_backward(self, loss: torch.Tensor) -> None:
            optimizer = self.trainer.optimizers[0]
            assert isinstance(optimizer, hivemind.Optimizer)

    model = TestModel()
    trainer = pl.Trainer(strategy=HivemindStrategy(target_batch_size=1), fast_dev_run=True)
    trainer.fit(model)


@RunIf(hivemind=True)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
def test_scheduler_wrapped():
    """Test that the scheduler is wrapped with `HiveMindScheduler`."""

    class TestModel(BoringModel):
        def on_before_backward(self, loss: torch.Tensor) -> None:
            scheduler = self.trainer.lr_scheduler_configs[0].scheduler
            assert isinstance(scheduler, HiveMindScheduler)

        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            return [optimizer], [torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)]

    model = TestModel()
    trainer = pl.Trainer(
        strategy=HivemindStrategy(target_batch_size=1),
        fast_dev_run=True,
    )
    trainer.fit(model)


@RunIf(hivemind=True)
@mock.patch.dict(
    os.environ,
    {
        "HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor",
        "PL_INITIAL_PEERS": "TEST_PEERS",
    },
    clear=True,
)
@mock.patch("hivemind.DHT", autospec=True)
def test_env_variables_parsed(mock_dht):
    """Test that env variables are parsed correctly."""
    strategy = HivemindStrategy(target_batch_size=1)
    assert strategy._initial_peers == ["TEST_PEERS"]


@RunIf(hivemind=True)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
def test_reuse_grad_buffers_warning():
    """Test to ensure we warn when a user overrides `optimizer_zero_grad` and `reuse_grad_buffers` is True."""

    class TestModel(BoringModel):
        def on_before_backward(self, loss: torch.Tensor) -> None:
            optimizer = self.trainer.optimizers[0]
            assert isinstance(optimizer, hivemind.Optimizer)

        def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
            pass

    model = TestModel()
    trainer = pl.Trainer(strategy=HivemindStrategy(target_batch_size=1, reuse_grad_buffers=True), fast_dev_run=True)

    with pytest.warns(UserWarning, match="You have overridden `optimizer_zero_grad` which will be disabled."):
        trainer.fit(model)


@RunIf(hivemind=True)
def test_raise_exception_multiple_optimizers():
    """Test that we raise an exception when multiple optimizers are provided."""

    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
            return [optimizer, optimizer], [lr_scheduler]

    model = TestModel()
    trainer = pl.Trainer(strategy=HivemindStrategy(target_batch_size=1), fast_dev_run=True)

    with pytest.raises(MisconfigurationException, match="Hivemind only supports training with one optimizer."):
        trainer.fit(model)


@RunIf(hivemind=True)
@mock.patch("pytorch_lightning.utilities.data._extract_batch_size", autospec=True, return_value=[None])
def test_raise_exception_no_batch_size(mock_extract_batch_size):
    """Test that we raise an exception when no batch size is automatically found."""
    model = BoringModel()
    trainer = pl.Trainer(strategy=HivemindStrategy(target_batch_size=1), fast_dev_run=True)

    with pytest.raises(MisconfigurationException, match="Please provide the batch size to the Strategy."):
        trainer.fit(model)


@RunIf(hivemind=True)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
@pytest.mark.parametrize(
    "delay_grad_averaging, delay_state_averaging, delay_optimizer_step",
    [(True, True, True), (False, True, False)],
)
def test_warn_if_argument_passed(delay_grad_averaging, delay_state_averaging, delay_optimizer_step):
    """Test that a valid combination of HiveMind delay arguments warns if the scheduler isn't passed in as a
    function."""
    model = BoringModel()
    trainer = pl.Trainer(
        strategy=HivemindStrategy(
            target_batch_size=1,
            delay_grad_averaging=delay_grad_averaging,
            delay_state_averaging=delay_state_averaging,
            delay_optimizer_step=delay_optimizer_step,
        ),
        fast_dev_run=True,
    )

    with pytest.warns(UserWarning, match="requires a `scheduler_fn` to be passed to the strategy"):
        trainer.fit(model)


@RunIf(hivemind=True)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
@mock.patch("pytorch_lightning.strategies.hivemind.HivemindStrategy.num_peers", new_callable=PropertyMock)
def test_args_passed_to_optimizer(mock_peers):
    """Test to ensure arguments are correctly passed to the hivemind optimizer wrapper."""
    mock_peers.return_value = 1
    compression = hivemind.ScaledFloat16Compression()
    with mock.patch("hivemind.Optimizer", wraps=hivemind.Optimizer) as mock_optimizer:

        class TestModel(BoringModel):
            def on_before_backward(self, loss: torch.Tensor) -> None:
                args, kwargs = mock_optimizer.call_args
                mock_optimizer.assert_called()
                arguments = dict(
                    delay_optimizer_step=True,
                    delay_state_averaging=True,
                    state_averaging_compression=compression,
                    grad_compression=compression,
                    offload_optimizer=True,
                    reuse_grad_buffers=True,
                    target_batch_size=1,
                )

                for key, value in arguments.items():
                    assert key in kwargs
                    assert value == kwargs[key]

        model = TestModel()
        trainer = pl.Trainer(
            strategy=HivemindStrategy(
                target_batch_size=1,
                reuse_grad_buffers=True,
                delay_state_averaging=True,
                delay_optimizer_step=True,
                offload_optimizer=True,
                grad_compression=compression,
                state_averaging_compression=compression,
            ),
            fast_dev_run=True,
        )
        trainer.fit(model)
        # ensures that after training with `reuse_grad_buffers` we restore the hook
        assert model.optimizer_zero_grad is not None


@RunIf(hivemind=True)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
@pytest.mark.parametrize(
    "host_maddrs,expected_maddrs",
    [(None, ["/ip4/0.0.0.0/tcp/0", "/ip4/0.0.0.0/udp/0/quic"]), (["/ip4/127.0.0.1/tcp/0"], ["/ip4/127.0.0.1/tcp/0"])],
)
def test_maddrs(host_maddrs, expected_maddrs):
    """Test that the multiple addresses are correctly assigned."""
    strategy = HivemindStrategy(target_batch_size=1, host_maddrs=host_maddrs)
    assert strategy.dht.kwargs["host_maddrs"] == expected_maddrs


def _run_collab_training_fn(initial_peers, wait_seconds, barrier, recorded_process_peers, recorded_process_steps):
    """Run collaborative training in a subprocess, recording the observed peer counts and optimizer steps."""
    recorded_peers = []
    recorded_global_steps = []

    class TestModel(BoringModel):
        def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int, unused: int = 0) -> None:
            time.sleep(wait_seconds)  # add an additional delay to give processes time to sync
            recorded_peers.append(self.trainer.strategy.num_peers)
            recorded_global_steps.append(self.trainer.optimizers[0].local_epoch)

        def on_train_end(self) -> None:
            # wait for all processes to get to the end of training before teardown
            barrier.wait()

    model = TestModel()
    trainer = pl.Trainer(
        max_epochs=1,
        limit_train_batches=16,
        limit_val_batches=0,
        strategy=HivemindStrategy(
            delay_state_averaging=True,
            offload_optimizer=True,
            delay_optimizer_step=True,
            delay_grad_averaging=True,
            target_batch_size=8,
            initial_peers=initial_peers,
            verbose=False,
        ),
    )
    trainer.fit(model)

    recorded_process_peers.append(recorded_peers)
    recorded_process_steps.append(recorded_global_steps)


# TODO: check why it fails with PT 1.12
@RunIf(hivemind=True, max_torch="1.12")
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
@pytest.mark.parametrize(
    "num_processes, wait_seconds",
    [(2, 0.25)],
)
def test_multiple_peers(num_processes, wait_seconds):
    """Test to ensure that if we have two running processes with the same peers, they connect and train
    successfully."""
    dht_root = hivemind.DHT(start=True)
    barrier = mp.Barrier(num_processes)
    initial_peers = dht_root.get_visible_maddrs()

    with mp.Manager() as manager:
        # allows processes to return their recorded logged peers/steps
        recorded_process_peers = manager.list()
        recorded_process_steps = manager.list()
        processes = [
            mp.Process(
                target=_run_collab_training_fn,
                kwargs=dict(
                    initial_peers=initial_peers,
                    wait_seconds=wait_seconds,
                    barrier=barrier,
                    recorded_process_peers=recorded_process_peers,
                    recorded_process_steps=recorded_process_steps,
                ),
            )
            for x in range(num_processes)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
        # assert that peers increase as expected and we run at least 1 global step.
        for process_peers, process_steps in zip(recorded_process_peers, recorded_process_steps):
            assert any(num_peer == num_processes for num_peer in process_peers)
            assert any(global_step > 0 for global_step in process_steps)


@RunIf(hivemind=True, min_cuda_gpus=1)
@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True)
def test_scaler_updated_precision_16():
    """Test that the precision plugin's scaler is replaced with the `hivemind.GradScaler`."""

    class TestModel(BoringModel):
        def on_fit_start(self) -> None:
            assert isinstance(self.trainer.precision_plugin.scaler, hivemind.GradScaler)
            raise SystemExit

    model = TestModel()
    trainer = pl.Trainer(
        strategy=HivemindStrategy(target_batch_size=1),
        fast_dev_run=True,
        precision=16,
        accelerator="gpu",
        devices=1,
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)