lightning/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py

# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from multiprocessing import Process
from unittest import mock
from unittest.mock import ANY, Mock, call, patch

import pytest
import torch
from lightning.fabric.plugins import ClusterEnvironment
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.strategies import DDPStrategy
from lightning.pytorch.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher
from lightning.pytorch.trainer.states import TrainerFn

from tests_pytorch.helpers.runif import RunIf


@mock.patch("lightning.pytorch.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[])
def test_multiprocessing_launcher_forking_on_unsupported_platform(_):
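    """Test that an error is raised when the 'fork' start method is not available on the current platform."""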
with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"):
_MultiProcessingLauncher(strategy=Mock(), start_method="fork")
@pytest.mark.parametrize("start_method", ["spawn", pytest.param("fork", marks=RunIf(standalone=True))])
@mock.patch("lightning.pytorch.strategies.launchers.multiprocessing.mp")
def test_multiprocessing_launcher_start_method(mp_mock, start_method):
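    """Test that the launcher starts the processes with the configured start method."""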
mp_mock.get_all_start_methods.return_value = [start_method]
launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method)
launcher.launch(function=Mock())
mp_mock.get_context.assert_called_with(start_method)
mp_mock.start_processes.assert_called_with(
ANY,
args=ANY,
nprocs=ANY,
start_method=start_method,
join=False,
)
@pytest.mark.parametrize("start_method", ["spawn", pytest.param("fork", marks=RunIf(standalone=True))])
@mock.patch("lightning.pytorch.strategies.launchers.multiprocessing.mp")
def test_multiprocessing_launcher_restore_globals(mp_mock, start_method):
"""Test that we pass the global state snapshot to the worker function only if we are starting with 'spawn'."""
mp_mock.get_all_start_methods.return_value = [start_method]
launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method)
launcher.launch(function=Mock())
function_args = mp_mock.start_processes.call_args[1]["args"]
if start_method == "spawn":
assert len(function_args) == 6
assert isinstance(function_args[5], _GlobalStateSnapshot)
else:
assert len(function_args) == 5


def test_global_state_snapshot():
"""Test the capture() and restore() methods for the global state snapshot."""
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.benchmark = False
torch.manual_seed(123)
# capture the state of globals
snapshot = _GlobalStateSnapshot.capture()
# simulate there is a process boundary and flags get reset here
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.benchmark = True
torch.manual_seed(321)
# restore the state of globals
snapshot.restore()
assert torch.are_deterministic_algorithms_enabled()
assert not torch.backends.cudnn.benchmark
assert torch.initial_seed() == 123
@pytest.mark.parametrize("trainer_fn", [TrainerFn.FITTING, "other"])
@pytest.mark.parametrize("fake_node_rank", [0, 1])
@pytest.mark.parametrize("fake_local_rank", [0, 1])
def test_collect_rank_zero_results(trainer_fn, fake_node_rank, fake_local_rank, tmpdir):
"""Tests that the spawn strategy transfers the new weights to the main process and deletes the temporary file."""
model = Mock(wraps=BoringModel(), spec=BoringModel)
fake_global_rank = 2 * fake_node_rank + fake_local_rank
cluster_environment = Mock(spec=ClusterEnvironment)
cluster_environment.world_size.return_value = 4
cluster_environment.node_rank.return_value = fake_node_rank
cluster_environment.local_rank.return_value = fake_local_rank
cluster_environment.global_rank.return_value = fake_global_rank
strategy = DDPStrategy(cluster_environment=cluster_environment, start_method="spawn")
strategy._local_rank = fake_local_rank
launcher = _MultiProcessingLauncher(strategy=strategy)
trainer = Trainer(accelerator="cpu", default_root_dir=tmpdir, strategy=strategy)
assert strategy.node_rank == fake_node_rank
assert strategy.local_rank == fake_local_rank
assert strategy.global_rank == fake_global_rank
trainer.strategy.connect(model)
trainer.state.fn = trainer_fn # pretend we are in a particular trainer state
spawn_output = launcher._collect_rank_zero_results(trainer, {})
model.state_dict.assert_called_once()
is_fitting = trainer_fn == TrainerFn.FITTING
if strategy.local_rank == 0:
# on local rank 0 (each node), we expect a temp checkpoint (when fitting)
assert not is_fitting or spawn_output.weights_path.endswith(".temp.ckpt")
assert not is_fitting or os.path.isfile(spawn_output.weights_path)
assert is_fitting or spawn_output.weights_path is None
else:
# all other ranks don't have outputs (rank 0 needs to handle the output)
assert spawn_output is None
@pytest.mark.parametrize("trainer_fn", [TrainerFn.FITTING, "other"])
def test_transfer_weights(tmpdir, trainer_fn):
"""Tests that the multiprocessing launcher transfers the new weights to the main process and deletes the temporary
file."""
model = Mock(wraps=BoringModel(), spec=BoringModel)
strategy = DDPStrategy(start_method="spawn")
trainer = Trainer(accelerator="cpu", default_root_dir=tmpdir, strategy=strategy)
trainer.strategy.connect(model)
trainer.state.fn = trainer_fn # pretend we are in a particular trainer state
spawn_output = strategy._launcher._collect_rank_zero_results(trainer, {})
model.state_dict.assert_called_once()
if trainer_fn == TrainerFn.FITTING:
assert spawn_output.weights_path.endswith(".temp.ckpt")
assert os.path.isfile(spawn_output.weights_path)
else:
assert spawn_output.weights_path is None
# <-- here would normally be the multiprocessing boundary
strategy._launcher._recover_results_in_main_process(spawn_output, trainer)
assert model.load_state_dict.call_count == int(spawn_output.weights_path is not None)


def test_non_strict_loading(tmpdir):
"""Tests that the multiprocessing launcher loads the weights back into the main process but with strict loading
disabled, not erroring for missing keys."""
model = Mock(wraps=BoringModel(), spec=BoringModel)
strategy = DDPStrategy(start_method="spawn")
trainer = Trainer(accelerator="cpu", default_root_dir=tmpdir, strategy=strategy)
trainer.strategy.connect(model)
trainer.state.fn = TrainerFn.FITTING # state dict loading only relevant for the FITTING case
spawn_output = strategy._launcher._collect_rank_zero_results(trainer, {})
# <-- here would normally be the multiprocessing boundary
strategy._launcher._recover_results_in_main_process(spawn_output, trainer)
model.load_state_dict.assert_called_once_with(ANY, strict=False)


def test_kill():
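    """Test that the launcher forwards the signal to all of its processes."""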
launcher = _MultiProcessingLauncher(Mock())
proc0 = Mock(autospec=Process)
proc1 = Mock(autospec=Process)
launcher.procs = [proc0, proc1]
with patch("os.kill") as kill_patch:
launcher.kill(15)
assert kill_patch.mock_calls == [call(proc0.pid, 15), call(proc1.pid, 15)]


class SimpleModel(BoringModel):
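    """A model with tied weights and a buffer, used to check that memory sharing is disabled in the workers."""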
def __init__(self):
super().__init__()
self.tied_layer = torch.nn.Linear(32, 2)
self.tied_layer.weight = self.layer.weight
self.register_buffer("buffer", torch.ones(3))
def on_fit_start(self) -> None:
assert not self.layer.weight.is_shared()
assert not self.tied_layer.weight.is_shared()
assert not self.buffer.is_shared()
# weights remain tied
assert self.layer.weight.data_ptr() == self.tied_layer.weight.data_ptr()
assert torch.equal(self.layer.weight.data, self.tied_layer.weight.data)


def test_memory_sharing_disabled():
"""Test that the multiprocessing launcher disables memory sharing on model parameters and buffers to avoid race
conditions on model updates."""
model = SimpleModel()
assert not model.layer.weight.is_shared()
assert model.layer.weight.data_ptr() == model.tied_layer.weight.data_ptr()
trainer = Trainer(accelerator="cpu", devices=2, strategy="ddp_spawn", max_steps=0)
trainer.fit(model)


def test_check_for_missing_main_guard():
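    """Test that an error is raised when the user's script is missing the if __name__ == "__main__" guard."""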
launcher = _MultiProcessingLauncher(strategy=Mock(), start_method="spawn")
with mock.patch(
"lightning.pytorch.strategies.launchers.multiprocessing.mp.current_process",
return_value=Mock(_inheriting=True), # pretend that main is importing itself
), pytest.raises(RuntimeError, match="requires that your script guards the main"):
launcher.launch(function=Mock())


def test_fit_twice_raises():
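    """Test that calling fit() twice on the same spawn-based trainer raises an error."""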
model = BoringModel()
trainer = Trainer(
limit_train_batches=1,
limit_test_batches=1,
num_sanity_val_steps=0,
max_epochs=1,
strategy="ddp_spawn",
barebones=True,
)
trainer.fit(model)
    trainer.test(model)  # make sure testing in between doesn't impact the result
trainer.fit_loop.max_epochs += 1
with pytest.raises(NotImplementedError, match=r"twice.*is not supported"):
trainer.fit(model)