2022-11-02 14:56:22 +00:00
|
|
|
# Copyright The Lightning AI team.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
2023-01-25 09:52:03 +00:00
|
|
|
import contextlib
|
2022-11-02 14:56:22 +00:00
|
|
|
import os
|
2023-10-25 00:51:11 +00:00
|
|
|
import subprocess
|
2024-01-26 16:44:24 +00:00
|
|
|
import sys
|
2023-01-25 09:52:03 +00:00
|
|
|
from io import StringIO
|
2022-11-02 14:56:22 +00:00
|
|
|
from unittest import mock
|
|
|
|
from unittest.mock import Mock
|
|
|
|
|
|
|
|
import pytest
|
2024-03-04 13:01:33 +00:00
|
|
|
from lightning.fabric.cli import _consolidate, _get_supported_strategies, _run
|
ruff: replace isort with ruff +TPU (#17684)
* ruff: replace isort with ruff
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fixing & imports
* lines in warning test
* docs
* fix enum import
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fixing
* import
* fix lines
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* type ClusterEnvironment
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-09-26 15:54:55 +00:00
|
|
|
|
2023-03-03 16:55:48 +00:00
|
|
|
from tests_fabric.helpers.runif import RunIf
|
2022-11-02 14:56:22 +00:00
|
|
|
|
|
|
|
|
2023-05-04 15:50:39 +00:00
|
|
|
@pytest.fixture()
def fake_script(tmp_path):
    """Create an empty ``script.py`` inside ``tmp_path`` and return its path as a string."""
    script_path = tmp_path.joinpath("script.py")
    # The file only has to exist; its content is never written here.
    script_path.touch()
    return str(script_path)
|
|
|
|
|
|
|
|
|
2022-11-02 14:56:22 +00:00
|
|
|
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
def test_run_env_vars_defaults(monkeypatch, fake_script):
    """Check the ``LT_*`` environment variables after running the CLI with only a script argument."""
    # Replace the torchrun module with a mock for the duration of the test.
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())

    with pytest.raises(SystemExit) as exit_info:
        _run.main([fake_script])
    assert exit_info.value.code == 0

    # Variables expected to be exported with a fixed value.
    assert os.environ["LT_CLI_USED"] == "1"
    assert os.environ["LT_DEVICES"] == "1"
    assert os.environ["LT_NUM_NODES"] == "1"

    # Variables expected to stay unset when the matching option is not passed.
    for unset_name in ("LT_ACCELERATOR", "LT_STRATEGY", "LT_PRECISION"):
        assert unset_name not in os.environ
|
2022-11-02 14:56:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("accelerator", ["cpu", "gpu", "cuda", pytest.param("mps", marks=RunIf(mps=True))])
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_run_env_vars_accelerator(_, accelerator, monkeypatch, fake_script):
    """Check that the ``--accelerator`` value is exported unchanged as ``LT_ACCELERATOR``.

    ``_`` receives the mock that ``mock.patch`` injects for ``num_cuda_devices``; it is not used.

    """
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())
    cli_args = [fake_script, "--accelerator", accelerator]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)

    assert exit_info.value.code == 0
    assert os.environ["LT_ACCELERATOR"] == accelerator
|
|
|
|
|
|
|
|
|
2023-01-25 09:52:03 +00:00
|
|
|
@pytest.mark.parametrize("strategy", _get_supported_strategies())
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_run_env_vars_strategy(_, strategy, monkeypatch, fake_script):
    """Check that every supported ``--strategy`` value is exported unchanged as ``LT_STRATEGY``.

    ``_`` receives the mock that ``mock.patch`` injects for ``num_cuda_devices``; it is not used.

    """
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())
    cli_args = [fake_script, "--strategy", strategy]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)

    assert exit_info.value.code == 0
    assert os.environ["LT_STRATEGY"] == strategy
|
|
|
|
|
|
|
|
|
2024-03-04 13:01:33 +00:00
|
|
|
def test_run_get_supported_strategies():
    """Pin the set of strategies the CLI accepts.

    The exact count is asserted so that adding a new strategy forces a decision on whether the CLI's list of
    supported strategies needs updating.

    """
    supported = _get_supported_strategies()
    assert len(supported) == 8
    for required in ("fsdp", "ddp_find_unused_parameters_true"):
        assert required in supported
|
2023-01-25 09:52:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("strategy", ["ddp_spawn", "ddp_fork", "ddp_notebook", "deepspeed_stage_3_offload"])
def test_run_env_vars_unsupported_strategy(strategy, fake_script):
    """Check that an unsupported ``--strategy`` exits with code 2 and reports the invalid value on stderr."""
    captured_stderr = StringIO()
    cli_args = [fake_script, "--strategy", strategy]

    with pytest.raises(SystemExit) as exit_info, contextlib.redirect_stderr(captured_stderr):
        _run.main(cli_args)

    assert exit_info.value.code == 2
    expected_error = f"Invalid value for '--strategy': '{strategy}'"
    assert expected_error in captured_stderr.getvalue()
|
|
|
|
|
|
|
|
|
2022-11-02 14:56:22 +00:00
|
|
|
@pytest.mark.parametrize("devices", ["1", "2", "0,", "1,0", "-1"])
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_run_env_vars_devices_cuda(_, devices, monkeypatch, fake_script):
    """Check that the raw ``--devices`` string is exported as ``LT_DEVICES`` with the CUDA accelerator.

    ``num_cuda_devices`` is patched to return 2; ``_`` is the injected mock and is not used.

    """
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())
    cli_args = [fake_script, "--accelerator", "cuda", "--devices", devices]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)

    assert exit_info.value.code == 0
    # The value is compared verbatim against the string passed on the command line.
    assert os.environ["LT_DEVICES"] == devices
|
|
|
|
|
|
|
|
|
|
|
|
@RunIf(mps=True)
@pytest.mark.parametrize("accelerator", ["mps", "gpu"])
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
def test_run_env_vars_devices_mps(accelerator, monkeypatch, fake_script):
    """Check that ``LT_DEVICES`` is ``"1"`` when the ``mps`` or ``gpu`` accelerator is requested without devices."""
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())
    cli_args = [fake_script, "--accelerator", accelerator]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)

    assert exit_info.value.code == 0
    assert os.environ["LT_DEVICES"] == "1"
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("num_nodes", ["1", "2", "3"])
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
def test_run_env_vars_num_nodes(num_nodes, monkeypatch, fake_script):
    """Check that the ``--num-nodes`` value is exported unchanged as ``LT_NUM_NODES``."""
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())
    cli_args = [fake_script, "--num-nodes", num_nodes]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)

    assert exit_info.value.code == 0
    assert os.environ["LT_NUM_NODES"] == num_nodes
|
|
|
|
|
|
|
|
|
2023-02-17 10:41:18 +00:00
|
|
|
@pytest.mark.parametrize("precision", ["64-true", "64", "32-true", "32", "16-mixed", "bf16-mixed"])
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
def test_run_env_vars_precision(precision, monkeypatch, fake_script):
    """Check that the ``--precision`` value is exported unchanged as ``LT_PRECISION``."""
    monkeypatch.setitem(sys.modules, "torch.distributed.run", Mock())
    cli_args = [fake_script, "--precision", precision]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)

    assert exit_info.value.code == 0
    assert os.environ["LT_PRECISION"] == precision
|
|
|
|
|
|
|
|
|
|
|
|
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
def test_run_torchrun_defaults(monkeypatch, fake_script):
    """Check the argument list handed to the mocked torchrun ``main`` when only a script is given."""
    fake_torchrun = Mock()
    monkeypatch.setitem(sys.modules, "torch.distributed.run", fake_torchrun)

    with pytest.raises(SystemExit) as exit_info:
        _run.main([fake_script])
    assert exit_info.value.code == 0

    # Arguments the most recent call to the mocked ``main`` must have received.
    expected_args = [
        "--nproc_per_node=1",
        "--nnodes=1",
        "--node_rank=0",
        "--master_addr=127.0.0.1",
        "--master_port=29400",
        fake_script,
    ]
    fake_torchrun.main.assert_called_with(expected_args)
|
2022-11-02 14:56:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("devices", "expected"),
    [("1", 1), ("2", 2), ("0,", 1), ("1,0,2", 3), ("-1", 5)],
)
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=5)
def test_run_torchrun_num_processes_launched(_, devices, expected, monkeypatch, fake_script):
    """Check the ``--nproc_per_node`` value passed to the mocked torchrun for several ``--devices`` strings.

    ``num_cuda_devices`` is patched to return 5; ``_`` is the injected mock and is not used.

    """
    fake_torchrun = Mock()
    monkeypatch.setitem(sys.modules, "torch.distributed.run", fake_torchrun)
    cli_args = [fake_script, "--accelerator", "cuda", "--devices", devices]

    with pytest.raises(SystemExit) as exit_info:
        _run.main(cli_args)
    assert exit_info.value.code == 0

    expected_args = [
        f"--nproc_per_node={expected}",
        "--nnodes=1",
        "--node_rank=0",
        "--master_addr=127.0.0.1",
        "--master_port=29400",
        fake_script,
    ]
    fake_torchrun.main.assert_called_with(expected_args)
|
2023-10-25 00:51:11 +00:00
|
|
|
|
|
|
|
|
2024-03-04 13:01:33 +00:00
|
|
|
def test_run_through_fabric_entry_point():
    """Invoke ``fabric run --help`` in a subprocess and check that the usage line is printed."""
    completed = subprocess.run("fabric run --help", capture_output=True, text=True, shell=True)

    usage_line = "Usage: fabric run [OPTIONS] SCRIPT [SCRIPT_ARGS]"
    # The usage text is accepted on either stream.
    assert usage_line in result_streams[0] or usage_line in result_streams[1] if (
        result_streams := (completed.stdout, completed.stderr)
    ) else False
|
|
|
|
|
2024-02-14 22:49:11 +00:00
|
|
|
|
2023-10-25 00:51:11 +00:00
|
|
|
# NOTE(review): the skip condition compares two string literals; presumably packaging tooling rewrites one of
# them for the standalone package - confirm.
@pytest.mark.skipif("lightning.fabric" == "lightning_fabric", reason="standalone package")
def test_run_through_lightning_entry_point():
    """Invoke ``lightning run model --help`` in a subprocess and check the deprecation notice and usage line."""
    completed = subprocess.run("lightning run model --help", capture_output=True, text=True, shell=True)

    expected_deprecation = (
        "`lightning run model` is deprecated and will be removed in future versions. "
        "Please call `fabric run` instead"
    )
    usage_line = "Usage: lightning run [OPTIONS] SCRIPT [SCRIPT_ARGS]"

    # The deprecation notice is checked on stdout only; the usage text is accepted on either stream.
    assert expected_deprecation in completed.stdout
    assert usage_line in completed.stdout or usage_line in completed.stderr
|
2024-03-04 13:01:33 +00:00
|
|
|
|
|
|
|
|
|
|
|
@mock.patch("lightning.fabric.cli._process_cli_args")
@mock.patch("lightning.fabric.cli._load_distributed_checkpoint")
@mock.patch("lightning.fabric.cli.torch.save")
def test_consolidate(save_mock, _, __, tmp_path):
    """Exercise the consolidate command with a missing path and with an existing checkpoint folder.

    ``save_mock`` replaces ``torch.save``; ``_`` and ``__`` are the mocks injected for
    ``_load_distributed_checkpoint`` and ``_process_cli_args`` and are not used directly.

    """
    # Case 1: a path that does not exist is rejected with exit code 2 and an error on stderr.
    captured_stderr = StringIO()
    with pytest.raises(SystemExit) as exit_info, contextlib.redirect_stderr(captured_stderr):
        _consolidate.main(["not exist"])
    assert exit_info.value.code == 2
    assert "Path 'not exist' does not exist" in captured_stderr.getvalue()

    # Case 2: an existing folder is accepted and the patched ``torch.save`` is called exactly once.
    checkpoint_dir = tmp_path / "checkpoint"
    checkpoint_dir.mkdir()
    captured_stderr = StringIO()
    with pytest.raises(SystemExit) as exit_info, contextlib.redirect_stderr(captured_stderr):
        _consolidate.main([str(checkpoint_dir)])
    assert exit_info.value.code == 0
    save_mock.assert_called_once()
|