lightning/tests/tests_pytorch/plugins/test_cluster_integration.py

# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from unittest import mock

import pytest
import torch

from lightning.fabric.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment
from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DDPStrategy, DeepSpeedStrategy
from lightning.pytorch.utilities.rank_zero import rank_zero_only
from tests_pytorch.helpers.runif import RunIf


def environment_combinations():
    """Yield one ``(environment, variables, expected)`` triple per supported cluster environment.

    Each triple consists of:

    * ``environment``: a freshly constructed cluster environment instance
      (``LightningEnvironment``, ``SLURMEnvironment`` or ``TorchElasticEnvironment``).
    * ``variables``: the environment variables describing the simulated launch for that
      cluster type; callers are expected to patch them into ``os.environ``.
    * ``expected``: the rank information the caller should observe, with the keys
      ``global_rank``, ``local_rank``, ``node_rank`` and ``world_size``.

    Every yielded ``expected`` is an independent copy, so a consumer that adjusts it for one
    combination cannot silently change the expectations of the combinations that follow.

    """
    expected = {"global_rank": 3, "local_rank": 1, "node_rank": 1, "world_size": 4}
    # Lightning
    # NOTE(review): WORLD_SIZE ("8") differs from the expected world_size (4); presumably the
    # strategy is expected to override the variable - confirm.
    variables = {"CUDA_VISIBLE_DEVICES": "0,1,2,4", "LOCAL_RANK": "1", "NODE_RANK": "1", "WORLD_SIZE": "8"}
    environment = LightningEnvironment()
    yield environment, variables, dict(expected)
    # SLURM
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
        "SLURM_NTASKS_PER_NODE": "2",
    }
    environment = SLURMEnvironment()
    yield environment, variables, dict(expected)
    # TorchElastic
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
        "TORCHELASTIC_RUN_ID": "1",
    }
    environment = TorchElasticEnvironment()
    yield environment, variables, dict(expected)


@RunIf(mps=False)
@pytest.mark.parametrize(
    "strategy_cls",
    [DDPStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))],
)
@mock.patch("lightning.pytorch.accelerators.cuda.CUDAAccelerator.is_available", return_value=True)
def test_ranks_available_manual_strategy_selection(_, strategy_cls):
    """Rank attributes must be populated right after building a Trainer with an explicit strategy object."""
    for env, env_vars, want in environment_combinations():
        # Simulate the launch by exposing the cluster-specific variables for this combination only.
        with mock.patch.dict(os.environ, env_vars):
            devices = [torch.device("cuda", index) for index in (1, 2)]
            trainer = Trainer(
                strategy=strategy_cls(parallel_devices=devices, cluster_environment=env),
                num_nodes=2,
            )
            # The global rank-zero helper has to agree with the trainer's global rank.
            assert rank_zero_only.rank == want["global_rank"]
            for attribute in ("global_rank", "local_rank", "node_rank", "world_size"):
                assert getattr(trainer, attribute) == want[attribute]


@pytest.mark.parametrize(
    "trainer_kwargs",
    [
        {"strategy": "ddp", "accelerator": "cpu", "devices": 2},
        {"strategy": "ddp_spawn", "accelerator": "cpu", "devices": 2},
        pytest.param({"strategy": "ddp", "accelerator": "gpu", "devices": [1, 2]}, marks=RunIf(mps=False)),
        pytest.param({"strategy": "ddp_spawn", "accelerator": "gpu", "devices": [1, 2]}, marks=RunIf(mps=False)),
    ],
)
def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwargs):
    """Test that the rank information is readily available after Trainer initialization.

    The strategy is selected by name through the Trainer arguments and the cluster environment
    is expected to be picked according to the patched environment variables.

    """
    num_nodes = 2
    # Build a new dict rather than updating in place: the incoming object belongs to the
    # parametrization and must not be modified by the test body.
    trainer_kwargs = {**trainer_kwargs, "num_nodes": num_nodes}

    for cluster, variables, expected in environment_combinations():
        if trainer_kwargs["strategy"] == "ddp_spawn":
            if isinstance(cluster, (SLURMEnvironment, TorchElasticEnvironment)):
                # slurm and torchelastic do not work with spawn strategies
                continue
            # when using spawn, we don't reach rank > 0 until we call Trainer.fit()
            # LOCAL_RANK is only set after we spawned
            if "LOCAL_RANK" not in variables:
                # Rebind to an adjusted copy so the override stays local to this combination
                # instead of leaking into the dict the generator handed out.
                expected = {**expected, "global_rank": expected["node_rank"] * 2, "local_rank": 0}

        with mock.patch.dict(os.environ, variables):
            trainer = Trainer(**trainer_kwargs)
            assert type(trainer.strategy.cluster_environment) is type(cluster)
            assert rank_zero_only.rank == expected["global_rank"]
            assert trainer.global_rank == expected["global_rank"]
            assert trainer.local_rank == expected["local_rank"]
            assert trainer.node_rank == expected["node_rank"]
            assert trainer.world_size == expected["world_size"]
[bugfix] Apex never instantiated. (#7274) * update * update * update apex * update * update * update * remove test.py * update * update * update on comments * update changelog * update * update * typo 2021-04-30 17:16:28 +00:00			`# Copyright The Lightning AI team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`import os`
			`from unittest import mock`

			`import pytest`
			`import torch`

tests: switch imports for fabric (#16592) 2023-02-01 20:34:38 +00:00			`from lightning.fabric.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment`
tests: switch imports for pytorch (#16595) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2023-02-02 10:06:45 +00:00			`from lightning.pytorch import Trainer`
			`from lightning.pytorch.strategies import DDPStrategy, DeepSpeedStrategy`
			`from lightning.pytorch.utilities.rank_zero import rank_zero_only`
Future 4/n: test & legacy in test/ folder (#13295) * move: legacy >> test/ * move: tests >> test/ * rename unittests * update CI * tests4pl * tests_pytorch * proxi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci * link * cli * standalone * fixing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * . * Apply suggestions from code review Co-authored-by: Akihiro Nitta <nitta@akihironitta.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * alone * test -> tests * Standalone fixes * ci * Update * More fixes * Fix coverage * Fix mypy * mypy * Empty-Commit * Fix * mypy just for pl * Fix standalone Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> 2022-06-15 22:10:49 +00:00			`from tests_pytorch.helpers.runif import RunIf`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00

			`def environment_combinations():`
ruff: fixing flake8-comprehensions (#17385) 2023-04-21 09:07:58 +00:00			`expected = {"global_rank": 3, "local_rank": 1, "node_rank": 1, "world_size": 4}`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`# Lightning`
Replace `yapf` with `black` (#7783) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-26 11:37:35 +00:00			`variables = {"CUDA_VISIBLE_DEVICES": "0,1,2,4", "LOCAL_RANK": "1", "NODE_RANK": "1", "WORLD_SIZE": "8"}`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`environment = LightningEnvironment()`
			`yield environment, variables, expected`
			`# SLURM`
			`variables = {`
			`"CUDA_VISIBLE_DEVICES": "0,1,2,4",`
			`"SLURM_JOB_NAME": "SOME_NAME",`
			`"SLURM_LOCALID": "1",`
			`"SLURM_NODEID": "1",`
			`"SLURM_PROCID": "3",`
			`"SLURM_NTASKS": "4",`
Validate SRUN variables when launching in SLURM (#15011) 2022-10-19 21:42:11 +00:00			`"SLURM_NTASKS_PER_NODE": "2",`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`}`
			`environment = SLURMEnvironment()`
			`yield environment, variables, expected`
			`# TorchElastic`
			`variables = {`
			`"CUDA_VISIBLE_DEVICES": "0,1,2,4",`
			`"LOCAL_RANK": "1",`
			`"GROUP_RANK": "1",`
			`"RANK": "3",`
			`"WORLD_SIZE": "4",`
			`"LOCAL_WORLD_SIZE": "2",`
Refactor `TorchElasticEnvironment.detect` to use `torch.distributed.is_torchelastic_launched` (#12376) * Refactor TorchElasticEnvironment.detect to use native utility from torch.distributed * fix version and tests * fix version * Update tests/accelerators/test_accelerator_connector.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2022-03-21 15:51:24 +00:00			`"TORCHELASTIC_RUN_ID": "1",`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`}`
			`environment = TorchElasticEnvironment()`
			`yield environment, variables, expected`


Error handling for `accelerator="mps"` and `ddp` strategy pairing (#16153) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Nikhil Shenoy <nikhilshenoy@dhcp-128-189-225-81.ubcsecure.wireless.ubc.ca> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Fixes https://github.com/Lightning-AI/lightning/issues/16148 2023-01-12 13:10:11 +00:00			`@RunIf(mps=False)`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`@pytest.mark.parametrize(`
Rename training plugin test files & names to strategy (#11303) 2022-01-04 13:32:45 +00:00			`"strategy_cls",`
Remove the FairScale integration (#16400) Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2023-01-23 13:39:04 +00:00			`[DDPStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))],`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`)`
tests: switch imports for pytorch (#16595) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2023-02-02 10:06:45 +00:00			`@mock.patch("lightning.pytorch.accelerators.cuda.CUDAAccelerator.is_available", return_value=True)`
Error handling for `accelerator="mps"` and `ddp` strategy pairing (#16153) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Nikhil Shenoy <nikhilshenoy@dhcp-128-189-225-81.ubcsecure.wireless.ubc.ca> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Fixes https://github.com/Lightning-AI/lightning/issues/16148 2023-01-12 13:10:11 +00:00			`def test_ranks_available_manual_strategy_selection(_, strategy_cls):`
Replace `yapf` with `black` (#7783) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-26 11:37:35 +00:00			`"""Test that the rank information is readily available after Trainer initialization."""`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`num_nodes = 2`
Remove the FairScale integration (#16400) Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2023-01-23 13:39:04 +00:00			`for cluster, variables, expected in environment_combinations():`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`with mock.patch.dict(os.environ, variables):`
Remove the FairScale integration (#16400) Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2023-01-23 13:39:04 +00:00			`strategy = strategy_cls(`
			`parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster`
			`)`
Rename training plugin test files & names to strategy (#11303) 2022-01-04 13:32:45 +00:00			`trainer = Trainer(strategy=strategy, num_nodes=num_nodes)`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`assert rank_zero_only.rank == expected["global_rank"]`
			`assert trainer.global_rank == expected["global_rank"]`
			`assert trainer.local_rank == expected["local_rank"]`
			`assert trainer.node_rank == expected["node_rank"]`
			`assert trainer.world_size == expected["world_size"]`


			`@pytest.mark.parametrize(`
Set `num_nodes` and `sync_batchnorm` From Trainer for Manually Passed Training Type Plugin (#7026) Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-05-08 11:25:51 +00:00			`"trainer_kwargs",`
			`[`
ruff: fixing flake8-comprehensions (#17385) 2023-04-21 09:07:58 +00:00			`{"strategy": "ddp", "accelerator": "cpu", "devices": 2},`
			`{"strategy": "ddp_spawn", "accelerator": "cpu", "devices": 2},`
			`pytest.param({"strategy": "ddp", "accelerator": "gpu", "devices": [1, 2]}, marks=RunIf(mps=False)),`
			`pytest.param({"strategy": "ddp_spawn", "accelerator": "gpu", "devices": [1, 2]}, marks=RunIf(mps=False)),`
Set `num_nodes` and `sync_batchnorm` From Trainer for Manually Passed Training Type Plugin (#7026) Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-05-08 11:25:51 +00:00			`],`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`)`
Error handling for `accelerator="mps"` and `ddp` strategy pairing (#16153) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Nikhil Shenoy <nikhilshenoy@dhcp-128-189-225-81.ubcsecure.wireless.ubc.ca> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Fixes https://github.com/Lightning-AI/lightning/issues/16148 2023-01-12 13:10:11 +00:00			`def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwargs):`
Replace `yapf` with `black` (#7783) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-26 11:37:35 +00:00			`"""Test that the rank information is readily available after Trainer initialization."""`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`num_nodes = 2`
			`trainer_kwargs.update(num_nodes=num_nodes)`

Remove the FairScale integration (#16400) Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2023-01-23 13:39:04 +00:00			`for cluster, variables, expected in environment_combinations():`
(1/n) tests: Use strategy flag instead of accelerator for training strategies (#9931) Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> 2021-10-16 15:10:25 +00:00			`if trainer_kwargs["strategy"] == "ddp_spawn":`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`if isinstance(cluster, (SLURMEnvironment, TorchElasticEnvironment)):`
Rename training plugin test files & names to strategy (#11303) 2022-01-04 13:32:45 +00:00			`# slurm and torchelastic do not work with spawn strategies`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`continue`
			`# when using spawn, we don't reach rank > 0 until we call Trainer.fit()`
Merge DDPStrategy and DDPSpawnStrategy in PL (#16809) 2023-02-27 19:43:23 +00:00			`# LOCAL_RANK is only set after we spawned`
			`if "LOCAL_RANK" not in variables:`
			`expected.update(global_rank=(expected["node_rank"] * 2), local_rank=0)`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00
			`with mock.patch.dict(os.environ, variables):`
Remove the FairScale integration (#16400) Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2023-01-23 13:39:04 +00:00			`trainer = Trainer(**trainer_kwargs)`
Deprecate Trainer.training_type_plugin in favor of trainer.strategy (#11141) Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> 2021-12-22 02:11:43 +00:00			`assert type(trainer.strategy.cluster_environment) is type(cluster)`
Clean up environment access in plugins (#6941) Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 2021-04-13 18:07:40 +00:00			`assert rank_zero_only.rank == expected["global_rank"]`
			`assert trainer.global_rank == expected["global_rank"]`
			`assert trainer.local_rank == expected["local_rank"]`
			`assert trainer.node_rank == expected["node_rank"]`
			`assert trainer.world_size == expected["world_size"]`