94 lines
3.2 KiB
Python
94 lines
3.2 KiB
Python
# Copyright The Lightning AI team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import logging
|
|
import os
|
|
import re
|
|
from unittest import mock
|
|
|
|
import pytest
|
|
from lightning.fabric.plugins.environments import TorchElasticEnvironment
|
|
|
|
|
|
@mock.patch.dict(os.environ, {}, clear=True)
def test_default_attributes():
    """Check the values the environment reports when the process environment is empty."""
    environment = TorchElasticEnvironment()

    assert environment.creates_processes_externally
    assert environment.main_address == "127.0.0.1"
    assert environment.main_port == 12910

    # the world size has to come from an environment variable, so the lookup fails here
    with pytest.raises(KeyError):
        environment.world_size()

    # the local rank has to come from an environment variable as well
    with pytest.raises(KeyError):
        environment.local_rank()

    assert environment.node_rank() == 0
|
|
|
|
|
|
@mock.patch.dict(
    os.environ,
    {
        "MASTER_ADDR": "1.2.3.4",
        "MASTER_PORT": "500",
        "WORLD_SIZE": "20",
        "RANK": "1",
        "LOCAL_RANK": "2",
        "GROUP_RANK": "3",
    },
)
def test_attributes_from_environment_variables(caplog):
    """Check that every attribute of the torchelastic environment mirrors the patched environment variables."""
    logger_name = "lightning.fabric.plugins.environments"
    environment = TorchElasticEnvironment()

    # values read straight from the patched variables
    assert environment.main_address == "1.2.3.4"
    assert environment.main_port == 500
    assert environment.world_size() == 20
    assert environment.global_rank() == 1
    assert environment.local_rank() == 2
    assert environment.node_rank() == 3

    # the global-rank setter must leave the value untouched and only log a message
    with caplog.at_level(logging.DEBUG, logger=logger_name):
        environment.set_global_rank(100)
    assert environment.global_rank() == 1
    assert "setting global rank is not allowed" in caplog.text

    # drop the captured records so the next check only sees the world-size message
    caplog.clear()

    # same expectation for the world-size setter
    with caplog.at_level(logging.DEBUG, logger=logger_name):
        environment.set_world_size(100)
    assert environment.world_size() == 20
    assert "setting world size is not allowed" in caplog.text
|
|
|
|
|
|
def test_detect():
    """Check when a torchelastic environment configuration is reported as detected."""
    # a completely empty environment must not be detected
    with mock.patch.dict(os.environ, {}, clear=True):
        detected_when_empty = TorchElasticEnvironment.detect()
        assert not detected_when_empty

    # with TORCHELASTIC_RUN_ID set (here to an empty string) detection is expected to succeed
    with mock.patch.dict(os.environ, {"TORCHELASTIC_RUN_ID": ""}):
        detected_with_run_id = TorchElasticEnvironment.detect()
        assert detected_with_run_id
|
|
|
|
|
|
@mock.patch.dict(os.environ, {"WORLD_SIZE": "8"})
def test_validate_user_settings():
    """Check that device/node counts chosen in Fabric/Trainer are validated against the world size."""
    environment = TorchElasticEnvironment()

    # 4 devices on 2 nodes gives 8, the patched world size, so this call must not raise
    environment.validate_settings(num_devices=4, num_nodes=2)

    # 2 devices on 2 nodes gives 4, which disagrees with the world size of 8
    mismatch_pattern = re.escape("the product (2 * 2) does not match the world size (8)")
    with pytest.raises(ValueError, match=mismatch_pattern):
        environment.validate_settings(num_devices=2, num_nodes=2)
|