[fix] Better support for rank_zero_only setting for SLURM and torchelastic (#6802)
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
commit 86e1d9f759 (parent a2c605785a)
@@ -176,6 +176,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

+- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
+
+
 - Sanitize `None` params during pruning ([#6836](https://github.com/PyTorchLightning/pytorch-lightning/pull/6836))

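The changelog entry above corresponds to the change in `pytorch_lightning/utilities/distributed.py` shown next: previously only `LOCAL_RANK` was consulted when initializing `rank_zero_only.rank`, which is incorrect under SLURM (which exports the global rank as `SLURM_PROCID`) and torchelastic (which exports `RANK`). A minimal sketch of the before/after behavior, using illustrative environment values:

import os

# Illustrative only: simulate one process of a multi-node SLURM job that is
# global rank 4 but local rank 0 on its own node.
os.environ["SLURM_PROCID"] = "4"
os.environ["LOCAL_RANK"] = "0"

# Old default: only LOCAL_RANK was consulted, so the first process on *every*
# node believed it was the global rank zero.
old_rank = int(os.environ.get("LOCAL_RANK", 0))
assert old_rank == 0  # wrong: this process is not the global rank zero

# New default: RANK (torchelastic) and SLURM_PROCID are checked first.
new_rank = next(
    (int(os.environ[key]) for key in ("RANK", "SLURM_PROCID", "LOCAL_RANK") if key in os.environ),
    0,
)
assert new_rank == 4  # correct global rank
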
@@ -44,8 +44,18 @@ def rank_zero_only(fn):
     return wrapped_fn


+# TODO: this should be part of the cluster environment
+def _get_rank() -> int:
+    rank_keys = ('RANK', 'SLURM_PROCID', 'LOCAL_RANK')
+    for key in rank_keys:
+        rank = os.environ.get(key)
+        if rank is not None:
+            return int(rank)
+    return 0
+
+
 # add the attribute to the function but don't overwrite in case Trainer has already set it
-rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))
+rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank())


 def _warn(*args, **kwargs):

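For context, `rank_zero_only` wraps a function so that its body executes only when `rank_zero_only.rank == 0`; on all other ranks the call returns `None`. With `_get_rank` above, that rank now falls back through `RANK` (torchelastic), `SLURM_PROCID` (SLURM), and finally `LOCAL_RANK`. A usage sketch; the `log_once` helper is hypothetical, not part of the library:

from pytorch_lightning.utilities.distributed import rank_zero_only

@rank_zero_only
def log_once(message: str) -> None:
    # Hypothetical helper: the body runs only on the global rank-zero process.
    print(message)

# Under `srun`, SLURM exports a distinct SLURM_PROCID per task, so after this
# change only the task with SLURM_PROCID=0 prints; all other tasks get None.
log_once("training started")
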
@@ -0,0 +1,56 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Mapping
+from unittest import mock
+
+import pytest
+
+
+@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"SLURM_PROCID": "0"}])
+def test_rank_zero_known_cluster_envs(env_vars: Mapping[str, str]):
+    """ Test that SLURM environment variables are properly checked for rank_zero_only. """
+    from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
+    rank_zero_only.rank = _get_rank()
+
+    with mock.patch.dict(os.environ, env_vars):
+        from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
+        rank_zero_only.rank = _get_rank()
+
+        @rank_zero_only
+        def foo():  # The return type is optional because on non-zero ranks it will not be called
+            return 1
+
+        x = foo()
+        assert x == 1
+
+
+@pytest.mark.parametrize("rank_key,rank", [
+    ("RANK", "1"),
+    ("SLURM_PROCID", "2"),
+    ("LOCAL_RANK", "3"),
+])
+def test_rank_zero_none_set(rank_key, rank):
+    """ Test that function is not called when rank environment variables are not global zero. """
+
+    with mock.patch.dict(os.environ, {rank_key: rank}):
+        from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
+        rank_zero_only.rank = _get_rank()
+
+        @rank_zero_only
+        def foo():
+            return 1
+
+        x = foo()
+        assert x is None