From 8274183bf27a9e90508cf064707f5908370ccb2d Mon Sep 17 00:00:00 2001 From: Isaac Date: Tue, 3 Aug 2021 16:18:51 +0800 Subject: [PATCH] Add check for unique device ids (#8666) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- CHANGELOG.md | 3 +++ pytorch_lightning/utilities/device_parser.py | 21 +++++++++++++++++++- tests/models/test_gpu.py | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0d5ac7caf..ff4a53ed8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `state_id` property to the `Callback` base class ([#6886](https://github.com/PyTorchLightning/pytorch-lightning/pull/6886)) +- Added check for unique GPU ids ([#8666](https://github.com/PyTorchLightning/pytorch-lightning/pull/8666)) + + - Added `ResultCollection` state_dict to Loop `state_dict` and support for distributed reload. ([#8641](https://github.com/PyTorchLightning/pytorch-lightning/pull/8641)) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index a47bf6ca47..fb4fa2965f 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -57,7 +57,7 @@ def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[i Args: gpus: An int -1 or string '-1' indicate that all available GPUs should be used. - A list of ints or a string containing list of comma separated integers + A list of unique ints or a string containing list of comma separated unique integers indicates specific GPUs to use. An int 0 means that no GPUs should be used. Any int N > 0 indicates that GPUs [0..N) should be used. @@ -88,6 +88,10 @@ def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[i if TorchElasticEnvironment.is_using_torchelastic() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1: # omit sanity check on torchelastic as by default shows one visible GPU per process return gpus + + # Check that gpus are unique. Duplicate gpus are not supported by the backend. + _check_unique(gpus) + return _sanitize_gpu_ids(gpus) @@ -188,6 +192,21 @@ def _get_all_available_gpus() -> List[int]: return list(range(torch.cuda.device_count())) +def _check_unique(device_ids: List[int]) -> None: + """ + Checks that the device_ids are unique. + + Args: + device_ids: list of ints corresponding to gpus indices + + Raises: + MisconfigurationException: + If ``device_ids`` of GPUs aren't unique + """ + if len(device_ids) != len(set(device_ids)): + raise MisconfigurationException("Device ID's (GPU) must be unique.") + + def _check_data_type(device_ids: Any) -> None: """ Checks that the device_ids argument is one of: None, Int, String or List. diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b954f28023..ec8f8b3b1f 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -217,6 +217,7 @@ def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): pytest.param([-1]), pytest.param([None]), pytest.param(["0"]), + pytest.param([0, 0]), ], ) def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus):