fix: `nb` is set to the total number of devices when `nb` is -1 (#4209)

* fix: `nb` is set to the total number of devices when `nb` is -1

Refs: #4207

* feat: add test code
    1. test the combination of the `auto_select_gpus` and `gpus` options via the `Trainer`
    2. test the `pick_multiple_gpus` function directly

Refs: #4207

* docs: update the `Select GPU devices` section

Refs: #4207

* refactor: reflect the result of review

Refs: #4207

* refactor: reflect the result of review

Refs: #4207

* Update CHANGELOG.md

Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: Roger Shieh <55400948+s-rog@users.noreply.github.com>
Co-authored-by: Nicki Skafte <skaftenicki@gmail.com>
Martin Hwang 2020-10-29 18:50:37 +09:00 committed by GitHub
parent ce261e4afe
commit b459fd26ac
6 changed files with 99 additions and 0 deletions


@@ -66,12 +66,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209))
- Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297))
- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341))

## [1.0.3] - 2020-10-20

### Added
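For context, a minimal sketch of what the `gpus=-1` entry above fixes (the comments describe the new semantics; actual device selection depends on the machine):

    from pytorch_lightning import Trainer

    # Before this fix, gpus=-1 ("use all devices") combined with
    # auto_select_gpus=True raised an error, because -1 was fed directly
    # into the GPU-picking loop. It is now resolved to
    # torch.cuda.device_count() first, so all unoccupied GPUs are picked.
    trainer = Trainer(gpus=-1, auto_select_gpus=True)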


@@ -206,6 +206,8 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`.
`auto_select_gpus=True` will automatically help you find `k` gpus that are not
occupied by other processes. This is especially useful when GPUs are configured
to be in "exclusive mode", such that only one process at a time can access them.
For more details see the :ref:`Trainer guide <trainer>`.
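For illustration, a minimal sketch (mirroring the example added to the Trainer docs in this same commit)::

    # find two GPUs that no other process is using and train on them
    trainer = Trainer(gpus=2, auto_select_gpus=True)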
Remove CUDA flags
^^^^^^^^^^^^^^^^^


@@ -381,6 +381,12 @@ Example::
    # enable auto selection (will find two available gpus on system)
    trainer = Trainer(gpus=2, auto_select_gpus=True)
    # specifies all GPUs regardless of their availability
    Trainer(gpus=-1, auto_select_gpus=False)

    # specifies all available GPUs (if only one GPU is not occupied, uses that one)
    Trainer(gpus=-1, auto_select_gpus=True)
auto_lr_find
^^^^^^^^^^^^


@@ -13,8 +13,18 @@
# limitations under the License.
import torch
from pytorch_lightning.utilities.exceptions import MisconfigurationException
def pick_multiple_gpus(nb):
    if nb == 0:
        raise MisconfigurationException(
            r"auto_select_gpus=True, gpus=0 is not a valid configuration.\
            Please select a valid number of GPU resources when using auto_select_gpus."
        )

    nb = torch.cuda.device_count() if nb == -1 else nb

    picked = []
    for _ in range(nb):
        picked.append(pick_single_gpu(exclude_gpus=picked))
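A quick sketch of the resulting behavior (hypothetical 2-GPU machine; the expected index lists mirror the parametrized test added below, and actual picks depend on which GPUs are free):

    # on an otherwise idle 2-GPU machine:
    pick_multiple_gpus(-1)  # -> [0, 1]; -1 now resolves to torch.cuda.device_count()
    pick_multiple_gpus(1)   # -> [0]
    pick_multiple_gpus(0)   # raises MisconfigurationException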

tests/tuner/__init__.py (new, empty file)


@@ -0,0 +1,74 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

import pytest
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@pytest.mark.skipif(
    torch.cuda.device_count() < 2, reason="test requires a multi-GPU machine"
)
@pytest.mark.parametrize(
    ["auto_select_gpus", "gpus", "expected_error"],
    [
        (True, 0, MisconfigurationException),
        (True, -1, None),
        (False, 0, None),
        (False, -1, None),
    ],
)
def test_trainer_with_gpus_options_combination_at_available_gpus_env(
    auto_select_gpus, gpus, expected_error
):
    if expected_error:
        with pytest.raises(
            expected_error,
            match=re.escape(
                r"auto_select_gpus=True, gpus=0 is not a valid configuration.\
                Please select a valid number of GPU resources when using auto_select_gpus."
            ),
        ):
            Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
    else:
        trainer = Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
@pytest.mark.skipif(
    torch.cuda.device_count() < 2, reason="test requires a multi-GPU machine"
)
@pytest.mark.parametrize(
    ["nb", "expected_gpu_idxs", "expected_error"],
    [
        (0, [], MisconfigurationException),
        (-1, list(range(torch.cuda.device_count())), None),
        (1, [0], None),
    ],
)
def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error):
    if expected_error:
        with pytest.raises(
            expected_error,
            match=re.escape(
                r"auto_select_gpus=True, gpus=0 is not a valid configuration.\
                Please select a valid number of GPU resources when using auto_select_gpus."
            ),
        ):
            pick_multiple_gpus(nb)
    else:
        assert expected_gpu_idxs == pick_multiple_gpus(nb)