fix: set `nb` to the total number of devices when `nb` is -1 (#4209)

* fix: set `nb` to the total number of devices when `nb` is -1

Refs: #4207

* feat: add test code
     1. test combinations of the `auto_select_gpus` and `gpus` options
        using `Trainer`
     2. test the `pick_multiple_gpus` function directly

Refs: #4207

* docs: revise the `Select GPU devices` section

Refs: #4207

* refactor: address review feedback

Refs: #4207

* refactor: address review feedback

Refs: #4207

* Update CHANGELOG.md

Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: Roger Shieh <55400948+s-rog@users.noreply.github.com>
Co-authored-by: Nicki Skafte <skaftenicki@gmail.com>
Martin Hwang 2020-10-29 18:50:37 +09:00 committed by GitHub
parent ce261e4afe
commit b459fd26ac
6 changed files with 99 additions and 0 deletions
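
Context for the fix: before this patch, `pick_multiple_gpus(-1)` iterated over `range(-1)` and therefore picked no GPUs, producing the error reported in #4207. A minimal sketch of the combination this PR enables (assuming a machine with at least one free GPU):

    from pytorch_lightning import Trainer

    # gpus=-1 now expands to torch.cuda.device_count() before picking,
    # so auto-selection considers every device on the machine.
    trainer = Trainer(gpus=-1, auto_select_gpus=True)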

CHANGELOG.md

@@ -66,12 +66,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209))
- Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297))
- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341))

## [1.0.3] - 2020-10-20

### Added

docs/source/multi_gpu.rst

@@ -206,6 +206,8 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`.
`auto_select_gpus=True` will automatically help you find `k` gpus that are not
occupied by other processes. This is especially useful when GPUs are configured
to be in "exclusive mode", such that only one process at a time can access them.
For more details see the :ref:`Trainer guide <trainer>`.
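For example, an administrator typically enables this mode per device with ``nvidia-smi -c EXCLUSIVE_PROCESS``.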
Remove CUDA flags
^^^^^^^^^^^^^^^^^

docs/source/trainer.rst

@@ -381,6 +381,12 @@ Example::
# enable auto selection (will find two available gpus on system)
trainer = Trainer(gpus=2, auto_select_gpus=True)
# specifies all GPUs regardless of their availability
Trainer(gpus=-1, auto_select_gpus=False)

# specifies all available GPUs (if only one GPU is unoccupied, that one is used)
Trainer(gpus=-1, auto_select_gpus=True)
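
Under the hood (a sketch based on the tuner change in this PR), the last combination resolves its device list through `pick_multiple_gpus`:

    from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus

    # -1 first expands to torch.cuda.device_count(); free GPUs are then
    # picked greedily, so an idle two-GPU machine yields [0, 1].
    print(pick_multiple_gpus(-1))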
auto_lr_find
^^^^^^^^^^^^

pytorch_lightning/tuner/auto_gpu_select.py

@@ -13,8 +13,18 @@
# limitations under the License.
import torch

from pytorch_lightning.utilities.exceptions import MisconfigurationException


def pick_multiple_gpus(nb):
    if nb == 0:
        raise MisconfigurationException(
            "auto_select_gpus=True, gpus=0 is not a valid configuration."
            " Please select a valid number of GPU resources when using auto_select_gpus."
        )

    nb = torch.cuda.device_count() if nb == -1 else nb

    picked = []
    for _ in range(nb):
        picked.append(pick_single_gpu(exclude_gpus=picked))

    return picked
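
`pick_single_gpu` is defined later in this same module and is not part of this hunk. For context, a minimal sketch of its behavior (paraphrased, not the verbatim source): it probes each candidate device with a tiny allocation, which fails with a `RuntimeError` on occupied GPUs in exclusive mode.

    def pick_single_gpu(exclude_gpus: list):
        # Probe every visible GPU that has not been picked yet and
        # return the first one that accepts a small allocation.
        for i in range(torch.cuda.device_count()):
            if i in exclude_gpus:
                continue
            try:
                torch.ones(1).to(torch.device(f"cuda:{i}"))
            except RuntimeError:
                continue
            return i
        raise RuntimeError("No GPUs available.")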

tests/tuner/__init__.py (new, empty file)

tests/tuner/test_auto_gpu_select.py (new file)

@@ -0,0 +1,74 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

import pytest
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
from pytorch_lightning.utilities.exceptions import MisconfigurationException


@pytest.mark.skipif(
    torch.cuda.device_count() < 2, reason="test requires at least two GPUs"
)
@pytest.mark.parametrize(
    ["auto_select_gpus", "gpus", "expected_error"],
    [
        (True, 0, MisconfigurationException),
        (True, -1, None),
        (False, 0, None),
        (False, -1, None),
    ],
)
def test_trainer_with_gpus_options_combination_at_available_gpus_env(
    auto_select_gpus, gpus, expected_error
):
    if expected_error:
        with pytest.raises(
            expected_error,
            match=re.escape(
                "auto_select_gpus=True, gpus=0 is not a valid configuration."
                " Please select a valid number of GPU resources when using auto_select_gpus."
            ),
        ):
            Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
    else:
        Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)


@pytest.mark.skipif(
    torch.cuda.device_count() < 2, reason="test requires at least two GPUs"
)
@pytest.mark.parametrize(
    ["nb", "expected_gpu_idxs", "expected_error"],
    [
        (0, [], MisconfigurationException),
        (-1, list(range(torch.cuda.device_count())), None),
        (1, [0], None),
    ],
)
def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error):
    if expected_error:
        with pytest.raises(
            expected_error,
            match=re.escape(
                "auto_select_gpus=True, gpus=0 is not a valid configuration."
                " Please select a valid number of GPU resources when using auto_select_gpus."
            ),
        ):
            pick_multiple_gpus(nb)
    else:
        assert expected_gpu_idxs == pick_multiple_gpus(nb)
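
Both tests are guarded by the `skipif` markers above, so they are skipped on machines with fewer than two CUDA devices; on a multi-GPU machine they can be run directly with, e.g., `pytest tests/tuner -v`.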