lightning/tests/tests_fabric/accelerators/test_cuda.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

164 lines
6.0 KiB
Python
Raw Normal View History

# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import logging
import os
from re import escape
from unittest import mock
from unittest.mock import Mock
ruff: replace isort with ruff +TPU (#17684) * ruff: replace isort with ruff * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing & imports * lines in warning test * docs * fix enum import * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing * import * fix lines * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * type ClusterEnvironment * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-09-26 15:54:55 +00:00
import lightning.fabric
import pytest
import torch
from lightning.fabric.accelerators.cuda import (
CUDAAccelerator,
ruff: replace isort with ruff +TPU (#17684) * ruff: replace isort with ruff * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing & imports * lines in warning test * docs * fix enum import * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing * import * fix lines * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * type ClusterEnvironment * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-09-26 15:54:55 +00:00
_check_cuda_matmul_precision,
find_usable_cuda_devices,
is_cuda_available,
num_cuda_devices,
)
ruff: replace isort with ruff +TPU (#17684) * ruff: replace isort with ruff * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing & imports * lines in warning test * docs * fix enum import * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing * import * fix lines * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * type ClusterEnvironment * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-09-26 15:54:55 +00:00
from tests_fabric.helpers.runif import RunIf
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_auto_device_count(_):
    """Check that `auto_device_count` returns 2 when the patched `num_cuda_devices` returns 2."""
    detected = CUDAAccelerator.auto_device_count()
    assert detected == 2
@RunIf(min_cuda_gpus=1)
def test_gpu_availability():
    """Check that the CUDA accelerator reports itself as available (gated by `RunIf(min_cuda_gpus=1)`)."""
    available = CUDAAccelerator.is_available()
    assert available
def test_init_device_with_wrong_device_type():
    """Check that `setup_device` rejects a CPU device with a `ValueError`."""
    cpu_device = torch.device("cpu")
    with pytest.raises(ValueError, match="Device should be CUDA"):
        CUDAAccelerator().setup_device(cpu_device)
@pytest.mark.parametrize(
    ("devices", "expected"),
    [
        ([], []),
        ([1], [torch.device("cuda", 1)]),
        ([3, 1], [torch.device("cuda", 3), torch.device("cuda", 1)]),
    ],
)
def test_get_parallel_devices(devices, expected):
    """Check that `get_parallel_devices` maps each index to the matching CUDA device, preserving order."""
    resolved = CUDAAccelerator.get_parallel_devices(devices)
    assert resolved == expected
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.get_device_capability", return_value=(7, 0))
def test_set_cuda_device(_, set_device_mock):
    """Check that `setup_device` calls `torch.cuda.set_device` exactly once with the given device."""
    target = torch.device("cuda", 1)
    accelerator = CUDAAccelerator()
    accelerator.setup_device(target)
    set_device_mock.assert_called_once_with(target)
@mock.patch("lightning.fabric.accelerators.cuda._device_count_nvml", return_value=-1)
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=100)
def test_num_cuda_devices_without_nvml(*_):
    """Test that if NVML can't be loaded, our helper functions fall back to the default implementation for determining
    CUDA availability."""
    # start from an empty cache so that the patched functions are actually consulted
    num_cuda_devices.cache_clear()
    try:
        assert is_cuda_available()
        assert num_cuda_devices() == 100
    finally:
        # always drop the mocked count, even when an assertion fails, so it cannot leak into other tests
        num_cuda_devices.cache_clear()
@mock.patch.dict(os.environ, {}, clear=True)
def test_force_nvml_based_cuda_check():
    """Test that we force PyTorch to use the NVML-based CUDA checks."""
    # reevaluate top-level code, without becoming a different object
    importlib.reload(lightning.fabric)
    nvml_flag = os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"]
    assert nvml_flag == "1"
@mock.patch("torch.cuda.get_device_capability", return_value=(10, 1))
@mock.patch("torch.cuda.get_device_name", return_value="Z100")
@mock.patch("torch.cuda.is_available", return_value=True)
def test_tf32_message(_, __, ___, caplog, monkeypatch):
    """Test when `_check_cuda_matmul_precision` logs its Tensor Cores message.

    The message is expected while the float32 matmul precision is "highest" and is expected to be absent once TF32
    is enabled, either through `torch.backends.cuda.matmul.allow_tf32` or `torch.set_float32_matmul_precision`.
    Repeated calls for the same device must not log the message again.

    The test mutates process-wide torch settings, so the original precision is restored and the check's cache is
    cleared in a `finally` block, even if an assertion fails midway.

    """
    # for some reason, caplog doesn't work with our rank_zero_info utilities
    monkeypatch.setattr(lightning.fabric.accelerators.cuda, "rank_zero_info", logging.info)
    device = Mock()
    expected = "Z100') that has Tensor Cores"
    # remember the global setting so it can be put back no matter how the test ends
    original_precision = torch.get_float32_matmul_precision()
    try:
        assert torch.get_float32_matmul_precision() == "highest"  # default in torch
        with caplog.at_level(logging.INFO):
            _check_cuda_matmul_precision(device)
        assert expected in caplog.text

        _check_cuda_matmul_precision.cache_clear()
        caplog.clear()
        torch.backends.cuda.matmul.allow_tf32 = True  # changing this changes the string
        assert torch.get_float32_matmul_precision() == "high"
        with caplog.at_level(logging.INFO):
            _check_cuda_matmul_precision(device)
        assert not caplog.text

        _check_cuda_matmul_precision.cache_clear()
        caplog.clear()
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.set_float32_matmul_precision("medium")  # also the other way around
        assert torch.backends.cuda.matmul.allow_tf32
        with caplog.at_level(logging.INFO):
            _check_cuda_matmul_precision(device)
        assert not caplog.text

        _check_cuda_matmul_precision.cache_clear()
        torch.set_float32_matmul_precision("highest")  # can be reverted
        with caplog.at_level(logging.INFO):
            _check_cuda_matmul_precision(device)
        assert expected in caplog.text

        # subsequent calls don't produce more messages
        caplog.clear()
        with caplog.at_level(logging.INFO):
            _check_cuda_matmul_precision(device)
        assert expected not in caplog.text
    finally:
        # undo the global precision change and forget cached results so other tests start clean
        torch.set_float32_matmul_precision(original_precision)
        _check_cuda_matmul_precision.cache_clear()
def test_find_usable_cuda_devices_error_handling():
    """Test error handling for edge cases when using `find_usable_cuda_devices`."""
    # Asking for GPUs if no GPUs visible
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0):
        with pytest.raises(ValueError, match="You requested to find 2 devices but there are no visible CUDA"):
            find_usable_cuda_devices(2)

    # Asking for more GPUs than are visible
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1):
        with pytest.raises(ValueError, match="this machine only has 1 GPUs"):
            find_usable_cuda_devices(2)

    # All GPUs are unusable
    failing_tensor = Mock(side_effect=RuntimeError)  # simulate device placement fails
    occupied_message = escape("The devices [0, 1] are occupied by other processes")
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2):
        with mock.patch("lightning.fabric.accelerators.cuda.torch.tensor", failing_tensor):
            with pytest.raises(RuntimeError, match=occupied_message):
                find_usable_cuda_devices(2)

    # Request for as many GPUs as there are, no error should be raised
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=5):
        with mock.patch("lightning.fabric.accelerators.cuda.torch.tensor"):
            usable = find_usable_cuda_devices(-1)
            assert usable == [0, 1, 2, 3, 4]

    # Edge case
    assert find_usable_cuda_devices(0) == []