# lightning/tests/tests_fabric/accelerators/test_cuda.py

# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import logging
import os
from re import escape
from unittest import mock
from unittest.mock import Mock

import lightning.fabric
import pytest
import torch
from lightning.fabric.accelerators.cuda import (
    CUDAAccelerator,
    _check_cuda_matmul_precision,
    find_usable_cuda_devices,
    is_cuda_available,
    num_cuda_devices,
)

from tests_fabric.helpers.runif import RunIf


@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_auto_device_count(_):
    assert CUDAAccelerator.auto_device_count() == 2


@RunIf(min_cuda_gpus=1)
def test_gpu_availability():
    assert CUDAAccelerator.is_available()
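

# Complementary sketch, not part of the original suite: assuming `is_available`
# is derived from `num_cuda_devices()` (the same patch target the tests above
# use), a mocked count of zero should make the accelerator report unavailable.
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_gpu_unavailable_sketch(_):
    assert not CUDAAccelerator.is_available()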


def test_init_device_with_wrong_device_type():
    with pytest.raises(ValueError, match="Device should be CUDA"):
        CUDAAccelerator().setup_device(torch.device("cpu"))


@pytest.mark.parametrize(
    ("devices", "expected"),
    [
        ([], []),
        ([1], [torch.device("cuda", 1)]),
        ([3, 1], [torch.device("cuda", 3), torch.device("cuda", 1)]),
    ],
)
def test_get_parallel_devices(devices, expected):
    assert CUDAAccelerator.get_parallel_devices(devices) == expected
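

# Usage sketch, not part of the original suite: `auto_device_count` and
# `get_parallel_devices` compose naturally (assumption: feeding the auto count
# back in as indices is a supported pattern, mirroring the two tests above).
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_parallel_devices_from_auto_count_sketch(_):
    count = CUDAAccelerator.auto_device_count()
    devices = CUDAAccelerator.get_parallel_devices(list(range(count)))
    assert devices == [torch.device("cuda", 0), torch.device("cuda", 1)]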
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.get_device_capability", return_value=(7, 0))
def test_set_cuda_device(_, set_device_mock):
device = torch.device("cuda", 1)
CUDAAccelerator().setup_device(device)
set_device_mock.assert_called_once_with(device)
@mock.patch("lightning.fabric.accelerators.cuda._device_count_nvml", return_value=-1)
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=100)
def test_num_cuda_devices_without_nvml(*_):
"""Test that if NVML can't be loaded, our helper functions fall back to the default implementation for determining
CUDA availability."""
num_cuda_devices.cache_clear()
assert is_cuda_available()
assert num_cuda_devices() == 100
num_cuda_devices.cache_clear()
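

# Converse sketch of the fallback above, not part of the original suite
# (assumption: when NVML is usable, its count is preferred and the fork-unsafe
# `torch.cuda.device_count` is not consulted).
@mock.patch("lightning.fabric.accelerators.cuda._device_count_nvml", return_value=3)
def test_num_cuda_devices_with_nvml_sketch(_):
    num_cuda_devices.cache_clear()
    assert num_cuda_devices() == 3
    num_cuda_devices.cache_clear()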


@mock.patch.dict(os.environ, {}, clear=True)
def test_force_nvml_based_cuda_check():
    """Test that we force PyTorch to use the NVML-based CUDA checks."""
    importlib.reload(lightning.fabric)  # reevaluate top-level code, without becoming a different object
    assert os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] == "1"
@mock.patch("torch.cuda.get_device_capability", return_value=(10, 1))
@mock.patch("torch.cuda.get_device_name", return_value="Z100")
@mock.patch("torch.cuda.is_available", return_value=True)
def test_tf32_message(_, __, ___, caplog, monkeypatch):
# for some reason, caplog doesn't work with our rank_zero_info utilities
monkeypatch.setattr(lightning.fabric.accelerators.cuda, "rank_zero_info", logging.info)
device = Mock()
expected = "Z100') that has Tensor Cores"
assert torch.get_float32_matmul_precision() == "highest" # default in torch
with caplog.at_level(logging.INFO):
_check_cuda_matmul_precision(device)
assert expected in caplog.text
_check_cuda_matmul_precision.cache_clear()
caplog.clear()
torch.backends.cuda.matmul.allow_tf32 = True # changing this changes the string
assert torch.get_float32_matmul_precision() == "high"
with caplog.at_level(logging.INFO):
_check_cuda_matmul_precision(device)
assert not caplog.text
_check_cuda_matmul_precision.cache_clear()
caplog.clear()
torch.backends.cuda.matmul.allow_tf32 = False
torch.set_float32_matmul_precision("medium") # also the other way around
assert torch.backends.cuda.matmul.allow_tf32
with caplog.at_level(logging.INFO):
_check_cuda_matmul_precision(device)
assert not caplog.text
_check_cuda_matmul_precision.cache_clear()
torch.set_float32_matmul_precision("highest") # can be reverted
with caplog.at_level(logging.INFO):
_check_cuda_matmul_precision(device)
assert expected in caplog.text
# subsequent calls don't produce more messages
caplog.clear()
with caplog.at_level(logging.INFO):
_check_cuda_matmul_precision(device)
assert expected not in caplog.text
_check_cuda_matmul_precision.cache_clear()
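

# Standalone sketch of the precision/flag coupling the test above relies on,
# not part of the original suite (plain PyTorch API, nothing Lightning-specific):
def test_tf32_opt_in_sketch():
    torch.set_float32_matmul_precision("high")  # opt in to TF32 matmuls
    assert torch.backends.cuda.matmul.allow_tf32
    torch.set_float32_matmul_precision("highest")  # restore the torch default
    assert not torch.backends.cuda.matmul.allow_tf32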


def test_find_usable_cuda_devices_error_handling():
    """Test error handling for edge cases when using `find_usable_cuda_devices`."""
    # Asking for GPUs when no GPUs are visible
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0), pytest.raises(
        ValueError, match="You requested to find 2 devices but there are no visible CUDA"
    ):
        find_usable_cuda_devices(2)

    # Asking for more GPUs than are visible
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1), pytest.raises(
        ValueError, match="this machine only has 1 GPUs"
    ):
        find_usable_cuda_devices(2)

    # All GPUs are unusable
    tensor_mock = Mock(side_effect=RuntimeError)  # simulate that device placement fails
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2), mock.patch(
        "lightning.fabric.accelerators.cuda.torch.tensor", tensor_mock
    ), pytest.raises(RuntimeError, match=escape("The devices [0, 1] are occupied by other processes")):
        find_usable_cuda_devices(2)

    # Requesting as many GPUs as there are; no error should be raised
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=5), mock.patch(
        "lightning.fabric.accelerators.cuda.torch.tensor"
    ):
        assert find_usable_cuda_devices(-1) == [0, 1, 2, 3, 4]

    # Edge case: requesting zero devices yields an empty selection
    assert find_usable_cuda_devices(0) == []
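

# Happy-path sketch, not part of the original suite: with all devices free
# (device placement mocked out, as above), the first `n` indices come back in
# order. In application code the usual pairing, per the Lightning docs, is
# `Fabric(accelerator="cuda", devices=find_usable_cuda_devices(2))`.
def test_find_usable_cuda_devices_happy_path_sketch():
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=4), mock.patch(
        "lightning.fabric.accelerators.cuda.torch.tensor"
    ):
        assert find_usable_cuda_devices(2) == [0, 1]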