# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unit tests for the CUDA accelerator helpers in ``lightning.fabric.accelerators.cuda``."""
import importlib
import logging
import os
from re import escape
from unittest import mock
from unittest.mock import Mock

import lightning.fabric
import pytest
import torch
from lightning.fabric.accelerators.cuda import (
    CUDAAccelerator,
    _check_cuda_matmul_precision,
    find_usable_cuda_devices,
    is_cuda_available,
    num_cuda_devices,
)

from tests_fabric.helpers.runif import RunIf


@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_auto_device_count(_):
    """``auto_device_count`` should report whatever ``num_cuda_devices`` returns."""
    assert CUDAAccelerator.auto_device_count() == 2


@RunIf(min_cuda_gpus=1)
def test_gpu_availability():
    """On a machine with at least one GPU, the accelerator must report availability."""
    assert CUDAAccelerator.is_available()


def test_init_device_with_wrong_device_type():
    """Passing a non-CUDA device to ``setup_device`` must raise a ``ValueError``."""
    with pytest.raises(ValueError, match="Device should be CUDA"):
        CUDAAccelerator().setup_device(torch.device("cpu"))


@pytest.mark.parametrize(
    ("devices", "expected"),
    [
        ([], []),
        ([1], [torch.device("cuda", 1)]),
        # order of the requested indices must be preserved
        ([3, 1], [torch.device("cuda", 3), torch.device("cuda", 1)]),
    ],
)
def test_get_parallel_devices(devices, expected):
    """Device indices are converted 1:1 (and in order) into ``torch.device("cuda", i)`` objects."""
    assert CUDAAccelerator.get_parallel_devices(devices) == expected


@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.get_device_capability", return_value=(7, 0))
def test_set_cuda_device(_, set_device_mock):
    """``setup_device`` should make the given device current via ``torch.cuda.set_device``."""
    device = torch.device("cuda", 1)
    CUDAAccelerator().setup_device(device)
    set_device_mock.assert_called_once_with(device)


# _device_count_nvml returning -1 signals that NVML could not be loaded
@mock.patch("lightning.fabric.accelerators.cuda._device_count_nvml", return_value=-1)
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=100)
def test_num_cuda_devices_without_nvml(*_):
    """Test that if NVML can't be loaded, our helper functions fall back to the default implementation for determining CUDA availability."""
    # clear the lru_cache so the mocked values are actually consulted
    num_cuda_devices.cache_clear()
    assert is_cuda_available()
    assert num_cuda_devices() == 100
    # clear again so the mocked result doesn't leak into other tests
    num_cuda_devices.cache_clear()


@mock.patch.dict(os.environ, {}, clear=True)
def test_force_nvml_based_cuda_check():
    """Test that we force PyTorch to use the NVML-based CUDA checks."""
    importlib.reload(lightning.fabric)  # reevaluate top-level code, without becoming a different object
    assert os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] == "1"


# capability (10, 1) is above the (8, 0)/Tensor Core threshold, so the hint should fire
@mock.patch("torch.cuda.get_device_capability", return_value=(10, 1))
@mock.patch("torch.cuda.get_device_name", return_value="Z100")
def test_tf32_message(_, __, caplog, monkeypatch):
    """The TF32 hint is logged once for Tensor-Core GPUs and only while precision is 'highest'."""
    # for some reason, caplog doesn't work with our rank_zero_info utilities
    monkeypatch.setattr(lightning.fabric.accelerators.cuda, "rank_zero_info", logging.info)
    device = Mock()
    # fragment of the expected log message, built from the mocked device name above
    expected = "Z100') that has Tensor Cores"
    assert torch.get_float32_matmul_precision() == "highest"  # default in torch
    with caplog.at_level(logging.INFO):
        _check_cuda_matmul_precision(device)
    assert expected in caplog.text
    # the check is cached per device; clear it so the next call re-evaluates
    _check_cuda_matmul_precision.cache_clear()

    caplog.clear()
    torch.backends.cuda.matmul.allow_tf32 = True  # changing this changes the string
    assert torch.get_float32_matmul_precision() == "high"
    with caplog.at_level(logging.INFO):
        _check_cuda_matmul_precision(device)
    # TF32 already enabled -> no hint expected
    assert not caplog.text
    _check_cuda_matmul_precision.cache_clear()

    caplog.clear()
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.set_float32_matmul_precision("medium")  # also the other way around
    # setting the precision flips allow_tf32 back on
    assert torch.backends.cuda.matmul.allow_tf32
    with caplog.at_level(logging.INFO):
        _check_cuda_matmul_precision(device)
    assert not caplog.text
    _check_cuda_matmul_precision.cache_clear()

    torch.set_float32_matmul_precision("highest")  # can be reverted
    with caplog.at_level(logging.INFO):
        _check_cuda_matmul_precision(device)
    assert expected in caplog.text

    # subsequent calls don't produce more messages
    caplog.clear()
    with caplog.at_level(logging.INFO):
        _check_cuda_matmul_precision(device)
    assert expected not in caplog.text
    _check_cuda_matmul_precision.cache_clear()


def test_find_usable_cuda_devices_error_handling():
    """Test error handling for edge cases when using `find_usable_cuda_devices`."""
    # Asking for GPUs if no GPUs visible
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0), pytest.raises(
        ValueError, match="You requested to find 2 devices but there are no visible CUDA"
    ):
        find_usable_cuda_devices(2)

    # Asking for more GPUs than are visible
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1), pytest.raises(
        ValueError, match="this machine only has 1 GPUs"
    ):
        find_usable_cuda_devices(2)

    # All GPUs are unusable
    tensor_mock = Mock(side_effect=RuntimeError)  # simulate device placement fails
    # the match string contains regex metacharacters ([0, 1]), hence re.escape
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2), mock.patch(
        "lightning.fabric.accelerators.cuda.torch.tensor", tensor_mock
    ), pytest.raises(RuntimeError, match=escape("The devices [0, 1] are occupied by other processes")):
        find_usable_cuda_devices(2)

    # Request for as many GPUs as there are, no error should be raised
    with mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=5), mock.patch(
        "lightning.fabric.accelerators.cuda.torch.tensor"
    ):
        # -1 means "all visible devices"
        assert find_usable_cuda_devices(-1) == [0, 1, 2, 3, 4]

    # Edge case
    assert find_usable_cuda_devices(0) == []