Fix tests failing on a single GPU (#11753)
Co-authored-by: akihiro@grid.ai <akihiro@grid.ai>
Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com>
This commit is contained in:
parent
c233731b7c
commit
568710f2f2
|
@ -80,7 +80,7 @@ def test_quantization(tmpdir, observe: str, fuse: bool, convert: bool):
|
|||
# todo: make it work also with strict loading
|
||||
qmodel2 = RegressionModel.load_from_checkpoint(model_path, strict=False)
|
||||
quant2_score = torch.mean(torch.tensor([mape(qmodel2(x), y) for x, y in dm.test_dataloader()]))
|
||||
assert torch.allclose(org_score, quant2_score, atol=0.45)
|
||||
assert torch.allclose(org_score, quant2_score, atol=0.47)
|
||||
|
||||
# test without and with QAT callback
|
||||
trainer_args.update(max_epochs=curr_epoch + 1)
|
||||
|
|
|
@ -31,6 +31,7 @@ from pytorch_lightning.utilities import (
|
|||
_IPU_AVAILABLE,
|
||||
_OMEGACONF_AVAILABLE,
|
||||
_RICH_AVAILABLE,
|
||||
_TORCH_GREATER_EQUAL_1_10,
|
||||
_TORCH_QUANTIZE_AVAILABLE,
|
||||
_TPU_AVAILABLE,
|
||||
)
|
||||
|
@ -67,6 +68,7 @@ class RunIf:
|
|||
min_python: Optional[str] = None,
|
||||
quantization: bool = False,
|
||||
amp_apex: bool = False,
|
||||
bf16_cuda: bool = False,
|
||||
tpu: bool = False,
|
||||
ipu: bool = False,
|
||||
hpu: bool = False,
|
||||
|
@ -93,6 +95,7 @@ class RunIf:
|
|||
min_python: Require that Python is greater or equal than this version.
|
||||
quantization: Require that `torch.quantization` is available.
|
||||
amp_apex: Require that NVIDIA/apex is installed.
|
||||
bf16_cuda: Require that CUDA device supports bf16.
|
||||
tpu: Require that TPU is available.
|
||||
ipu: Require that IPU is available.
|
||||
hpu: Require that HPU is available.
|
||||
|
@ -141,6 +144,20 @@ class RunIf:
|
|||
conditions.append(not _APEX_AVAILABLE)
|
||||
reasons.append("NVIDIA Apex")
|
||||
|
||||
if bf16_cuda:
    # Skip the test unless a CUDA device with bf16 support is present.
    # Probing the device can itself raise on machines without CUDA; the two
    # known failure messages below are treated as "not supported" (skip),
    # while any other error is propagated.
    try:
        cond = not (torch.cuda.is_available() and _TORCH_GREATER_EQUAL_1_10 and torch.cuda.is_bf16_supported())
    except (AssertionError, RuntimeError) as e:
        # AssertionError: Torch not compiled with CUDA enabled
        # RuntimeError: Found no NVIDIA driver on your system.
        # The error is unrelated only when it matches NEITHER known message.
        # Joining the two checks with `or` is true for every message (no
        # message contains both substrings), which made the skip path below
        # unreachable and re-raised even the expected errors.
        is_unrelated = "Found no NVIDIA driver" not in str(e) and "Torch not compiled with CUDA" not in str(e)
        if is_unrelated:
            # Bare `raise` re-raises the active exception with its traceback.
            raise
        cond = True

    conditions.append(cond)
    reasons.append("CUDA device bf16")
|
||||
|
||||
if skip_windows:
|
||||
conditions.append(sys.platform == "win32")
|
||||
reasons.append("unimplemented on Windows")
|
||||
|
|
|
@ -111,7 +111,7 @@ def precision_context(precision, accelerator) -> Generator[None, None, None]:
|
|||
pytest.param(32, None, 1, "cpu"),
|
||||
pytest.param(32, None, 1, "gpu", marks=RunIf(min_gpus=1)),
|
||||
pytest.param(16, None, 1, "gpu", marks=RunIf(min_gpus=1)),
|
||||
pytest.param("bf16", None, 1, "gpu", marks=RunIf(min_gpus=1, min_torch="1.10")),
|
||||
pytest.param("bf16", None, 1, "gpu", marks=RunIf(min_gpus=1, min_torch="1.10", bf16_cuda=True)),
|
||||
],
|
||||
)
|
||||
def test_boring_lite_model_single_device(precision, strategy, devices, accelerator, tmpdir):
|
||||
|
|
|
@ -45,9 +45,9 @@ def test_lite_module_wraps():
|
|||
(16, torch.float32, torch.float16),
|
||||
(16, torch.float64, torch.float16),
|
||||
(16, torch.long, torch.long),
|
||||
pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10")),
|
||||
pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10")),
|
||||
pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10")),
|
||||
pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
|
||||
pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
|
||||
pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
|
||||
],
|
||||
)
|
||||
def test_lite_module_forward_conversion(precision, input_type, expected_type):
|
||||
|
|
|
@ -262,7 +262,7 @@ def test_horovod_gather(tmpdir):
|
|||
_run_horovod(trainer_options)
|
||||
|
||||
|
||||
@RunIf(min_gpus=1, horovod_nccl=True, skip_windows=True)
|
||||
@RunIf(min_gpus=1, skip_windows=True, horovod=True, horovod_nccl=True)
|
||||
def test_horovod_transfer_batch_to_gpu(tmpdir):
|
||||
class TestTrainingStepModel(BoringModel):
|
||||
def training_step(self, batch, *args, **kwargs):
|
||||
|
|
|
@ -395,7 +395,7 @@ class LoggingSyncDistModel(BoringModel):
|
|||
return super().validation_step(batch, batch_idx)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True))])
|
||||
@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(min_gpus=2, skip_windows=True))])
|
||||
def test_logging_sync_dist_true(tmpdir, devices):
|
||||
"""Tests to ensure that the sync_dist flag works (should just return the original value)"""
|
||||
fake_result = 1
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import os
|
||||
from typing import Sequence
|
||||
from unittest import mock
|
||||
|
@ -36,6 +37,7 @@ from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForw
|
|||
from pytorch_lightning.utilities.data import get_len
|
||||
from pytorch_lightning.utilities.exceptions import MisconfigurationException
|
||||
from tests.helpers.boring_model import BoringModel, RandomDataset
|
||||
from tests.helpers.runif import RunIf
|
||||
|
||||
|
||||
def test_tensor_running_accum_reset():
|
||||
|
@ -381,11 +383,12 @@ def test_combined_data_loader_validation_test(
|
|||
apply_to_collection(dataloader.loaders, DataLoader, _assert_dataset)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_gpus=2))])
|
||||
@pytest.mark.parametrize("replace_sampler_ddp", [False, True])
|
||||
def test_combined_data_loader_with_max_size_cycle_and_ddp(replace_sampler_ddp):
|
||||
def test_combined_data_loader_with_max_size_cycle_and_ddp(accelerator, replace_sampler_ddp):
|
||||
"""This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader
|
||||
with ddp and `max_size_cycle` mode."""
|
||||
trainer = Trainer(strategy="ddp", accelerator="auto", devices=2, replace_sampler_ddp=replace_sampler_ddp)
|
||||
trainer = Trainer(strategy="ddp", accelerator=accelerator, devices=2, replace_sampler_ddp=replace_sampler_ddp)
|
||||
|
||||
dataloader = CombinedLoader(
|
||||
{"a": DataLoader(RandomDataset(32, 8), batch_size=1), "b": DataLoader(RandomDataset(32, 8), batch_size=1)},
|
||||
|
@ -452,19 +455,23 @@ def test_combined_dataloader_for_training_with_ddp(
|
|||
}
|
||||
if use_combined_loader:
|
||||
dataloader = CombinedLoader(dataloader, mode=mode)
|
||||
expected_length_before_ddp = min(n1, n2) if is_min_size_mode else max(n1, n2)
|
||||
expected_length_after_ddp = expected_length_before_ddp // 2 if replace_sampler_ddp else expected_length_before_ddp
|
||||
model = BoringModel()
|
||||
trainer = Trainer(
|
||||
strategy="ddp",
|
||||
accelerator="auto",
|
||||
devices=2,
|
||||
devices="auto",
|
||||
replace_sampler_ddp=replace_sampler_ddp,
|
||||
multiple_trainloader_mode="max_size_cycle" if use_combined_loader else mode,
|
||||
)
|
||||
trainer._data_connector.attach_data(
|
||||
model=model, train_dataloaders=dataloader, val_dataloaders=None, datamodule=None
|
||||
)
|
||||
expected_length_before_ddp = min(n1, n2) if is_min_size_mode else max(n1, n2)
|
||||
expected_length_after_ddp = (
|
||||
math.ceil(expected_length_before_ddp / trainer.num_devices)
|
||||
if replace_sampler_ddp
|
||||
else expected_length_before_ddp
|
||||
)
|
||||
trainer.reset_train_dataloader(model=model)
|
||||
assert trainer.train_dataloader is not None
|
||||
assert isinstance(trainer.train_dataloader, CombinedLoader)
|
||||
|
|
|
@ -987,7 +987,7 @@ def test_gradient_clipping_by_norm(tmpdir, precision):
|
|||
# test that gradient is clipped correctly
|
||||
parameters = self.parameters()
|
||||
grad_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), 2) for p in parameters]), 2)
|
||||
torch.testing.assert_allclose(grad_norm, torch.tensor(0.05))
|
||||
torch.testing.assert_allclose(grad_norm, torch.tensor(0.05, device=self.device))
|
||||
self.assertion_called = True
|
||||
|
||||
model = TestModel()
|
||||
|
@ -1018,7 +1018,7 @@ def test_gradient_clipping_by_value(tmpdir, precision):
|
|||
parameters = self.parameters()
|
||||
grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters]
|
||||
grad_max = torch.max(torch.stack(grad_max_list))
|
||||
torch.testing.assert_allclose(grad_max.abs(), torch.tensor(1e-10))
|
||||
torch.testing.assert_allclose(grad_max.abs(), torch.tensor(1e-10, device=self.device))
|
||||
self.assertion_called = True
|
||||
|
||||
model = TestModel()
|
||||
|
@ -1406,8 +1406,9 @@ def test_trainer_predict_1_gpu(tmpdir):
|
|||
|
||||
|
||||
@RunIf(skip_windows=True)
|
||||
def test_trainer_predict_ddp_spawn(tmpdir):
|
||||
predict(tmpdir, strategy="ddp_spawn", accelerator="auto", devices=2)
|
||||
@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_gpus=2))])
|
||||
def test_trainer_predict_ddp_spawn(tmpdir, accelerator):
|
||||
predict(tmpdir, strategy="ddp_spawn", accelerator=accelerator, devices=2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset])
|
||||
|
|
Loading…
Reference in New Issue