diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py
index 053a153ad9..dd39ddb35d 100644
--- a/tests/callbacks/test_quantization.py
+++ b/tests/callbacks/test_quantization.py
@@ -80,7 +80,7 @@ def test_quantization(tmpdir, observe: str, fuse: bool, convert: bool):
     # todo: make it work also with strict loading
     qmodel2 = RegressionModel.load_from_checkpoint(model_path, strict=False)
     quant2_score = torch.mean(torch.tensor([mape(qmodel2(x), y) for x, y in dm.test_dataloader()]))
-    assert torch.allclose(org_score, quant2_score, atol=0.45)
+    assert torch.allclose(org_score, quant2_score, atol=0.47)
 
     # test without and with QAT callback
     trainer_args.update(max_epochs=curr_epoch + 1)
diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py
index fb404b54a1..5a2464f6fd 100644
--- a/tests/helpers/runif.py
+++ b/tests/helpers/runif.py
@@ -31,6 +31,7 @@ from pytorch_lightning.utilities import (
     _IPU_AVAILABLE,
     _OMEGACONF_AVAILABLE,
     _RICH_AVAILABLE,
+    _TORCH_GREATER_EQUAL_1_10,
     _TORCH_QUANTIZE_AVAILABLE,
     _TPU_AVAILABLE,
 )
@@ -67,6 +68,7 @@ class RunIf:
         min_python: Optional[str] = None,
         quantization: bool = False,
         amp_apex: bool = False,
+        bf16_cuda: bool = False,
         tpu: bool = False,
         ipu: bool = False,
         hpu: bool = False,
@@ -93,6 +95,7 @@ class RunIf:
             min_python: Require that Python is greater or equal than this version.
             quantization: Require that `torch.quantization` is available.
             amp_apex: Require that NVIDIA/apex is installed.
+            bf16_cuda: Require that CUDA device supports bf16.
             tpu: Require that TPU is available.
             ipu: Require that IPU is available.
             hpu: Require that HPU is available.
@@ -141,6 +144,21 @@ class RunIf:
             conditions.append(not _APEX_AVAILABLE)
             reasons.append("NVIDIA Apex")
 
+        if bf16_cuda:
+            try:
+                cond = not (torch.cuda.is_available() and _TORCH_GREATER_EQUAL_1_10 and torch.cuda.is_bf16_supported())
+            except (AssertionError, RuntimeError) as e:
+                # AssertionError: Torch not compiled with CUDA enabled
+                # RuntimeError: Found no NVIDIA driver on your system.
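+                # Any other failure is unrelated to CUDA availability and is re-raised below.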
+                is_unrelated = "Found no NVIDIA driver" not in str(e) and "Torch not compiled with CUDA" not in str(e)
+                if is_unrelated:
+                    raise e
+                cond = True
+
+            conditions.append(cond)
+            reasons.append("CUDA device bf16")
+
         if skip_windows:
             conditions.append(sys.platform == "win32")
             reasons.append("unimplemented on Windows")
diff --git a/tests/lite/test_parity.py b/tests/lite/test_parity.py
index d5009a6da5..d32cdc3d97 100644
--- a/tests/lite/test_parity.py
+++ b/tests/lite/test_parity.py
@@ -111,7 +111,7 @@ def precision_context(precision, accelerator) -> Generator[None, None, None]:
         pytest.param(32, None, 1, "cpu"),
         pytest.param(32, None, 1, "gpu", marks=RunIf(min_gpus=1)),
         pytest.param(16, None, 1, "gpu", marks=RunIf(min_gpus=1)),
-        pytest.param("bf16", None, 1, "gpu", marks=RunIf(min_gpus=1, min_torch="1.10")),
+        pytest.param("bf16", None, 1, "gpu", marks=RunIf(min_gpus=1, min_torch="1.10", bf16_cuda=True)),
     ],
 )
 def test_boring_lite_model_single_device(precision, strategy, devices, accelerator, tmpdir):
diff --git a/tests/lite/test_wrappers.py b/tests/lite/test_wrappers.py
index f70a6a9e38..609a2c37e5 100644
--- a/tests/lite/test_wrappers.py
+++ b/tests/lite/test_wrappers.py
@@ -45,9 +45,9 @@ def test_lite_module_wraps():
         (16, torch.float32, torch.float16),
         (16, torch.float64, torch.float16),
         (16, torch.long, torch.long),
-        pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10")),
-        pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10")),
-        pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10")),
+        pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
+        pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
+        pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10", bf16_cuda=True)),
     ],
 )
 def test_lite_module_forward_conversion(precision, input_type, expected_type):
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index f8d973d5cf..152d01aca9 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -262,7 +262,7 @@ def test_horovod_gather(tmpdir):
     _run_horovod(trainer_options)
 
 
-@RunIf(min_gpus=1, horovod_nccl=True, skip_windows=True)
+@RunIf(min_gpus=1, skip_windows=True, horovod=True, horovod_nccl=True)
 def test_horovod_transfer_batch_to_gpu(tmpdir):
     class TestTrainingStepModel(BoringModel):
         def training_step(self, batch, *args, **kwargs):
diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py
index 5a8fabece7..ecf701f7ed 100644
--- a/tests/trainer/logging_/test_train_loop_logging.py
+++ b/tests/trainer/logging_/test_train_loop_logging.py
@@ -395,7 +395,7 @@ class LoggingSyncDistModel(BoringModel):
         return super().validation_step(batch, batch_idx)
 
 
-@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True))])
+@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(min_gpus=2, skip_windows=True))])
 def test_logging_sync_dist_true(tmpdir, devices):
     """Tests to ensure that the sync_dist flag works (should just return the original value)"""
     fake_result = 1
diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py
index 7088432e3b..c109c842ea 100644
--- a/tests/trainer/test_supporters.py
+++ b/tests/trainer/test_supporters.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import math
 import os
 from typing import Sequence
 from unittest import mock
@@ -36,6 +37,7 @@ from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForw
 from pytorch_lightning.utilities.data import get_len
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel, RandomDataset
+from tests.helpers.runif import RunIf
 
 
 def test_tensor_running_accum_reset():
@@ -381,11 +383,12 @@ def test_combined_data_loader_validation_test(
     apply_to_collection(dataloader.loaders, DataLoader, _assert_dataset)
 
 
+@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_gpus=2))])
 @pytest.mark.parametrize("replace_sampler_ddp", [False, True])
-def test_combined_data_loader_with_max_size_cycle_and_ddp(replace_sampler_ddp):
+def test_combined_data_loader_with_max_size_cycle_and_ddp(accelerator, replace_sampler_ddp):
     """This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader
     with ddp and `max_size_cycle` mode."""
-    trainer = Trainer(strategy="ddp", accelerator="auto", devices=2, replace_sampler_ddp=replace_sampler_ddp)
+    trainer = Trainer(strategy="ddp", accelerator=accelerator, devices=2, replace_sampler_ddp=replace_sampler_ddp)
 
     dataloader = CombinedLoader(
         {"a": DataLoader(RandomDataset(32, 8), batch_size=1), "b": DataLoader(RandomDataset(32, 8), batch_size=1)},
@@ -452,19 +455,23 @@ def test_combined_dataloader_for_training_with_ddp(
     }
     if use_combined_loader:
         dataloader = CombinedLoader(dataloader, mode=mode)
-    expected_length_before_ddp = min(n1, n2) if is_min_size_mode else max(n1, n2)
-    expected_length_after_ddp = expected_length_before_ddp // 2 if replace_sampler_ddp else expected_length_before_ddp
     model = BoringModel()
     trainer = Trainer(
         strategy="ddp",
         accelerator="auto",
-        devices=2,
+        devices="auto",
         replace_sampler_ddp=replace_sampler_ddp,
         multiple_trainloader_mode="max_size_cycle" if use_combined_loader else mode,
     )
     trainer._data_connector.attach_data(
         model=model, train_dataloaders=dataloader, val_dataloaders=None, datamodule=None
     )
+    expected_length_before_ddp = min(n1, n2) if is_min_size_mode else max(n1, n2)
+    expected_length_after_ddp = (
+        math.ceil(expected_length_before_ddp / trainer.num_devices)
+        if replace_sampler_ddp
+        else expected_length_before_ddp
+    )
     trainer.reset_train_dataloader(model=model)
     assert trainer.train_dataloader is not None
     assert isinstance(trainer.train_dataloader, CombinedLoader)
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index de495dd4c1..548b9df8b6 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -987,7 +987,7 @@ def test_gradient_clipping_by_norm(tmpdir, precision):
             # test that gradient is clipped correctly
             parameters = self.parameters()
             grad_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), 2) for p in parameters]), 2)
-            torch.testing.assert_allclose(grad_norm, torch.tensor(0.05))
+            torch.testing.assert_allclose(grad_norm, torch.tensor(0.05, device=self.device))
             self.assertion_called = True
 
     model = TestModel()
@@ -1018,7 +1018,7 @@ def test_gradient_clipping_by_value(tmpdir, precision):
             parameters = self.parameters()
             grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters]
             grad_max = torch.max(torch.stack(grad_max_list))
-            torch.testing.assert_allclose(grad_max.abs(), torch.tensor(1e-10))
+            torch.testing.assert_allclose(grad_max.abs(), torch.tensor(1e-10, device=self.device))
             self.assertion_called = True
 
     model = TestModel()
@@ -1406,8 +1406,9 @@ def test_trainer_predict_1_gpu(tmpdir):
 
 
 @RunIf(skip_windows=True)
-def test_trainer_predict_ddp_spawn(tmpdir):
-    predict(tmpdir, strategy="ddp_spawn", accelerator="auto", devices=2)
+@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_gpus=2))])
+def test_trainer_predict_ddp_spawn(tmpdir, accelerator):
+    predict(tmpdir, strategy="ddp_spawn", accelerator=accelerator, devices=2)
 
 
 @pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset])