diff --git a/examples/fabric/build_your_own_trainer/trainer.py b/examples/fabric/build_your_own_trainer/trainer.py
index 08692377d8..69895b6498 100644
--- a/examples/fabric/build_your_own_trainer/trainer.py
+++ b/examples/fabric/build_your_own_trainer/trainer.py
@@ -344,7 +344,7 @@ class MyCustomTrainer:
         Args:
             model: The LightningModule to train
             scheduler_cfg: The learning rate scheduler configuration.
-                Have a look at :meth:`lightning.pytorch.LightninModule.configure_optimizers` for supported values.
+                Have a look at :meth:`lightning.pytorch.LightningModule.configure_optimizers` for supported values.
             level: whether we are trying to step on epoch- or step-level
             current_value: Holds the current_epoch if ``level==epoch``, else holds the ``global_step``
         """
diff --git a/src/lightning/fabric/plugins/precision/double.py b/src/lightning/fabric/plugins/precision/double.py
index cf7c80acde..05419a5d00 100644
--- a/src/lightning/fabric/plugins/precision/double.py
+++ b/src/lightning/fabric/plugins/precision/double.py
@@ -46,7 +46,7 @@ class DoublePrecision(Precision):
     def forward_context(self) -> Generator[None, None, None]:
         """A context manager to change the default tensor type.

-        See: :meth:`torch.set_default_tensor_type`
+        See: :meth:`torch.set_default_dtype`
         """
         default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(torch.float64)
diff --git a/src/lightning/fabric/plugins/precision/fsdp.py b/src/lightning/fabric/plugins/precision/fsdp.py
index 66c941718e..826415568d 100644
--- a/src/lightning/fabric/plugins/precision/fsdp.py
+++ b/src/lightning/fabric/plugins/precision/fsdp.py
@@ -106,7 +106,7 @@ class FSDPPrecision(Precision):
     def init_context(self) -> Generator[None, None, None]:
         """A context manager to change the default tensor type when initializing module parameters or tensors.

-        See: :meth:`torch.set_default_tensor_type`
+        See: :meth:`torch.set_default_dtype`
         """
         default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(self.mixed_precision_config.param_dtype)
diff --git a/src/lightning/fabric/plugins/precision/half.py b/src/lightning/fabric/plugins/precision/half.py
index 8b087afe37..aa9ac52ffd 100644
--- a/src/lightning/fabric/plugins/precision/half.py
+++ b/src/lightning/fabric/plugins/precision/half.py
@@ -43,7 +43,7 @@ class HalfPrecision(Precision):
     def init_context(self) -> Generator[None, None, None]:
         """A context manager to change the default tensor type when initializing module parameters or tensors.

-        See: :meth:`torch.set_default_tensor_type`
+        See: :meth:`torch.set_default_dtype`
         """
         default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(self._desired_input_dtype)
@@ -55,7 +55,7 @@ class HalfPrecision(Precision):
         """A context manager to change the default tensor type when tensors get created during the module's
         forward.

-        See: :meth:`torch.set_default_tensor_type`
+        See: :meth:`torch.set_default_dtype`
         """
         default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(self._desired_input_dtype)
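The precision-plugin hunks above (fabric `double.py`, `fsdp.py`, `half.py`, plus the analogous `DoublePrecisionPlugin` hunk further down) all switch from `torch.set_default_tensor_type` to saving and restoring the value returned by `torch.get_default_dtype`. A minimal standalone sketch of that pattern follows; the `default_dtype_ctx` helper name is hypothetical, and the `try`/`finally` is extra hardening that the patch itself does not add:

```python
from contextlib import contextmanager
from typing import Generator

import torch


@contextmanager
def default_dtype_ctx(dtype: torch.dtype) -> Generator[None, None, None]:
    # Remember whatever the global default currently is, instead of
    # hard-coding a reset to float32 on exit.
    previous = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(previous)


before = torch.get_default_dtype()
with default_dtype_ctx(torch.float64):
    # Tensors and parameters created inside the block default to float64.
    assert torch.empty(2).dtype == torch.float64
# The previous default is restored on exit.
assert torch.get_default_dtype() == before
```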
diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
index 75d8ed1546..c0f22ef561 100644
--- a/src/lightning/fabric/strategies/deepspeed.py
+++ b/src/lightning/fabric/strategies/deepspeed.py
@@ -345,7 +345,6 @@ class DeepSpeedStrategy(DDPStrategy, _Sharded):
             raise NotImplementedError(
                 f"`{empty_init=}` is not a valid choice with `DeepSpeedStrategy` when ZeRO stage 3 is enabled."
             )
-        empty_init = empty_init and not self.zero_stage_3
         base_context = super().module_init_context(empty_init=empty_init) if not self.zero_stage_3 else nullcontext()
         with base_context, self.module_sharded_context():
             yield
diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py
index 617b83452e..01f05841c9 100644
--- a/src/lightning/pytorch/callbacks/model_checkpoint.py
+++ b/src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -46,9 +46,9 @@ warning_cache = WarningCache()
 class ModelCheckpoint(Checkpoint):
     r"""
     Save the model periodically by monitoring a quantity. Every metric logged with
-    :meth:`~lightning.pytorch.core.module.log` or :meth:`~lightning.pytorch.core.module.log_dict` in
-    LightningModule is a candidate for the monitor key. For more information, see
-    :ref:`checkpointing`.
+    :meth:`~lightning.pytorch.core.module.LightningModule.log` or
+    :meth:`~lightning.pytorch.core.module.LightningModule.log_dict` is a candidate for the monitor key.
+    For more information, see :ref:`checkpointing`.

     After training finishes, use :attr:`best_model_path` to retrieve the path to the best
     checkpoint file and :attr:`best_model_score` to retrieve its score.
diff --git a/src/lightning/pytorch/plugins/precision/deepspeed.py b/src/lightning/pytorch/plugins/precision/deepspeed.py
index ec030e0d8e..0582bf7727 100644
--- a/src/lightning/pytorch/plugins/precision/deepspeed.py
+++ b/src/lightning/pytorch/plugins/precision/deepspeed.py
@@ -46,7 +46,7 @@ class DeepSpeedPrecisionPlugin(PrecisionPlugin):
             If unsupported ``precision`` is provided.
     """

-    def __init__(self, precision: Literal["32-true", "16-mixed", "bf16-mixed"]) -> None:
+    def __init__(self, precision: _PRECISION_INPUT) -> None:
         supported_precision = get_args(_PRECISION_INPUT)
         if precision not in supported_precision:
             raise ValueError(
diff --git a/src/lightning/pytorch/plugins/precision/double.py b/src/lightning/pytorch/plugins/precision/double.py
index 77fa9c4171..e16193bbdf 100644
--- a/src/lightning/pytorch/plugins/precision/double.py
+++ b/src/lightning/pytorch/plugins/precision/double.py
@@ -17,7 +17,7 @@ from typing import Any, cast, Generator, List, Literal, Tuple
 import torch
 import torch.nn as nn
 from lightning_utilities.core.apply_func import apply_to_collection
-from torch import FloatTensor, Tensor
+from torch import Tensor
 from torch.optim import Optimizer

 import lightning.pytorch as pl
@@ -91,8 +91,9 @@ class DoublePrecisionPlugin(PrecisionPlugin):
     def forward_context(self) -> Generator[None, None, None]:
         """A context manager to change the default tensor type.

-        See: :meth:`torch.set_default_tensor_type`
+        See: :meth:`torch.set_default_dtype`
         """
-        torch.set_default_tensor_type(torch.DoubleTensor)
+        default_dtype = torch.get_default_dtype()
+        torch.set_default_dtype(torch.float64)
         yield
-        torch.set_default_tensor_type(FloatTensor)
+        torch.set_default_dtype(default_dtype)
diff --git a/src/lightning/pytorch/plugins/precision/fsdp.py b/src/lightning/pytorch/plugins/precision/fsdp.py
index 5828c35d52..db1cbcbbf1 100644
--- a/src/lightning/pytorch/plugins/precision/fsdp.py
+++ b/src/lightning/pytorch/plugins/precision/fsdp.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import contextmanager
-from typing import Any, Generator, Literal, Optional
+from typing import Any, Generator, Literal, Optional, TYPE_CHECKING

 import torch

@@ -20,12 +20,9 @@ from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12
 from lightning.pytorch.plugins.precision.amp import MixedPrecisionPlugin
 from lightning.pytorch.utilities.exceptions import MisconfigurationException

-if _TORCH_GREATER_EQUAL_1_12 and torch.distributed.is_available():
-    from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
+if TYPE_CHECKING:
+    from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision
     from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
-else:
-    MixedPrecision = None  # type: ignore[misc,assignment]
-    ShardedGradScaler = None  # type: ignore[misc,assignment]


 class FSDPMixedPrecisionPlugin(MixedPrecisionPlugin):
@@ -35,10 +32,12 @@ class FSDPMixedPrecisionPlugin(MixedPrecisionPlugin):
     """

     def __init__(
-        self, precision: Literal["16-mixed", "bf16-mixed"], device: str, scaler: Optional[ShardedGradScaler] = None
+        self, precision: Literal["16-mixed", "bf16-mixed"], device: str, scaler: Optional["ShardedGradScaler"] = None
     ) -> None:
         if not _TORCH_GREATER_EQUAL_1_12:
             raise MisconfigurationException("`FSDPMixedPrecisionPlugin` is supported from PyTorch v1.12.0 onwards.")
+        from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
+
         super().__init__(
             precision, device, scaler=(ShardedGradScaler() if scaler is None and str(precision) == "16-mixed" else None)
         )
@@ -54,8 +53,8 @@ class FSDPMixedPrecisionPlugin(MixedPrecisionPlugin):
         )

     @property
-    def mixed_precision_config(self) -> Optional[MixedPrecision]:
-        assert MixedPrecision is not None
+    def mixed_precision_config(self) -> "TorchMixedPrecision":
+        from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision

         if self.precision == "16-mixed":
             param_dtype = torch.float32
@@ -70,7 +69,7 @@
         else:
             raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.")

-        return MixedPrecision(
+        return TorchMixedPrecision(
             param_dtype=param_dtype,
             reduce_dtype=reduce_dtype,
             buffer_dtype=buffer_dtype,
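The FSDP precision hunks above replace the module-level, version-guarded imports with imports that only run under `typing.TYPE_CHECKING`, plus deferred imports inside the methods that actually need the symbols. A rough sketch of that pattern, assuming torch 1.12+ with `torch.distributed` is available whenever the runtime code path is hit (the `make_grad_scaler` helper is hypothetical, not part of the patch):

```python
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only evaluated by static type checkers; importing this module at runtime
    # no longer requires torch.distributed / FSDP to be importable.
    from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler


def make_grad_scaler(precision: str) -> Optional["ShardedGradScaler"]:
    # Deferred import: executed only when an FSDP code path actually runs.
    from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

    return ShardedGradScaler() if precision == "16-mixed" else None
```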
diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py
index 516e02ff79..9b30a4d622 100644
--- a/src/lightning/pytorch/strategies/deepspeed.py
+++ b/src/lightning/pytorch/strategies/deepspeed.py
@@ -325,6 +325,11 @@ class DeepSpeedStrategy(DDPStrategy):
         return config

     def setup_distributed(self) -> None:
+        if not isinstance(self.accelerator, CUDAAccelerator):
+            raise RuntimeError(
+                f"The DeepSpeed strategy is only supported on CUDA GPUs but `{self.accelerator.__class__.__name__}`"
+                " is used."
+            )
         assert self.parallel_devices is not None
         _validate_device_index_selection(self.parallel_devices)
         reset_seed()
@@ -438,11 +443,6 @@ class DeepSpeedStrategy(DDPStrategy):
         if self.lightning_module.trainer.gradient_clip_algorithm == GradClipAlgorithmType.VALUE:
             raise MisconfigurationException("DeepSpeed does not support clipping gradients by value.")

-        if not isinstance(self.accelerator, CUDAAccelerator):
-            raise MisconfigurationException(
-                f"DeepSpeed strategy is only supported on GPU but `{self.accelerator.__class__.__name__}` is used."
-            )
-
         assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
         if self.lightning_module.trainer and self.lightning_module.trainer.training:
             self._initialize_deepspeed_train(self.model)
diff --git a/tests/tests_fabric/plugins/precision/test_deepspeed.py b/tests/tests_fabric/plugins/precision/test_deepspeed.py
index 64dfe3ead7..8457667243 100644
--- a/tests/tests_fabric/plugins/precision/test_deepspeed.py
+++ b/tests/tests_fabric/plugins/precision/test_deepspeed.py
@@ -78,7 +78,7 @@ def test_selected_dtype(precision, expected_dtype):
         ("16-true", torch.float16),
     ],
 )
-def test_module_init_context(precision, expected_dtype):
+def test_init_context(precision, expected_dtype):
     plugin = DeepSpeedPrecision(precision=precision)
     with plugin.init_context():
         model = torch.nn.Linear(2, 2)
diff --git a/tests/tests_fabric/plugins/precision/test_half.py b/tests/tests_fabric/plugins/precision/test_half.py
index 115a769750..e01ebc6c83 100644
--- a/tests/tests_fabric/plugins/precision/test_half.py
+++ b/tests/tests_fabric/plugins/precision/test_half.py
@@ -37,7 +37,7 @@ def test_selected_dtype(precision, expected_dtype):
         ("16-true", torch.half),
     ],
 )
-def test_module_init_context(precision, expected_dtype):
+def test_init_context(precision, expected_dtype):
     plugin = HalfPrecision(precision=precision)
     with plugin.init_context():
         model = torch.nn.Linear(2, 2)
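The renamed fabric tests above exercise the plugins' `init_context()` manager directly. Roughly, usage looks like the sketch below, importing `HalfPrecision` from the module path shown in the patch and assuming a torch build where the default dtype may be set to `torch.half` (which the plugin relies on):

```python
import torch

from lightning.fabric.plugins.precision.half import HalfPrecision

plugin = HalfPrecision(precision="16-true")
with plugin.init_context():
    # Parameters are created directly in the plugin's target dtype,
    # so no separate .half() conversion pass is needed afterwards.
    model = torch.nn.Linear(2, 2)
assert model.weight.dtype == torch.half
```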
i)}, {"name": "on_predict_batch_start", "args": (ANY, i)}, @@ -451,7 +449,7 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs, automatic_optimization): using_deepspeed = kwargs.get("strategy") == "deepspeed" if kwargs.get("precision") == "16-mixed" and not using_deepspeed: saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if "accelerator" in kwargs and kwargs["accelerator"] == "gpu" else "cpu") + device = trainer.strategy.root_device expected = [ {"name": "configure_callbacks"}, {"name": "prepare_data"}, @@ -570,7 +568,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume_max_epochs(tmpdir): {"name": "on_train_start"}, {"name": "Callback.on_train_epoch_start", "args": (trainer, model)}, {"name": "on_train_epoch_start"}, - *model._train_batch(trainer, model, 2, current_epoch=1, current_batch=0), + *model._train_batch(trainer, model, 2, trainer.strategy.root_device, current_epoch=1, current_batch=0), {"name": "Callback.on_train_epoch_end", "args": (trainer, model)}, {"name": "on_train_epoch_end"}, # before ModelCheckpoint because it's a "monitoring callback" # `ModelCheckpoint.save_checkpoint` is called here @@ -648,7 +646,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume_max_steps(tmpdir): {"name": "on_train_start"}, {"name": "Callback.on_train_epoch_start", "args": (trainer, model)}, {"name": "on_train_epoch_start"}, - *model._train_batch(trainer, model, steps_after_reload, current_batch=1), + *model._train_batch(trainer, model, steps_after_reload, trainer.strategy.root_device, current_batch=1), {"name": "Callback.on_train_epoch_end", "args": (trainer, model)}, {"name": "on_train_epoch_end"}, # before ModelCheckpoint because it's a "monitoring callback" # `ModelCheckpoint.save_checkpoint` is called here @@ -691,7 +689,7 @@ def test_trainer_model_hook_system_eval(tmpdir, batches, verb, noun, dataloader, {"name": "zero_grad"}, {"name": f"Callback.on_{noun}_start", "args": (trainer, model)}, {"name": f"on_{noun}_start"}, - *model._eval_epoch(noun, trainer, model, batches, key), + *model._eval_epoch(noun, trainer, model, batches, key, trainer.strategy.root_device), {"name": f"Callback.on_{noun}_end", "args": (trainer, model)}, {"name": f"on_{noun}_end"}, {"name": "train", "args": (True,)}, @@ -733,7 +731,7 @@ def test_trainer_model_hook_system_predict(tmpdir): {"name": "on_predict_start"}, {"name": "Callback.on_predict_epoch_start", "args": (trainer, model)}, {"name": "on_predict_epoch_start"}, - *model._predict_batch(trainer, model, batches), + *model._predict_batch(trainer, model, batches, trainer.strategy.root_device), {"name": "Callback.on_predict_epoch_end", "args": (trainer, model)}, {"name": "on_predict_epoch_end"}, {"name": "Callback.on_predict_end", "args": (trainer, model)}, diff --git a/tests/tests_pytorch/strategies/test_custom_plugin.py b/tests/tests_pytorch/strategies/test_custom_plugin.py index 3466f2038f..b29a28e5f5 100644 --- a/tests/tests_pytorch/strategies/test_custom_plugin.py +++ b/tests/tests_pytorch/strategies/test_custom_plugin.py @@ -43,6 +43,6 @@ def test_strategy_lightning_restore_optimizer_and_schedulers(tmpdir, restore_opt model = BoringModel() strategy = TestStrategy(torch.device("cpu")) - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=strategy) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=strategy, accelerator="cpu") trainer.fit(model, ckpt_path=checkpoint_path) assert strategy.load_optimizer_state_dict_called == 
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 6aaf930349..ac7aa487f5 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -109,7 +109,10 @@ def test_deepspeed_strategy_string(tmpdir, strategy):
     set."""

     trainer = Trainer(
-        fast_dev_run=True, default_root_dir=tmpdir, strategy=strategy if isinstance(strategy, str) else strategy()
+        accelerator="cpu",
+        fast_dev_run=True,
+        default_root_dir=tmpdir,
+        strategy=strategy if isinstance(strategy, str) else strategy(),
     )

     assert isinstance(trainer.strategy, DeepSpeedStrategy)
@@ -124,7 +127,7 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config):
         f.write(json.dumps(deepspeed_config))
     monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path)

-    trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed")
+    trainer = Trainer(accelerator="cpu", fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed")

     strategy = trainer.strategy
     assert isinstance(strategy, DeepSpeedStrategy)
@@ -1225,7 +1228,7 @@ def test_error_with_invalid_accelerator(tmpdir):
         fast_dev_run=True,
     )
     model = BoringModel()
-    with pytest.raises(MisconfigurationException, match="DeepSpeed strategy is only supported on GPU"):
+    with pytest.raises(RuntimeError, match="DeepSpeed strategy is only supported on CUDA"):
         trainer.fit(model)
diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py
index 839b69b150..2472c60a36 100644
--- a/tests/tests_pytorch/strategies/test_fsdp.py
+++ b/tests/tests_pytorch/strategies/test_fsdp.py
@@ -203,7 +203,7 @@ def test_invalid_on_cpu(tmpdir):
         MisconfigurationException,
         match=f"You selected strategy to be `{FSDPStrategy.strategy_name}`, but GPU accelerator is not used.",
     ):
-        trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp")
+        trainer = Trainer(accelerator="cpu", default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp")
         assert isinstance(trainer.strategy, FSDPStrategy)
         trainer.strategy.setup_environment()
diff --git a/tests/tests_pytorch/strategies/test_single_device_strategy.py b/tests/tests_pytorch/strategies/test_single_device_strategy.py
index e934ac0d67..85c92ded50 100644
--- a/tests/tests_pytorch/strategies/test_single_device_strategy.py
+++ b/tests/tests_pytorch/strategies/test_single_device_strategy.py
@@ -28,7 +28,7 @@ from tests_pytorch.helpers.runif import RunIf

 def test_single_cpu():
     """Tests if device is set correctly for single CPU strategy."""
-    trainer = Trainer()
+    trainer = Trainer(accelerator="cpu")
     assert isinstance(trainer.strategy, SingleDeviceStrategy)
     assert trainer.strategy.root_device == torch.device("cpu")
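With the accelerator check moved into `setup_distributed`, a CPU run now fails with a `RuntimeError` as soon as `fit()` starts setting up the strategy, rather than a `MisconfigurationException` later during setup. A hedged sketch of how that surfaces to a user, assuming the `deepspeed` package is installed and using the `BoringModel` demo module shipped with Lightning:

```python
import pytest
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel


def test_deepspeed_rejects_cpu_accelerator(tmp_path):
    trainer = Trainer(
        default_root_dir=tmp_path,
        accelerator="cpu",
        strategy="deepspeed",
        fast_dev_run=True,
    )
    # The error is raised during strategy setup, not at Trainer construction time.
    with pytest.raises(RuntimeError, match="only supported on CUDA"):
        trainer.fit(BoringModel())
```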