diff --git a/docs/source-pytorch/accelerators/tpu_faq.rst b/docs/source-pytorch/accelerators/tpu_faq.rst
index 8a26899e92..b0d3da9b23 100644
--- a/docs/source-pytorch/accelerators/tpu_faq.rst
+++ b/docs/source-pytorch/accelerators/tpu_faq.rst
@@ -88,10 +88,10 @@ How to setup the debug mode for Training on TPUs?

 .. code-block:: python

-    import lightning.pytorch as pl
+    import lightning as L

     my_model = MyLightningModule()
-    trainer = pl.Trainer(accelerator="tpu", devices=8, strategy="xla_debug")
+    trainer = L.Trainer(accelerator="tpu", devices=8, strategy="xla_debug")
     trainer.fit(my_model)

 Example Metrics report:
diff --git a/docs/source-pytorch/accelerators/tpu_intermediate.rst b/docs/source-pytorch/accelerators/tpu_intermediate.rst
index 8dfe63f336..c03992d873 100644
--- a/docs/source-pytorch/accelerators/tpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/tpu_intermediate.rst
@@ -44,10 +44,10 @@ To use a full TPU pod skip to the TPU pod section.

 .. code-block:: python

-    import lightning.pytorch as pl
+    import lightning as L

     my_model = MyLightningModule()
-    trainer = pl.Trainer(accelerator="tpu", devices=8)
+    trainer = L.Trainer(accelerator="tpu", devices=8)
     trainer.fit(my_model)

 That's it! Your model will train on all 8 TPU cores.
@@ -113,10 +113,10 @@ By default, TPU training will use 32-bit precision. To enable it, do

 .. code-block:: python

-    import lightning.pytorch as pl
+    import lightning as L

     my_model = MyLightningModule()
-    trainer = pl.Trainer(accelerator="tpu", precision="16-true")
+    trainer = L.Trainer(accelerator="tpu", precision="16-true")
     trainer.fit(my_model)

 Under the hood the xla library will use the `bfloat16 type `_.
diff --git a/docs/source-pytorch/advanced/model_parallel/deepspeed.rst b/docs/source-pytorch/advanced/model_parallel/deepspeed.rst
index 31550e3011..9689f8c217 100644
--- a/docs/source-pytorch/advanced/model_parallel/deepspeed.rst
+++ b/docs/source-pytorch/advanced/model_parallel/deepspeed.rst
@@ -132,12 +132,11 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c

 .. code-block:: python

-    import lightning.pytorch
-    from lightning.pytorch import Trainer
+    from lightning.pytorch import LightningModule, Trainer
     from deepspeed.ops.adam import DeepSpeedCPUAdam


-    class MyModel(pl.LightningModule):
+    class MyModel(LightningModule):
         ...

         def configure_optimizers(self):
@@ -180,7 +179,7 @@ Also please have a look at our :ref:`deepspeed-zero-stage-3-tips` which contains
     from deepspeed.ops.adam import FusedAdam


-    class MyModel(pl.LightningModule):
+    class MyModel(LightningModule):
         ...

         def configure_optimizers(self):
@@ -202,7 +201,7 @@ You can also use the Lightning Trainer to run predict or evaluate with DeepSpeed
     from lightning.pytorch import Trainer


-    class MyModel(pl.LightningModule):
+    class MyModel(LightningModule):
         ...


@@ -228,7 +227,7 @@ This reduces the time taken to initialize very large models, as well as ensure w
     from deepspeed.ops.adam import FusedAdam


-    class MyModel(pl.LightningModule):
+    class MyModel(LightningModule):
         ...

         def configure_model(self):
@@ -367,7 +366,7 @@ This saves memory when training larger models, however requires using a checkpoi

     import deepspeed

-    class MyModel(pl.LightningModule):
+    class MyModel(LightningModule):
         ...

         def configure_model(self):
diff --git a/docs/source-pytorch/advanced/training_tricks.rst b/docs/source-pytorch/advanced/training_tricks.rst
index a5c3bfc145..25dd996c62 100644
--- a/docs/source-pytorch/advanced/training_tricks.rst
+++ b/docs/source-pytorch/advanced/training_tricks.rst
@@ -398,7 +398,7 @@ The :class:`~lightning.pytorch.core.datamodule.LightningDataModule` class provid

 .. code-block:: python

-    class MNISTDataModule(pl.LightningDataModule):
+    class MNISTDataModule(L.LightningDataModule):
         def prepare_data(self):
             MNIST(self.data_dir, download=True)

@@ -421,7 +421,7 @@ For this, all data pre-loading should be done on the main process inside :meth:`

 .. code-block:: python

-    class MNISTDataModule(pl.LightningDataModule):
+    class MNISTDataModule(L.LightningDataModule):
         def __init__(self, data_dir: str):
             self.mnist = MNIST(data_dir, download=True, transform=T.ToTensor())

diff --git a/docs/source-pytorch/cli/lightning_cli_advanced.rst b/docs/source-pytorch/cli/lightning_cli_advanced.rst
index 2960854a88..7a6ed2c96b 100644
--- a/docs/source-pytorch/cli/lightning_cli_advanced.rst
+++ b/docs/source-pytorch/cli/lightning_cli_advanced.rst
@@ -164,7 +164,7 @@ to the class constructor. For example, your model is defined as:
 .. code:: python

     # model.py
-    class MyModel(pl.LightningModule):
+    class MyModel(L.LightningModule):
         def __init__(self, criterion: torch.nn.Module):
             self.criterion = criterion

diff --git a/docs/source-pytorch/common/checkpointing_advanced.rst b/docs/source-pytorch/common/checkpointing_advanced.rst
index 80b10e1618..89076a87bd 100644
--- a/docs/source-pytorch/common/checkpointing_advanced.rst
+++ b/docs/source-pytorch/common/checkpointing_advanced.rst
@@ -54,9 +54,9 @@ Modify a checkpoint anywhere
 ****************************
 When you need to change the components of a checkpoint before saving or loading, use the :meth:`~lightning.pytorch.core.hooks.CheckpointHooks.on_save_checkpoint` and :meth:`~lightning.pytorch.core.hooks.CheckpointHooks.on_load_checkpoint` of your ``LightningModule``.

-.. code:: python
+.. code-block:: python

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def on_save_checkpoint(self, checkpoint):
             checkpoint["something_cool_i_want_to_save"] = my_cool_pickable_object

@@ -65,9 +65,12 @@ When you need to change the components of a checkpoint before saving or loading,

 Use the above approach when you need to couple this behavior to your LightningModule for reproducibility reasons. Otherwise, Callbacks also have the :meth:`~lightning.pytorch.callbacks.callback.Callback.on_save_checkpoint` and :meth:`~lightning.pytorch.callbacks.callback.Callback.on_load_checkpoint` which you should use instead:

-.. code:: python
+.. code-block:: python

-    class LitCallback(pl.Callback):
+    import lightning as L
+
+
+    class LitCallback(L.Callback):
         def on_save_checkpoint(self, checkpoint):
             checkpoint["something_cool_i_want_to_save"] = my_cool_pickable_object

diff --git a/docs/source-pytorch/common/checkpointing_basic.rst b/docs/source-pytorch/common/checkpointing_basic.rst
index a2e22b7435..57824376c6 100644
--- a/docs/source-pytorch/common/checkpointing_basic.rst
+++ b/docs/source-pytorch/common/checkpointing_basic.rst
@@ -127,7 +127,7 @@ In some cases, we may also pass entire PyTorch modules to the ``__init__`` metho

 .. code-block:: python

-    class LitAutoencoder(pl.LightningModule):
+    class LitAutoencoder(L.LightningModule):
         def __init__(self, encoder, decoder):
             ...

@@ -160,7 +160,7 @@ For example, let's pretend we created a LightningModule like so:
         ...


-    class Autoencoder(pl.LightningModule):
+    class Autoencoder(L.LightningModule):
         def __init__(self, encoder, decoder, *args, **kwargs):
             ...

diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst
index 34e87edc0d..cc34c9dee9 100644
--- a/docs/source-pytorch/common/checkpointing_intermediate.rst
+++ b/docs/source-pytorch/common/checkpointing_intermediate.rst
@@ -27,7 +27,7 @@ Any value that has been logged via *self.log* in the LightningModule can be moni

 .. code-block:: python

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def training_step(self, batch, batch_idx):
             self.log("my_metric", x)

diff --git a/docs/source-pytorch/common/evaluation_basic.rst b/docs/source-pytorch/common/evaluation_basic.rst
index 823f1aba74..3dc7867de1 100644
--- a/docs/source-pytorch/common/evaluation_basic.rst
+++ b/docs/source-pytorch/common/evaluation_basic.rst
@@ -39,7 +39,7 @@ To add a test loop, implement the **test_step** method of the LightningModule

 .. code:: python

-    class LitAutoEncoder(pl.LightningModule):
+    class LitAutoEncoder(L.LightningModule):
         def training_step(self, batch, batch_idx):
             ...

@@ -99,7 +99,7 @@ To add a validation loop, implement the **validation_step** method of the Lightn

 .. code:: python

-    class LitAutoEncoder(pl.LightningModule):
+    class LitAutoEncoder(L.LightningModule):
         def training_step(self, batch, batch_idx):
             ...

@@ -127,5 +127,5 @@ To run the validation loop, pass in the validation set to **.fit**
     model = LitAutoEncoder(...)

     # train with both splits
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.fit(model, train_loader, valid_loader)
diff --git a/docs/source-pytorch/common/evaluation_intermediate.rst b/docs/source-pytorch/common/evaluation_intermediate.rst
index 1e338269cf..b780650cfe 100644
--- a/docs/source-pytorch/common/evaluation_intermediate.rst
+++ b/docs/source-pytorch/common/evaluation_intermediate.rst
@@ -121,7 +121,7 @@ you can also pass in an :doc:`datamodules <../data/datamodule>` that have overri

 .. code-block:: python

-    class MyDataModule(pl.LightningDataModule):
+    class MyDataModule(L.LightningDataModule):
         ...

         def test_dataloader(self):
diff --git a/docs/source-pytorch/common/lightning_module.rst b/docs/source-pytorch/common/lightning_module.rst
index cf17acd30d..797b25f852 100644
--- a/docs/source-pytorch/common/lightning_module.rst
+++ b/docs/source-pytorch/common/lightning_module.rst
@@ -84,13 +84,13 @@ Here are the only required methods.

 .. code-block:: python

-    import lightning.pytorch as pl
+    import lightning as L
     import torch

     from lightning.pytorch.demos import Transformer


-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def __init__(self, vocab_size):
             super().__init__()
             self.model = Transformer(vocab_size=vocab_size)
@@ -118,7 +118,7 @@ Which you can train by doing:
     dataloader = DataLoader(dataset)
     model = LightningTransformer(vocab_size=dataset.vocab_size)

-    trainer = pl.Trainer(fast_dev_run=100)
+    trainer = L.Trainer(fast_dev_run=100)
     trainer.fit(model=model, train_dataloaders=dataloader)

 The LightningModule has many convenient methods, but the core ones you need to know about are:
@@ -157,7 +157,7 @@ To activate the training loop, override the :meth:`~lightning.pytorch.core.Light

 .. code-block:: python

-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def __init__(self, vocab_size):
             super().__init__()
             self.model = Transformer(vocab_size=vocab_size)
@@ -235,7 +235,7 @@ override the :meth:`~lightning.pytorch.LightningModule.on_train_epoch_end` metho

 .. code-block:: python

-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def __init__(self, vocab_size):
             super().__init__()
             self.model = Transformer(vocab_size=vocab_size)
@@ -269,7 +269,7 @@ To activate the validation loop while training, override the :meth:`~lightning.p

 .. code-block:: python

-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def validation_step(self, batch, batch_idx):
             inputs, target = batch
             output = self.model(inputs, target)
@@ -306,7 +306,7 @@ and calling :meth:`~lightning.pytorch.trainer.trainer.Trainer.validate`.
 .. code-block:: python

     model = LightningTransformer(vocab_size=dataset.vocab_size)
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.validate(model)

 .. note::
@@ -327,7 +327,7 @@ Note that this method is called before :meth:`~lightning.pytorch.LightningModule

 .. code-block:: python

-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def __init__(self, vocab_size):
             super().__init__()
             self.model = Transformer(vocab_size=vocab_size)
@@ -366,7 +366,7 @@ The only difference is that the test loop is only called when :meth:`~lightning.

     model = LightningTransformer(vocab_size=dataset.vocab_size)
     dataloader = DataLoader(dataset)
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.fit(model=model, train_dataloaders=dataloader)

     # automatically loads the best weights for you
@@ -377,7 +377,7 @@ There are two ways to call ``test()``:
 .. code-block:: python

     # call after training
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.fit(model=model, train_dataloaders=dataloader)

     # automatically auto-loads the best weights from the previous run
@@ -387,7 +387,7 @@ There are two ways to call ``test()``:
     model = LightningTransformer.load_from_checkpoint(PATH)
     dataset = WikiText2()
     test_dataloader = DataLoader(dataset)
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.test(model, dataloaders=test_dataloader)

 .. note::
@@ -420,7 +420,7 @@ For the example let's override ``predict_step``:

 .. code-block:: python

-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def __init__(self, vocab_size):
             super().__init__()
             self.model = Transformer(vocab_size=vocab_size)
@@ -447,7 +447,7 @@ There are two ways to call ``predict()``:
 .. code-block:: python

     # call after training
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.fit(model=model, train_dataloaders=dataloader)

     # automatically auto-loads the best weights from the previous run
@@ -457,7 +457,7 @@ There are two ways to call ``predict()``:
     model = LightningTransformer.load_from_checkpoint(PATH)
     dataset = WikiText2()
     test_dataloader = DataLoader(dataset)
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     predictions = trainer.predict(model, dataloaders=test_dataloader)

 Inference in Research
@@ -469,7 +469,7 @@ If you want to perform inference with the system, you can add a ``forward`` meth

 .. code-block:: python

-    class LightningTransformer(pl.LightningModule):
+    class LightningTransformer(L.LightningModule):
         def __init__(self, vocab_size):
             super().__init__()
             self.model = Transformer(vocab_size=vocab_size)
@@ -500,7 +500,7 @@ such as text generation:

 .. code-block:: python

-    class Seq2Seq(pl.LightningModule):
+    class Seq2Seq(L.LightningModule):
         def forward(self, x):
             embeddings = self(x)
             hidden_states = self.encoder(embeddings)
@@ -514,7 +514,7 @@ In the case where you want to scale your inference, you should be using

 .. code-block:: python

-    class Autoencoder(pl.LightningModule):
+    class Autoencoder(L.LightningModule):
         def forward(self, x):
             return self.decoder(x)

@@ -538,7 +538,7 @@ For cases like production, you might want to iterate different models inside a L
     from torchmetrics.functional import accuracy


-    class ClassificationTask(pl.LightningModule):
+    class ClassificationTask(L.LightningModule):
         def __init__(self, model):
             super().__init__()
             self.model = model
@@ -590,7 +590,7 @@ Tasks can be arbitrarily complex such as implementing GAN training, self-supervi

 .. code-block:: python

-    class GANTask(pl.LightningModule):
+    class GANTask(L.LightningModule):
         def __init__(self, generator, discriminator):
             super().__init__()
             self.generator = generator
@@ -643,7 +643,7 @@ checkpoint, which simplifies model re-instantiation after training.

 .. code-block:: python

-    class LitMNIST(pl.LightningModule):
+    class LitMNIST(L.LightningModule):
         def __init__(self, layer_1_dim=128, learning_rate=1e-2):
             super().__init__()
             # call this to save (layer_1_dim=128, learning_rate=1e-4) to the checkpoint
@@ -667,7 +667,7 @@ parameters should be provided back when reloading the LightningModule. In this c

 .. code-block:: python

-    class LitMNIST(pl.LightningModule):
+    class LitMNIST(L.LightningModule):
         def __init__(self, loss_fx, generator_network, layer_1_dim=128):
             super().__init__()
             self.layer_1_dim = layer_1_dim
diff --git a/docs/source-pytorch/data/datamodule.rst b/docs/source-pytorch/data/datamodule.rst
index 44cbc81254..2e970d6f0b 100644
--- a/docs/source-pytorch/data/datamodule.rst
+++ b/docs/source-pytorch/data/datamodule.rst
@@ -79,7 +79,7 @@ The equivalent DataModule just organizes the same exact code, but makes it reusa

 .. code-block:: python

-    class MNISTDataModule(pl.LightningDataModule):
+    class MNISTDataModule(L.LightningDataModule):
         def __init__(self, data_dir: str = "path/to/dir", batch_size: int = 32):
             super().__init__()
             self.data_dir = data_dir
@@ -125,7 +125,7 @@ Here's a more realistic, complex DataModule that shows how much more reusable th

 .. code-block:: python

-    import lightning.pytorch as pl
+    import lightning as L
     from torch.utils.data import random_split, DataLoader

     # Note - you must have torchvision installed for this example
@@ -133,7 +133,7 @@ Here's a more realistic, complex DataModule that shows how much more reusable th
     from torchvision import transforms


-    class MNISTDataModule(pl.LightningDataModule):
+    class MNISTDataModule(L.LightningDataModule):
         def __init__(self, data_dir: str = "./"):
             super().__init__()
             self.data_dir = data_dir
@@ -171,8 +171,10 @@ Here's a more realistic, complex DataModule that shows how much more reusable th
         def predict_dataloader(self):
             return DataLoader(self.mnist_predict, batch_size=32)

+
 ---------------

+
 ***********************
 LightningDataModule API
 ***********************
@@ -200,7 +202,7 @@ depends upon :ref:`prepare_data_per_node`

 .. code-block:: python

-    class MNISTDataModule(pl.LightningDataModule):
+    class MNISTDataModule(L.LightningDataModule):
         def prepare_data(self):
             # download
             MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
@@ -226,10 +228,10 @@ There are also data operations you might want to perform on every GPU. Use :meth

 .. code-block:: python

-    import lightning.pytorch as pl
+    import lightning as L


-    class MNISTDataModule(pl.LightningDataModule):
+    class MNISTDataModule(L.LightningDataModule):
         def setup(self, stage: str):
             # Assign Train/val split(s) for use in Dataloaders
             if stage == "fit":
@@ -247,7 +249,7 @@ For eg., if you are working with NLP task where you need to tokenize the text an

 .. code-block:: python

-    class LitDataModule(LightningDataModule):
+    class LitDataModule(L.LightningDataModule):
         def prepare_data(self):
             dataset = load_Dataset(...)
             train_dataset = ...
@@ -275,10 +277,10 @@ Usually you just wrap the dataset you defined in :ref:`setup None:
-        trainer = pl.Trainer(**cfg.trainer)
+        trainer = L.Trainer(**cfg.trainer)
         model = TokenClassificationModel(cfg.model, trainer=trainer)
         trainer.fit(model)

@@ -406,7 +406,7 @@ Inference from file:
 .. code-block:: python

     gpu = 1 if cfg.trainer.gpus != 0 else 0
-    trainer = pl.Trainer(accelerator="gpu", devices=gpu)
+    trainer = L.Trainer(accelerator="gpu", devices=gpu)
     model.set_trainer(trainer)
     model.evaluate_from_file(
         text_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.text_file),
@@ -638,7 +638,7 @@ Developing TTS Model From Scratch
     # hydra_runner calls hydra.main and is useful for multi-node experiments
     @hydra_runner(config_path="conf", config_name="glow_tts")
     def main(cfg):
-        trainer = pl.Trainer(**cfg.trainer)
+        trainer = L.Trainer(**cfg.trainer)
         model = GlowTTSModel(cfg=cfg.model, trainer=trainer)
         trainer.fit(model)

diff --git a/docs/source-pytorch/ecosystem/bolts.rst b/docs/source-pytorch/ecosystem/bolts.rst
index a82184dfc8..916c988190 100644
--- a/docs/source-pytorch/ecosystem/bolts.rst
+++ b/docs/source-pytorch/ecosystem/bolts.rst
@@ -83,9 +83,9 @@ We also have a collection of callbacks.
 .. code-block:: python

     from pl_bolts.callbacks import PrintTableMetricsCallback
-    import lightning.pytorch as pl
+    import lightning as L

-    trainer = pl.Trainer(callbacks=[PrintTableMetricsCallback()])
+    trainer = L.Trainer(callbacks=[PrintTableMetricsCallback()])

     # loss│train_loss│val_loss│epoch
     # ──────────────────────────────
diff --git a/docs/source-pytorch/extensions/datamodules_state.rst b/docs/source-pytorch/extensions/datamodules_state.rst
index 61710d7f11..f765c0603a 100644
--- a/docs/source-pytorch/extensions/datamodules_state.rst
+++ b/docs/source-pytorch/extensions/datamodules_state.rst
@@ -4,7 +4,10 @@ When a checkpoint is created, it asks every DataModule for their state. If your

 .. code:: python

-    class LitDataModule(pl.DataModuler):
+    import lightning as L
+
+
+    class LitDataModule(L.LightningDataModule):
         def state_dict(self):
             # track whatever you want here
             state = {"current_train_batch_index": self.current_train_batch_index}
diff --git a/docs/source-pytorch/model/build_model_advanced.rst b/docs/source-pytorch/model/build_model_advanced.rst
index 5afb3c6326..a7354050de 100644
--- a/docs/source-pytorch/model/build_model_advanced.rst
+++ b/docs/source-pytorch/model/build_model_advanced.rst
@@ -16,7 +16,10 @@ Inject custom code anywhere in the Training loop using any of the 20+ methods (:

 .. testcode::

-    class LitModel(pl.LightningModule):
+    import lightning as L
+
+
+    class LitModel(L.LightningModule):
         def backward(self, loss):
             loss.backward()
diff --git a/docs/source-pytorch/model/train_model_basic.rst b/docs/source-pytorch/model/train_model_basic.rst
index 028734e153..68111fea5d 100644
--- a/docs/source-pytorch/model/train_model_basic.rst
+++ b/docs/source-pytorch/model/train_model_basic.rst
@@ -21,7 +21,7 @@ Add the relevant imports at the top of the file
     from torchvision import transforms
     from torchvision.datasets import MNIST
     from torch.utils.data import DataLoader
-    import lightning.pytorch as pl
+    import lightning as L

 ----

@@ -60,7 +60,7 @@ The LightningModule is the full **recipe** that defines how your nn.Modules inte

 .. code:: python

-    class LitAutoEncoder(pl.LightningModule):
+    class LitAutoEncoder(L.LightningModule):
         def __init__(self, encoder, decoder):
             super().__init__()
             self.encoder = encoder
@@ -104,7 +104,7 @@ To train the model use the Lightning :doc:`Trainer <../common/trainer>` which ha
     autoencoder = LitAutoEncoder(Encoder(), Decoder())

     # train model
-    trainer = pl.Trainer()
+    trainer = L.Trainer()
     trainer.fit(model=autoencoder, train_dataloaders=train_loader)

 ----
diff --git a/docs/source-pytorch/starter/converting.rst b/docs/source-pytorch/starter/converting.rst
index fabaaaab6b..1b8991f66a 100644
--- a/docs/source-pytorch/starter/converting.rst
+++ b/docs/source-pytorch/starter/converting.rst
@@ -16,7 +16,7 @@ Keep your regular nn.Module architecture

 .. testcode::

-    import lightning.pytorch as pl
+    import lightning as L
     import torch
     import torch.nn as nn
     import torch.nn.functional as F
@@ -44,7 +44,7 @@ In the training_step of the LightningModule configure how your training routine

 .. testcode::

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def __init__(self, encoder):
             super().__init__()
             self.encoder = encoder
@@ -66,7 +66,7 @@ Move your optimizers to the :meth:`~lightning.pytorch.core.LightningModule.confi

 .. testcode::

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def configure_optimizers(self):
             optimizer = torch.optim.Adam(self.encoder.parameters(), lr=1e-3)
             lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
@@ -81,7 +81,7 @@ If you need a validation loop, configure how your validation routine behaves wit

 .. testcode::

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.encoder(x)
@@ -99,7 +99,7 @@ If you need a test loop, configure how your testing routine behaves with a batch

 .. testcode::

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.encoder(x)
@@ -115,7 +115,7 @@ If you need a prediction loop, configure how your prediction routine behaves wit

 .. testcode::

-    class LitModel(LightningModule):
+    class LitModel(L.LightningModule):
         def predict_step(self, batch, batch_idx):
             x, y = batch
             pred = self.encoder(x)
@@ -135,7 +135,7 @@ If you still need to access the current device, you can use ``self.device`` anyw

 .. testcode::

-    class LitModel(LightningModule):
+    class LitModel(L.LightningModule):
         def training_step(self, batch, batch_idx):
             z = torch.randn(4, 5, device=self.device)
             ...
@@ -145,7 +145,7 @@ Hint: If you are initializing a :class:`~torch.Tensor` within the ``LightningMod

 .. testcode::

-    class LitModel(LightningModule):
+    class LitModel(L.LightningModule):
         def __init__(self):
             super().__init__()
             self.register_buffer("running_mean", torch.zeros(num_features))
diff --git a/docs/source-pytorch/starter/introduction.rst b/docs/source-pytorch/starter/introduction.rst
index d1ffb8dbe0..423487290a 100644
--- a/docs/source-pytorch/starter/introduction.rst
+++ b/docs/source-pytorch/starter/introduction.rst
@@ -112,7 +112,7 @@ A LightningModule enables your PyTorch nn.Module to play together in complex way
     from torch import optim, nn, utils, Tensor
     from torchvision.datasets import MNIST
     from torchvision.transforms import ToTensor
-    import lightning.pytorch as pl
+    import lightning as L

     # define any number of nn.Modules (or use your current ones)
     encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
@@ -120,7 +120,7 @@ A LightningModule enables your PyTorch nn.Module to play together in complex way


     # define the LightningModule
-    class LitAutoEncoder(pl.LightningModule):
+    class LitAutoEncoder(L.LightningModule):
         def __init__(self, encoder, decoder):
             super().__init__()
             self.encoder = encoder
@@ -171,7 +171,7 @@ The Lightning :doc:`Trainer <../common/trainer>` "mixes" any :doc:`LightningModu
 .. code-block:: python

     # train the model (hint: here are some helpful Trainer arguments for rapid idea iteration)
-    trainer = pl.Trainer(limit_train_batches=100, max_epochs=1)
+    trainer = L.Trainer(limit_train_batches=100, max_epochs=1)
     trainer.fit(model=autoencoder, train_dataloaders=train_loader)

 The Lightning :doc:`Trainer <../common/trainer>` automates `40+ tricks <../common/trainer.html#trainer-flags>`_ including:
@@ -237,7 +237,7 @@ Enable advanced training features using Trainer arguments. These are state-of-th
     )

     # train 1TB+ parameter models with Deepspeed/fsdp
-    trainer = Trainer(
+    trainer = L.Trainer(
         devices=4,
         accelerator="gpu",
         strategy="deepspeed_stage_2",
@@ -245,7 +245,7 @@ Enable advanced training features using Trainer arguments. These are state-of-th
     )

     # 20+ helpful flags for rapid idea iteration
-    trainer = Trainer(
+    trainer = L.Trainer(
         max_epochs=10,
         min_epochs=5,
         overfit_batches=1
@@ -276,7 +276,7 @@ Inject custom code anywhere in the Training loop using any of the 20+ methods (:

 .. testcode::

-    class LitAutoEncoder(pl.LightningModule):
+    class LitAutoEncoder(L.LightningModule):
         def backward(self, loss):
             loss.backward()
diff --git a/docs/source-pytorch/starter/style_guide.rst b/docs/source-pytorch/starter/style_guide.rst
index 8902472508..0dac0416c5 100644
--- a/docs/source-pytorch/starter/style_guide.rst
+++ b/docs/source-pytorch/starter/style_guide.rst
@@ -139,7 +139,7 @@ In practice, the code looks like this:

 .. code-block::

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):

         def __init__(...):

diff --git a/docs/source-pytorch/visualize/logging_basic.rst b/docs/source-pytorch/visualize/logging_basic.rst
index 61de5a9e17..67c8b604f5 100644
--- a/docs/source-pytorch/visualize/logging_basic.rst
+++ b/docs/source-pytorch/visualize/logging_basic.rst
@@ -27,7 +27,7 @@ To track a metric, simply use the *self.log* method available inside the *Lightn

 .. code-block:: python

-    class LitModel(pl.LightningModule):
+    class LitModel(L.LightningModule):
         def training_step(self, batch, batch_idx):
             value = ...
             self.log("some_value", value)
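
For quick reference, below is a minimal, self-contained sketch of the ``import lightning as L`` convention that every hunk above migrates the docs to. It is not part of the diff: the toy regressor, the random dataset, and the hyperparameters are illustrative placeholders, but the namespace usage (``L.LightningModule``, ``L.Trainer``) matches the pattern introduced by these changes.

.. code-block:: python

    # Minimal sketch (not part of the diff above): the top-level `lightning` namespace
    # exposes the same Trainer/LightningModule surface the docs previously imported as `pl`.
    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    import lightning as L


    class LitRegressor(L.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(8, 1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            loss = nn.functional.mse_loss(self.layer(x), y)
            self.log("train_loss", loss)
            return loss

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)


    # Illustrative random data; any DataLoader works here.
    train_loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=16)
    trainer = L.Trainer(max_epochs=1, accelerator="auto", devices=1)
    trainer.fit(LitRegressor(), train_loader)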