diff --git a/docs/source/accelerators/gpu.rst b/docs/source/accelerators/gpu.rst
index a5f7328718..8e8e5a8c05 100644
--- a/docs/source/accelerators/gpu.rst
+++ b/docs/source/accelerators/gpu.rst
@@ -244,8 +244,8 @@ The table below lists examples of possible input formats and how they are interp
 
 .. note::
 
-    When specifying number of gpus as an integer ``devices=k``, setting the trainer flag
-    ``auto_select_gpus=True`` will automatically help you find ``k`` gpus that are not
+    When specifying the number of devices as an integer ``devices=k``, setting the trainer flag
+    ``auto_select_gpus=True`` will automatically help you find ``k`` GPUs that are not
     occupied by other processes. This is especially useful when GPUs are configured
     to be in "exclusive mode", such that only one process at a time can access them.
     For more details see the :doc:`trainer guide <../common/trainer>`.
@@ -295,7 +295,7 @@ For a deeper understanding of what Lightning is doing, feel free to read this
 Data Parallel
 ^^^^^^^^^^^^^
 :class:`~torch.nn.DataParallel` (DP) splits a batch across k GPUs.
-That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples,
+That is, if you have a batch of 32 and use DP with 2 GPUs, each GPU will process 16 samples,
 after which the root node will aggregate the results.
 
 .. warning:: DP use is discouraged by PyTorch and Lightning. State is not maintained on the replicas created by the
@@ -749,7 +749,7 @@ Let's say you have a batch size of 7 in your dataloader.
         def train_dataloader(self):
             return Dataset(..., batch_size=7)
 
-In DDP, DDP_SPAWN, Deepspeed, DDP_SHARDED, or Horovod your effective batch size will be 7 * gpus * num_nodes.
+In DDP, DDP_SPAWN, Deepspeed, DDP_SHARDED, or Horovod your effective batch size will be 7 * devices * num_nodes.
 
 .. code-block:: python
 
@@ -786,7 +786,7 @@ The reason is that the full batch is visible to all GPUs on the node when using
 
 Torch Distributed Elastic
 -------------------------
-Lightning supports the use of Torch Distributed Elastic to enable fault-tolerant and elastic distributed job scheduling. To use it, specify the 'ddp' or 'ddp2' backend and the number of gpus you want to use in the trainer.
+Lightning supports the use of Torch Distributed Elastic to enable fault-tolerant and elastic distributed job scheduling. To use it, specify the 'ddp' or 'ddp2' backend and the number of GPUs you want to use in the trainer.
 
 .. code-block:: python
 
diff --git a/docs/source/accelerators/ipu.rst b/docs/source/accelerators/ipu.rst
index dd4207305f..b85f9124c2 100644
--- a/docs/source/accelerators/ipu.rst
+++ b/docs/source/accelerators/ipu.rst
@@ -34,7 +34,7 @@ Specify the number of IPUs to train with. Note that when training with IPUs, you
 
 .. code-block:: python
 
-    trainer = pl.Trainer(ipus=8)  # Train using data parallel on 8 IPUs
+    trainer = pl.Trainer(accelerator="ipu", devices=8)  # Train using data parallel on 8 IPUs
 
 IPUs only support specifying a single number to allocate devices, which is handled via the underlying libraries.
 
@@ -53,7 +53,7 @@ set the precision flag.
     import pytorch_lightning as pl
 
     model = MyLightningModule()
-    trainer = pl.Trainer(ipus=8, precision=16)
+    trainer = pl.Trainer(accelerator="ipu", devices=8, precision=16)
     trainer.fit(model)
 
 You can also use pure 16-bit training, where the weights are also in 16-bit precision.
@@ -65,7 +65,7 @@ You can also use pure 16-bit training, where the weights are also in 16-bit prec
 
     model = MyLightningModule()
     model = model.half()
-    trainer = pl.Trainer(ipus=8, precision=16)
+    trainer = pl.Trainer(accelerator="ipu", devices=8, precision=16)
     trainer.fit(model)
 
 Advanced IPU options
@@ -83,7 +83,7 @@ IPUs provide further optimizations to speed up training. By using the ``IPUStrat
     from pytorch_lightning.strategies import IPUStrategy
 
     model = MyLightningModule()
-    trainer = pl.Trainer(ipus=8, strategy=IPUStrategy(device_iterations=32))
+    trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(device_iterations=32))
     trainer.fit(model)
 
 Note that by default we return the last device iteration loss. You can override this by passing in your own ``poptorch.Options`` and setting the AnchorMode as described in the `PopTorch documentation <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.Options.anchorMode>`__.
@@ -102,7 +102,9 @@ Note that by default we return the last device iteration loss. You can override
     training_opts.anchorMode(poptorch.AnchorMode.All)
     training_opts.deviceIterations(32)
 
-    trainer = Trainer(ipus=8, strategy=IPUStrategy(inference_opts=inference_opts, training_opts=training_opts))
+    trainer = Trainer(
+        accelerator="ipu", devices=8, strategy=IPUStrategy(inference_opts=inference_opts, training_opts=training_opts)
+    )
     trainer.fit(model)
 
 You can also override all options by passing the ``poptorch.Options`` to the plugin. See `PopTorch options documentation <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/batching.html>`__ for more information.
@@ -124,7 +126,7 @@ Lightning supports dumping all reports to a directory to open using the tool.
     from pytorch_lightning.strategies import IPUStrategy
 
     model = MyLightningModule()
-    trainer = pl.Trainer(ipus=8, strategy=IPUStrategy(autoreport_dir="report_dir/"))
+    trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(autoreport_dir="report_dir/"))
     trainer.fit(model)
 
 This will dump all reports to ``report_dir/`` which can then be opened using the Graph Analyser Tool, see `Opening Reports <https://docs.graphcore.ai/projects/graph-analyser-userguide/en/latest/graph-analyser.html#opening-reports>`__.
@@ -142,7 +144,7 @@ Below is an example using the block annotation in a LightningModule.
 
     Currently, when using model parallelism we do not infer the number of IPUs required for you. This is done via the annotations themselves. If you specify 4 different IDs when defining Blocks, this means your model will be split onto 4 different IPUs.
 
-    This is also mutually exclusive with the Trainer flag. In other words, if your model is split onto 2 IPUs and you set ``Trainer(ipus=4)`` this will require 8 IPUs in total: data parallelism will be used to replicate the two-IPU model 4 times.
+    This is also mutually exclusive with the Trainer flag. In other words, if your model is split onto 2 IPUs and you set ``Trainer(accelerator="ipu", devices=4)`` this will require 8 IPUs in total: data parallelism will be used to replicate the two-IPU model 4 times.
 
     When pipelining the model you must also increase the `device_iterations` to ensure full data saturation of the devices data, i.e whilst one device in the model pipeline processes a batch of data, the other device can start on the next batch. For example if the model is split onto 4 IPUs, we require `device_iterations` to be at-least 4.
 
@@ -174,7 +176,7 @@ Below is an example using the block annotation in a LightningModule.
 
 
     model = MyLightningModule()
-    trainer = pl.Trainer(ipus=8, strategy=IPUStrategy(device_iterations=20))
+    trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(device_iterations=20))
     trainer.fit(model)
 
 
@@ -217,7 +219,7 @@ You can also use the block context manager within the forward function, or any o
 
 
     model = MyLightningModule()
-    trainer = pl.Trainer(ipus=8, strategy=IPUStrategy(device_iterations=20))
+    trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(device_iterations=20))
     trainer.fit(model)
 
 
diff --git a/docs/source/accelerators/tpu.rst b/docs/source/accelerators/tpu.rst
index 6bea3fb6b7..86f072ec4f 100644
--- a/docs/source/accelerators/tpu.rst
+++ b/docs/source/accelerators/tpu.rst
@@ -127,7 +127,7 @@ TPU core training
 
 Lightning supports training on a single TPU core or 8 TPU cores.
 
-The Trainer parameters ``tpu_cores`` defines how many TPU cores to train on (1 or 8) / Single TPU to train on [1].
+The Trainer parameter ``devices``, together with ``accelerator="tpu"``, defines how many TPU cores to train on (1 or 8) or, when given as a list such as ``[1]``, which single TPU core to train on.
 
 For Single TPU training, Just pass the TPU core ID [1-8] in a list.
 
diff --git a/docs/source/advanced/model_parallel.rst b/docs/source/advanced/model_parallel.rst
index fa21a4c5f4..18c83bde74 100644
--- a/docs/source/advanced/model_parallel.rst
+++ b/docs/source/advanced/model_parallel.rst
@@ -732,7 +732,8 @@ When enabled, it can result in a performance hit and can be disabled in most cas
     from pytorch_lightning.strategies import DDPStrategy
 
     trainer = pl.Trainer(
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy=DDPStrategy(find_unused_parameters=False),
     )
 
@@ -741,7 +742,8 @@ When enabled, it can result in a performance hit and can be disabled in most cas
     from pytorch_lightning.strategies import DDPSpawnStrategy
 
     trainer = pl.Trainer(
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy=DDPSpawnStrategy(find_unused_parameters=False),
     )
 
@@ -894,7 +896,8 @@ When using Post-localSGD, you must also pass ``model_averaging_period`` to allow
 
     model = MyModel()
     trainer = Trainer(
-        gpus=4,
+        accelerator="gpu",
+        devices=4,
         strategy=DDPStrategy(
             ddp_comm_state=post_localSGD.PostLocalSGDState(
                 process_group=None,
diff --git a/docs/source/clouds/cloud_training.rst b/docs/source/clouds/cloud_training.rst
index 8dea5bd0a1..3f9cbf745e 100644
--- a/docs/source/clouds/cloud_training.rst
+++ b/docs/source/clouds/cloud_training.rst
@@ -32,7 +32,7 @@ You can launch any Lightning model on Grid using the Grid `CLI <https://pypi.org
 
 .. code-block:: bash
 
-    grid run --instance_type v100 --gpus 4 my_model.py --gpus 4 --learning_rate 'uniform(1e-6, 1e-1, 20)' --layers '[2, 4, 8, 16]'
+    grid run --instance_type v100 --gpus 4 my_model.py --accelerator 'gpu' --devices 4 --learning_rate 'uniform(1e-6, 1e-1, 20)' --layers '[2, 4, 8, 16]'
 
 You can also start runs or interactive sessions from the `Grid platform <https://platform.grid.ai>`_, where you can upload datasets, view artifacts, view the logs, the cost, log into tensorboard, and so much more.
 
diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst
index bf22f79dc6..56dfa18a5a 100644
--- a/docs/source/common/trainer.rst
+++ b/docs/source/common/trainer.rst
@@ -217,7 +217,7 @@ as well as custom accelerator instances.
     # CPU accelerator
     trainer = Trainer(accelerator="cpu")
 
-    # Training with GPU Accelerator using 2 gpus
+    # Training with GPU Accelerator using 2 GPUs
     trainer = Trainer(devices=2, accelerator="gpu")
 
     # Training with TPU Accelerator using 8 tpu cores
@@ -350,16 +350,16 @@ auto_select_gpus
 
 |
 
-If enabled and `gpus` is an integer, pick available gpus automatically.
+If enabled and ``devices`` is an integer, pick available GPUs automatically.
 This is especially useful when GPUs are configured to be in "exclusive mode",
 such that only one process at a time can access them.
 
 Example::
 
-    # no auto selection (picks first 2 gpus on system, may fail if other process is occupying)
+    # no auto selection (picks first 2 GPUs on system, may fail if other process is occupying)
     trainer = Trainer(accelerator="gpu", devices=2, auto_select_gpus=False)
 
-    # enable auto selection (will find two available gpus on system)
+    # enable auto selection (will find two available GPUs on system)
     trainer = Trainer(accelerator="gpu", devices=2, auto_select_gpus=True)
 
     # specifies all GPUs regardless of its availability
@@ -696,8 +696,8 @@ See Also:
 gpus
 ^^^^
 
-.. warning:: Setting `Trainer(gpus=x)` is deprecated in v1.6 and will be removed"
-    in v2.0. Please use `Trainer(accelerator='gpu', devices=x)` instead.
+.. warning:: Setting ``Trainer(gpus=x)`` is deprecated in v1.6 and will be removed
+    in v2.0. Please use ``Trainer(accelerator="gpu", devices=x)`` instead.
 
 .. raw:: html
 
@@ -1189,7 +1189,7 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin
     trainer = Trainer(precision=32)
 
     # 16-bit precision
-    trainer = Trainer(precision=16, gpus=1)  # works only on CUDA
+    trainer = Trainer(precision=16, accelerator="gpu", devices=1)  # works only on CUDA
 
     # bfloat16 precision
     trainer = Trainer(precision="bf16")
@@ -1214,7 +1214,7 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin
         :skipif: not _APEX_AVAILABLE or not torch.cuda.is_available()
 
         # turn on 16-bit
-        trainer = Trainer(amp_backend="apex", amp_level="O2", precision=16, gpus=1)
+        trainer = Trainer(amp_backend="apex", amp_level="O2", precision=16, accelerator="gpu", devices=1)
 
 
 process_position
@@ -1412,7 +1412,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc
 
 .. code-block:: python
 
-    # Training with the DistributedDataParallel strategy on 4 gpus
+    # Training with the DistributedDataParallel strategy on 4 GPUs
     trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
 
     # Training with the DDP Spawn strategy using 4 cpu processes
diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst
index cd8327d52b..dc8f29dfff 100644
--- a/docs/source/guides/speed.rst
+++ b/docs/source/guides/speed.rst
@@ -37,10 +37,10 @@ Lightning supports a variety of plugins to speed up distributed GPU training. Mo
     # run on 1 gpu
     trainer = Trainer(accelerator="gpu", devices=1)
 
-    # train on 8 gpus, using the DDP strategy
+    # train on 8 GPUs, using the DDP strategy
     trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp")
 
-    # train on multiple GPUs across nodes (uses 8 gpus in total)
+    # train on multiple GPUs across nodes (uses 8 GPUs in total)
     trainer = Trainer(accelerator="gpu", devices=2, num_nodes=4)
 
 
@@ -140,7 +140,7 @@ This is a limitation of Python ``.spawn()`` and PyTorch.
 TPU Training
 ============
 
-You can set the ``tpu_cores`` trainer flag to 1, [7] (specific core) or eight cores.
+You can set the ``devices`` trainer argument to ``1`` (a single core), ``[7]`` (a specific core) or ``8`` (all eight cores).
 
 .. code-block:: python
 
@@ -214,7 +214,7 @@ Lightning offers mixed precision training for GPUs and CPUs, as well as bfloat16
     :skipif: torch.cuda.device_count() < 4
 
     # 16-bit precision
-    trainer = Trainer(precision=16, gpus=4)
+    trainer = Trainer(precision=16, accelerator="gpu", devices=4)
 
 
 Read more about :ref:`mixed-precision training <amp>`.
@@ -361,7 +361,7 @@ Here is an example of an advanced use case:
 
 .. testcode::
 
-    # Scenario for a GAN with gradient accumulation every two batches and optimized for multiple gpus.
+    # Scenario for a GAN with gradient accumulation every two batches and optimized for multiple GPUs.
     class SimpleGAN(LightningModule):
         def __init__(self):
             super().__init__()
diff --git a/docs/source/starter/introduction.rst b/docs/source/starter/introduction.rst
index 791f71d14f..c4ea775bd4 100644
--- a/docs/source/starter/introduction.rst
+++ b/docs/source/starter/introduction.rst
@@ -390,10 +390,10 @@ CPU
     trainer = Trainer()
 
     # train on 8 CPUs
-    trainer = Trainer(num_processes=8)
+    trainer = Trainer(accelerator="cpu", devices=8)
 
     # train on 1024 CPUs across 128 machines
-    trainer = pl.Trainer(num_processes=8, num_nodes=128)
+    trainer = pl.Trainer(accelerator="cpu", devices=8, num_nodes=128)
 
 GPU
 ---
@@ -403,10 +403,10 @@ GPU
     # train on 1 GPU
     trainer = pl.Trainer(accelerator="gpu", devices=1)
 
-    # train on multiple GPUs across nodes (32 gpus here)
+    # train on multiple GPUs across nodes (32 GPUs here)
     trainer = pl.Trainer(accelerator="gpu", devices=4, num_nodes=8)
 
-    # train on gpu 1, 3, 5 (3 gpus total)
+    # train on GPUs 1, 3, 5 (3 GPUs total)
     trainer = pl.Trainer(accelerator="gpu", devices=[1, 3, 5])
 
     # Multi GPU with mixed precision
@@ -437,7 +437,7 @@ IPU
 .. code-block:: python
 
     # Train on IPUs
-    trainer = pl.Trainer(ipus=8)
+    trainer = pl.Trainer(accelerator="ipu", devices=8)
 
 
 Checkpointing
diff --git a/docs/source/starter/lightning_lite.rst b/docs/source/starter/lightning_lite.rst
index 95263dff12..2a838d75a4 100644
--- a/docs/source/starter/lightning_lite.rst
+++ b/docs/source/starter/lightning_lite.rst
@@ -182,7 +182,7 @@ Here is an example while running on 256 GPUs (eight GPUs times 32 nodes).
             self.barrier()
 
 
-    Lite(strategy="ddp", gpus=8, num_nodes=32, accelerator="gpu").run()
+    Lite(strategy="ddp", devices=8, num_nodes=32, accelerator="gpu").run()
 
 
 If you require custom data or model device placement, you can deactivate
diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md
index e4133df93b..2420cde190 100644
--- a/pl_examples/basic_examples/README.md
+++ b/pl_examples/basic_examples/README.md
@@ -50,7 +50,7 @@ This script shows you the result of the conversion to the `LightningModule` and
 python mnist_examples/image_classifier_4_lightning_module.py
 
 # GPUs (any number)
-python mnist_examples/image_classifier_4_lightning_module.py --trainer.gpus 2
+python mnist_examples/image_classifier_4_lightning_module.py --trainer.accelerator 'gpu' --trainer.devices 2
 ```
 
 ______________________________________________________________________
@@ -64,10 +64,10 @@ This script shows you how to extract the data related components into a `Lightni
 python mnist_examples/image_classifier_5_lightning_datamodule.py
 
 # GPUs (any number)
-python mnist_examples/image_classifier_5_lightning_datamodule.py --trainer.gpus 2
+python mnist_examples/image_classifier_5_lightning_datamodule.py --trainer.accelerator 'gpu' --trainer.devices 2
 
 # Distributed Data Parallel (DDP)
-python mnist_examples/image_classifier_5_lightning_datamodule.py --trainer.gpus 2 --trainer.strategy 'ddp'
+python mnist_examples/image_classifier_5_lightning_datamodule.py --trainer.accelerator 'gpu' --trainer.devices 2 --trainer.strategy 'ddp'
 ```
 
 ______________________________________________________________________
@@ -81,10 +81,10 @@ This script shows you how to implement a CNN auto-encoder.
 python autoencoder.py
 
 # GPUs (any number)
-python autoencoder.py --trainer.gpus 2
+python autoencoder.py --trainer.accelerator 'gpu' --trainer.devices 2
 
 # Distributed Data Parallel (DDP)
-python autoencoder.py --trainer.gpus 2 --trainer.strategy 'ddp'
+python autoencoder.py --trainer.accelerator 'gpu' --trainer.devices 2 --trainer.strategy 'ddp'
 ```
 
 ______________________________________________________________________
@@ -99,10 +99,10 @@ A system describes a `LightningModule` which takes a single `torch.nn.Module` wh
 python backbone_image_classifier.py
 
 # GPUs (any number)
-python backbone_image_classifier.py --trainer.gpus 2
+python backbone_image_classifier.py --trainer.accelerator 'gpu' --trainer.devices 2
 
 # Distributed Data Parallel (DDP)
-python backbone_image_classifier.py --trainer.gpus 2 --trainer.strategy 'ddp'
+python backbone_image_classifier.py --trainer.accelerator 'gpu' --trainer.devices 2 --trainer.strategy 'ddp'
 ```
 
 ______________________________________________________________________
diff --git a/pl_examples/basic_examples/mnist_examples/README.md b/pl_examples/basic_examples/mnist_examples/README.md
index c82960af1f..2cc749509f 100644
--- a/pl_examples/basic_examples/mnist_examples/README.md
+++ b/pl_examples/basic_examples/mnist_examples/README.md
@@ -46,7 +46,7 @@ This script shows you the result of the conversion to the `LightningModule` and
 python image_classifier_4_lightning_module.py
 
 # GPUs (any number)
-python image_classifier_4_lightning_module.py --trainer.gpus 2
+python image_classifier_4_lightning_module.py --trainer.accelerator 'gpu' --trainer.devices 2
 ```
 
 ______________________________________________________________________
@@ -60,8 +60,8 @@ This script shows you how to extract the data related components into a `Lightni
 python image_classifier_5_lightning_datamodule.py
 
 # GPUs (any number)
-python image_classifier_5_lightning_datamodule.py --trainer.gpus 2
+python image_classifier_5_lightning_datamodule.py --trainer.accelerator 'gpu' --trainer.devices 2
 
 # Distributed Data parallel
-python image_classifier_5_lightning_datamodule.py --trainer.gpus 2 --trainer.strategy 'ddp'
+python image_classifier_5_lightning_datamodule.py --trainer.accelerator 'gpu' --trainer.devices 2 --trainer.strategy 'ddp'
 ```
diff --git a/pl_examples/basic_examples/profiler_example.py b/pl_examples/basic_examples/profiler_example.py
index cec8a9727a..2d64af6147 100644
--- a/pl_examples/basic_examples/profiler_example.py
+++ b/pl_examples/basic_examples/profiler_example.py
@@ -40,7 +40,8 @@ DEFAULT_CMD_LINE = (
     "--trainer.limit_train_batches=15",
     "--trainer.limit_val_batches=15",
     "--trainer.profiler=pytorch",
-    f"--trainer.gpus={int(torch.cuda.is_available())}",
+    f"--trainer.accelerator={'gpu' if torch.cuda.is_available() else 'cpu'}",  # CPU fallback without CUDA
+    "--trainer.devices=1",
 )
 
 
diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py
index f6f31d105a..3935269ea0 100644
--- a/pl_examples/ipu_examples/mnist.py
+++ b/pl_examples/ipu_examples/mnist.py
@@ -78,7 +78,7 @@ if __name__ == "__main__":
 
     model = LitClassifier()
 
-    trainer = pl.Trainer(max_epochs=2, ipus=8)
+    trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices=8)
 
     trainer.fit(model, datamodule=dm)
     trainer.test(model, datamodule=dm)
diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py
index 00ca558c53..c918b9561c 100644
--- a/pl_examples/test_examples.py
+++ b/pl_examples/test_examples.py
@@ -28,7 +28,7 @@ ARGS_DEFAULT = (
     "--trainer.limit_predict_batches 2 "
     "--data.batch_size 32 "
 )
-ARGS_GPU = ARGS_DEFAULT + "--trainer.gpus 1 "
+ARGS_GPU = ARGS_DEFAULT + "--trainer.accelerator gpu --trainer.devices 1 "
 
 
 @pytest.mark.skipif(not _DALI_AVAILABLE, reason="Nvidia DALI required")