From 9ba76ce60c62f77ea729b3111d7eb79c16fdb7be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Fri, 7 May 2021 16:10:24 +0200
Subject: [PATCH] Unify `configure_optimizers` docs (#7399)

---
 docs/source/common/optimizers.rst   | 119 ++++------------------------
 pytorch_lightning/core/lightning.py |  96 +++++++++++++++++-----
 2 files changed, 91 insertions(+), 124 deletions(-)

diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst
index 9b12aff2c1..ca10dc97f1 100644
--- a/docs/source/common/optimizers.rst
+++ b/docs/source/common/optimizers.rst
@@ -166,17 +166,25 @@ Here is an example training a simple GAN with multiple optimizers.
 
 -----
 
+Learning rate scheduling
+------------------------
+Every optimizer you use can be paired with any
+`Learning Rate Scheduler `_. Please see the
+documentation of :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_optimizers` for all the available options.
+
+-----
+
 Learning rate scheduling [manual]
 ---------------------------------
 You can call ``lr_scheduler.step()`` at arbitrary intervals.
-Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers
-defined in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`.
+Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.core.lightning.LightningModule` to access any learning rate schedulers
+defined in your :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_optimizers`.
 
 .. warning::
     * Before 1.3, Lightning automatically called ``lr_scheduler.step()`` in both automatic and manual optimization. From
      1.3, ``lr_scheduler.step()`` is now for the user to call at arbitrary intervals.
-    * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored even if they are provided in
-      your ``configure_optimizers()`` during manual optimization.
+    * Note that the ``lr_dict`` keys, such as ``"step"`` and ``"interval"``, will be ignored even if they are provided in
+      your :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_optimizers` during manual optimization.
 
 Here is an example calling ``lr_scheduler.step()`` every step.
 
@@ -341,7 +349,7 @@ Here is an example using a closure function.
 Access your own optimizer [manual]
 ----------------------------------
 ``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer
-configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own optimizer
+configured in your :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_optimizers`. You can access your own optimizer
 with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to
 support accelerators and precision for you.
 
@@ -402,99 +410,6 @@ In the case of multiple optimizers, Lightning does the following:
 
 -----
 
-Learning rate scheduling
-------------------------
-Every optimizer you use can be paired with any
-`Learning Rate Scheduler `_. In the basic
-use-case, the scheduler(s) should be returned as the second output from the
-:meth:`~pytorch_lightning.LightningModule.configure_optimizers` method:
-
-.. testcode:: python
-
-    # no LR scheduler
-    def configure_optimizers(self):
-        return Adam(...)
-
-    # Adam + LR scheduler
-    def configure_optimizers(self):
-        optimizer = Adam(...)
-        scheduler = LambdaLR(optimizer, ...)
-        return [optimizer], [scheduler]
-
-    # Two optimizers each with a scheduler
-    def configure_optimizers(self):
-        optimizer1 = Adam(...)
-        optimizer2 = SGD(...)
-        scheduler1 = LambdaLR(optimizer1, ...)
-        scheduler2 = LambdaLR(optimizer2, ...)
-        return [optimizer1, optimizer2], [scheduler1, scheduler2]
-
-When there are schedulers in which the ``.step()`` method is conditioned on a metric value, such as the
-:class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler, Lightning requires that the output from
-:meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, with the
-keyword ``"monitor"`` set to metric that the scheduler should be conditioned on.
-
-.. testcode::
-
-    # The ReduceLROnPlateau scheduler requires a monitor
-    def configure_optimizers(self):
-        optimizer = Adam(...)
-        return {
-            'optimizer': optimizer,
-            'lr_scheduler': ReduceLROnPlateau(optimizer, ...),
-            'monitor': 'metric_to_track',
-        }
-
-    # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler
-    def configure_optimizers(self):
-        optimizer1 = Adam(...)
-        optimizer2 = SGD(...)
-        scheduler1 = ReduceLROnPlateau(optimizer1, ...)
-        scheduler2 = LambdaLR(optimizer2, ...)
-        return (
-            {'optimizer': optimizer1, 'lr_scheduler': scheduler1, 'monitor': 'metric_to_track'},
-            {'optimizer': optimizer2, 'lr_scheduler': scheduler2},
-        )
-
-.. note::
-    Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` in
-    your :class:`~pytorch_lightning.LightningModule`.
-
-By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration
-should be returned as a dict which can contain the following keywords:
-
-* ``"scheduler"`` (required): the actual scheduler object
-* ``"monitor"`` (optional): metric to condition
-* ``"interval"`` (optional): either ``"epoch"`` (default) for stepping after each epoch ends or ``"step"`` for stepping
-  after each optimization step
-* ``"frequency"`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1,
-  corresponding to updating the learning rate after every epoch/step.
-* ``"strict"`` (optional): if set to ``True``, will enforce that value specified in ``"monitor"`` is available while
-  trying to call ``scheduler.step()``, and stop training if not found. If ``False``, it will only give a warning and
-  continue training without calling the scheduler.
-* ``"name"`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the
-  learning rate progress, this keyword can be used to specify a name the learning rate should be logged as.
-
-.. testcode:: python
-
-    # Same as the above example with additional params passed to the first scheduler
-    # In this case the ReduceLROnPlateau will step after every 10 processed batches
-    def configure_optimizers(self):
-        optimizers = [Adam(...), SGD(...)]
-        schedulers = [
-            {
-                'scheduler': ReduceLROnPlateau(optimizers[0], ...),
-                'monitor': 'metric_to_track',
-                'interval': 'step',
-                'frequency': 10,
-                'strict': True,
-            },
-            LambdaLR(optimizers[1], ...)
-        ]
-        return optimizers, schedulers
-
------
-
 Use multiple optimizers (like GANs)
 -----------------------------------
 To use multiple optimizers (optionally with learning rate schedulers), return two or more optimizers from
@@ -540,7 +455,7 @@ Under the hood, Lightning will call each optimizer sequentially:
 Step optimizers at arbitrary intervals
 --------------------------------------
 To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling,
-override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function.
+override the :meth:`~pytorch_lightning.core.lightning.LightningModule.optimizer_step` function.
 
 .. warning::
     If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter to
@@ -591,9 +506,9 @@ Here we add a learning rate warm-up.
 Access your own optimizer
 -------------------------
 ``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer
-configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own optimizer
-with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to
-support accelerators and precision for you.
+configured in your :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_optimizers`.
+You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer
+to perform a step, Lightning won't be able to support accelerators and precision for you.
 
 .. testcode:: python
 
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 109b8fd810..39eb70255d 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -1116,30 +1116,77 @@ class LightningModule(
 
             - **Single optimizer**.
             - **List or Tuple** of optimizers.
-            - **Two lists** - The first list has multiple optimizers, and the second has multiple LR schedulers (or
-              multiple lr_dict).
+            - **Two lists** - The first list has multiple optimizers, and the second has multiple LR schedulers
+              (or multiple ``lr_dict``).
             - **Dictionary**, with an ``"optimizer"`` key, and (optionally) a ``"lr_scheduler"``
-              key whose value is a single LR scheduler or lr_dict.
+              key whose value is a single LR scheduler or ``lr_dict``.
             - **Tuple of dictionaries** as described above, with an optional ``"frequency"`` key.
             - **None** - Fit will run without any optimizer.
 
-        Note:
-            The lr_dict is a dictionary which contains the scheduler and its associated configuration.
-            The default configuration is shown below.
+        The ``lr_dict`` is a dictionary which contains the scheduler and its associated configuration.
+        The default configuration is shown below.
 
-            .. code-block:: python
+        .. code-block:: python
 
-                lr_dict = {
-                    'scheduler': lr_scheduler,  # The LR scheduler instance (required)
-                    # The unit of the scheduler's step size, could also be 'step'
-                    'interval': 'epoch',
-                    'frequency': 1,  # The frequency of the scheduler
-                    'monitor': 'val_loss',  # Metric for `ReduceLROnPlateau` to monitor
-                    'strict': True,  # Whether to crash the training if `monitor` is not found
-                    'name': None,  # Custom name for `LearningRateMonitor` to use
+            lr_dict = {
+                # REQUIRED: The scheduler instance
+                'scheduler': lr_scheduler,
+                # The unit of the scheduler's step size, could also be 'step'.
+                # 'epoch' updates the scheduler on epoch end whereas 'step'
+                # updates it after an optimizer update.
+                'interval': 'epoch',
+                # How many epochs/steps should pass between calls to
+                # `scheduler.step()`. 1 corresponds to updating the learning
+                # rate after every epoch/step.
+                'frequency': 1,
+                # Metric to monitor for schedulers like `ReduceLROnPlateau`
+                'monitor': 'val_loss',
+                # If set to `True`, will enforce that the value specified in 'monitor'
+                # is available when the scheduler is updated, thus stopping
+                # training if not found. If set to `False`, it will only produce a warning.
+                'strict': True,
+                # If using the `LearningRateMonitor` callback to monitor the
+                # learning rate progress, this keyword can be used to specify
+                # a custom logged name
+                'name': None,
+            }
+
+        When there are schedulers in which the ``.step()`` method is conditioned on a value, such as the
+        :class:`torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler, Lightning requires that the ``lr_dict``
+        contains the keyword ``"monitor"`` set to the metric name that the scheduler should be conditioned on.
+
+        .. testcode::
+
+            # The ReduceLROnPlateau scheduler requires a monitor
+            def configure_optimizers(self):
+                optimizer = Adam(...)
+                return {
+                    'optimizer': optimizer,
+                    'lr_scheduler': {
+                        'scheduler': ReduceLROnPlateau(optimizer, ...),
+                        'monitor': 'metric_to_track',
+                    }
                 }
 
-            Only the ``"scheduler"`` key is required, the rest will be set to the defaults above.
+            # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler
+            def configure_optimizers(self):
+                optimizer1 = Adam(...)
+                optimizer2 = SGD(...)
+                scheduler1 = ReduceLROnPlateau(optimizer1, ...)
+                scheduler2 = LambdaLR(optimizer2, ...)
+                return (
+                    {
+                        'optimizer': optimizer1,
+                        'lr_scheduler': {
+                            'scheduler': scheduler1,
+                            'monitor': 'metric_to_track',
+                        }
+                    },
+                    {'optimizer': optimizer2, 'lr_scheduler': scheduler2}
+                )
+
+        Metrics can be made available to monitor by simply logging them using
+        ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`.
 
         Note:
             The ``frequency`` value specified in a dict along with the ``optimizer`` key is an int corresponding
@@ -1147,9 +1194,11 @@ class LightningModule(
             It should be given to none or to all of the optimizers.
             There is a difference between passing multiple optimizers in a list,
             and passing multiple optimizers in dictionaries with a frequency of 1:
-            In the former case, all optimizers will operate on the given batch in each optimization step.
-            In the latter, only one optimizer will operate on the given batch at every step.
-            This is different from the ``frequency`` value specified in the lr_dict mentioned below.
+
+            - In the former case, all optimizers will operate on the given batch in each optimization step.
+            - In the latter, only one optimizer will operate on the given batch at every step.
+
+            This is different from the ``frequency`` value specified in the ``lr_dict`` mentioned above.
 
         .. code-block:: python
 
@@ -1168,7 +1217,7 @@ class LightningModule(
 
         Examples::
 
-            # most cases
+            # most cases. no learning rate scheduler
             def configure_optimizers(self):
                 return Adam(self.parameters(), lr=1e-3)
 
@@ -1186,11 +1235,14 @@ class LightningModule(
                 return [gen_opt, dis_opt], [dis_sch]
 
             # example with step-based learning rate schedulers
+            # each optimizer has its own scheduler
             def configure_optimizers(self):
                 gen_opt = Adam(self.model_gen.parameters(), lr=0.01)
                 dis_opt = Adam(self.model_dis.parameters(), lr=0.02)
-                gen_sch = {'scheduler': ExponentialLR(gen_opt, 0.99),
-                           'interval': 'step'}  # called after each training step
+                gen_sch = {
+                    'scheduler': ExponentialLR(gen_opt, 0.99),
+                    'interval': 'step'  # called after each training step
+                }
                 dis_sch = CosineAnnealing(dis_opt, T_max=10)  # called every epoch
                 return [gen_opt, dis_opt], [gen_sch, dis_sch]
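
The docstring examples in the patch elide the optimizer and scheduler arguments with ``Adam(...)``. As a minimal, self-contained sketch only (not part of the patch above), the snippet below shows how the documented ``lr_dict`` keys fit together end to end. The module name ``BoringClassifier``, the ``nn.Linear(32, 2)`` layer and the ``val_loss`` metric name are illustrative assumptions; only the ``configure_optimizers`` return structure and the ``lr_dict`` keys come from the documentation changed in this patch.

.. code-block:: python

    from torch import nn
    from torch.optim import Adam
    from torch.optim.lr_scheduler import ReduceLROnPlateau
    import pytorch_lightning as pl


    class BoringClassifier(pl.LightningModule):  # hypothetical module, for illustration only
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return nn.functional.cross_entropy(self.layer(x), y)

        def validation_step(self, batch, batch_idx):
            x, y = batch
            loss = nn.functional.cross_entropy(self.layer(x), y)
            # Log the metric referenced by the scheduler's `monitor` key below
            self.log('val_loss', loss)

        def configure_optimizers(self):
            optimizer = Adam(self.parameters(), lr=1e-3)
            return {
                'optimizer': optimizer,
                'lr_scheduler': {
                    # REQUIRED: the scheduler instance
                    'scheduler': ReduceLROnPlateau(optimizer, mode='min'),
                    # ReduceLROnPlateau needs a logged metric to condition on
                    'monitor': 'val_loss',
                    'interval': 'epoch',  # step the scheduler once per epoch
                    'frequency': 1,
                    'strict': True,  # stop if 'val_loss' was never logged
                },
            }

With ``'strict': True`` Lightning stops training if ``val_loss`` is not available when the scheduler is due to step; with ``False`` it only warns, as described in the ``lr_dict`` comments added by this patch.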