doctest for .rst files (#1511)
* add doctest to circleci
* Revert "add doctest to circleci" — This reverts commit c45b34ea911a81f87989f6c3a832b1e8d8c471c6.
* Revert "Revert "add doctest to circleci"" — This reverts commit 41fca97fdcfe1cf4f6bdb3bbba75d25fa3b11f70.
* doctest docs rst files
* Revert "doctest docs rst files" — This reverts commit b4a2e83e3da5ed1909de500ec14b6b614527c07f.
* doctest only rst
* doctest debugging.rst
* doctest apex
* doctest callbacks
* doctest early stopping
* doctest for child modules
* doctest experiment reporting
* indentation
* doctest fast training
* doctest for hyperparams
* doctests for lr_finder
* doctests multi-gpu
* more doctest
* make doctest drone
* fix label build error
* update fast training
* update invalid imports
* fix problem with int device count
* rebase stuff
* wip
* wip
* wip
* intro guide
* add missing code block
* circleci
* logger import for doctest
* test if doctest runs on drone
* fix mnist download
* also run install deps for building docs
* install cmake
* try sudo
* hide output
* try pip stuff
* try to mock horovod
* Tranfer -> Transfer
* add torchvision to extras
* revert pip stuff
* mlflow file location
* do not mock torch
* torchvision
* drone extra req.
* try higher sphinx version
* Revert "try higher sphinx version" — This reverts commit 490ac28e46d6fd52352640dfdf0d765befa56988.
* try coverage command
* try coverage command
* try undoc flag
* newline
* undo drone
* report coverage
* review — Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* remove torchvision from extras
* skip tests only if torchvision not available
* fix testoutput torchvision

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
parent 48e808c20e
commit a6de1b8d75
@@ -64,10 +64,13 @@ references:
name: Make Documentation
command: |
  # sudo apt-get install pandoc
  sudo apt-get update && sudo apt-get install -y cmake
  pip install -r requirements.txt --user
  sudo pip install -r docs/requirements.txt
  pip install -r requirements-extra.txt --user # for doctesting loggers etc.
  # sphinx-apidoc -o ./docs/source ./pytorch_lightning **/test_* --force --follow-links
  cd docs; make clean ; make html --debug --jobs 2 SPHINXOPTS="-W"
  cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W"
  make doctest; make coverage

jobs:

@@ -35,9 +35,11 @@ steps:
- apt-get update && apt-get install -y cmake
- pip install -r requirements.txt --user -q
- pip install -r ./tests/requirements-devel.txt --user -q
#- pip install -r ./docs/requirements.txt --user -q
- pip list
- python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')"
- coverage run --source pytorch_lightning -m py.test pytorch_lightning tests benchmarks -v --doctest-modules # --flake8
#- cd docs; make doctest; make coverage
- coverage report
- codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
- python tests/collect_env_details.py

@@ -1,3 +1,8 @@
.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer

16-bit training
=================
Lightning offers 16-bit training for CPUs, GPUs and TPUs.

@@ -38,7 +43,7 @@ Install apex
Enable 16-bit
^^^^^^^^^^^^^

.. code-block:: python
.. testcode::

    # turn on 16-bit
    trainer = Trainer(amp_level='O1', precision=16)

@@ -50,7 +55,7 @@ TPU 16-bit
----------
16-bit on TPUs is much simpler. To use 16-bit with TPUs, set precision to 16 when using the TPU flag.

.. code-block:: python
.. testcode::

    # DEFAULT
    trainer = Trainer(num_tpu_cores=8, precision=32)

@@ -1,3 +1,8 @@
.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.callbacks.base import Callback

.. role:: hidden
    :class: hidden-section

@@ -18,21 +23,23 @@ An overall Lightning system should have:

Example:

.. doctest::
.. testcode::

    class MyPrintingCallback(Callback):

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('trainer is init now')

        def on_train_end(self, trainer, pl_module):
            print('do something when training ends')

    trainer = Trainer(callbacks=[MyPrintingCallback()])

.. testoutput::

    >>> import pytorch_lightning as pl
    >>> class MyPrintingCallback(pl.Callback):
    ...
    ...     def on_init_start(self, trainer):
    ...         print('Starting to init trainer!')
    ...
    ...     def on_init_end(self, trainer):
    ...         print('trainer is init now')
    ...
    ...     def on_train_end(self, trainer, pl_module):
    ...         print('do something when training ends')
    ...
    >>> trainer = pl.Trainer(callbacks=[MyPrintingCallback()])
    Starting to init trainer!
    trainer is init now

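The hunk above converts the callback example from the removed ``.. doctest::`` block into a ``.. testcode::``/``.. testoutput::`` pair. As a standalone illustration — a sketch assuming a pytorch-lightning version contemporary with this commit, where the ``on_init_*`` hooks fire while ``Trainer.__init__`` runs — the example boils down to:

.. code-block:: python

    import pytorch_lightning as pl

    class MyPrintingCallback(pl.Callback):

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('trainer is init now')

    # constructing the Trainer triggers both hooks, which is exactly the
    # two lines of output the new ".. testoutput::" block asserts
    trainer = pl.Trainer(callbacks=[MyPrintingCallback()])
    # Starting to init trainer!
    # trainer is init now
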
@@ -1,3 +1,22 @@
.. testsetup:: *

    import torch
    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.callbacks.base import Callback
    from pytorch_lightning.core.lightning import LightningModule

    class LitMNIST(LightningModule):

        def __init__(self):
            super().__init__()

        def train_dataloader():
            pass

        def val_dataloader():
            pass


Child Modules
-------------
Research projects tend to test different approaches to the same dataset.

@@ -7,13 +26,18 @@ For example, imagine we now want to train an Autoencoder to use as a feature ext
Recall that `LitMNIST` already defines all the dataloading etc... The only things
that change in the `Autoencoder` model are the init, forward, training, validation and test step.

.. code-block:: python
.. testcode::

    class Encoder(torch.nn.Module):
        ...
        pass

    class Decoder(torch.nn.Module):
        pass

    class AutoEncoder(LitMNIST):

        def __init__(self):
            super().__init__()
            self.encoder = Encoder()
            self.decoder = Decoder()

@@ -30,10 +54,10 @@ that change in the `Autoencoder` model are the init, forward, training, validati
            return loss

        def validation_step(self, batch, batch_idx):
            return self._shared_eval(batch, batch_idx, 'val'):
            return self._shared_eval(batch, batch_idx, 'val')

        def test_step(self, batch, batch_idx):
            return self._shared_eval(batch, batch_idx, 'test'):
            return self._shared_eval(batch, batch_idx, 'test')

        def _shared_eval(self, batch, batch_idx, prefix):
            x, y = batch

@@ -43,6 +67,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati
            loss = F.nll_loss(logits, y)
            return {f'{prefix}_loss': loss}

and we can train this using the same trainer

.. code-block:: python

@@ -58,5 +83,3 @@ In this case, we want to use the `AutoEncoder` to extract image representations

    some_images = torch.Tensor(32, 1, 28, 28)
    representations = autoencoder(some_images)

..

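The `_shared_eval` change above also removes a syntax error (a stray colon after the `return` calls). A minimal, framework-free sketch of the pattern this hunk documents — validation and test steps delegating to one helper, with only the metric-key prefix changing; ``compute_loss`` is a hypothetical helper assumed to exist on the concrete module:

.. code-block:: python

    class EvalSharingMixin:

        def validation_step(self, batch, batch_idx):
            return self._shared_eval(batch, batch_idx, 'val')

        def test_step(self, batch, batch_idx):
            return self._shared_eval(batch, batch_idx, 'test')

        def _shared_eval(self, batch, batch_idx, prefix):
            # one code path for both loops; only the dict key prefix differs
            x, y = batch
            loss = self.compute_loss(x, y)  # hypothetical helper
            return {f'{prefix}_loss': loss}
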
@@ -309,7 +309,7 @@ for path_ipynb in glob.glob(os.path.join(PATH_ROOT, 'notebooks', '*.ipynb')):
# https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule

MOCK_REQUIRE_PACKAGES = []
with open(os.path.join(PATH_ROOT, 'requirements.txt'), 'r') as fp:
with open(os.path.join(PATH_ROOT, 'requirements-extra.txt'), 'r') as fp:
    for ln in fp.readlines():
        found = [ln.index(ch) for ch in list(',=<>#') if ch in ln]
        pkg = ln[:min(found)] if found else ln

@@ -318,19 +318,10 @@ with open(os.path.join(PATH_ROOT, 'requirements.txt'), 'r') as fp:

# TODO: better parse from package since the import name and package name may differ
MOCK_MANUAL_PACKAGES = [
    'torch',
    'torchvision',
    'PIL',
    'test_tube',
    'mlflow',
    'comet_ml',
    'wandb',
    'neptune',
    'trains',
]
autodoc_mock_imports = MOCK_REQUIRE_PACKAGES + MOCK_MANUAL_PACKAGES
# for mod_name in MOCK_REQUIRE_PACKAGES:
#     sys.modules[mod_name] = mock.Mock()


# Options for the linkcode extension

@@ -405,3 +396,16 @@ html_add_permalinks = "¶"
# Useful for avoiding ambiguity when the same section heading appears in different documents.
# http://www.sphinx-doc.org/en/master/usage/extensions/autosectionlabel.html
autosectionlabel_prefix_document = True

# only run doctests marked with a ".. doctest::" directive
doctest_test_doctest_blocks = ''
doctest_global_setup = """

import importlib
import os
import torch

TORCHVISION_AVAILABLE = importlib.util.find_spec('torchvision')

"""
coverage_skip_undoc_in_source = True

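For context on how the new ``:skipif:`` options used elsewhere in this commit are evaluated: Sphinx runs ``doctest_global_setup`` before each doctest group, so names defined there (such as ``TORCHVISION_AVAILABLE``) are in scope for every ``:skipif:`` expression. A minimal sketch of the availability check, using only the standard library:

.. code-block:: python

    import importlib.util

    # find_spec returns a ModuleSpec when the package is importable and None
    # otherwise, so the flag is truthy/None rather than a strict bool
    TORCHVISION_AVAILABLE = importlib.util.find_spec('torchvision')

    if not TORCHVISION_AVAILABLE:
        # roughly what ":skipif: not TORCHVISION_AVAILABLE" decides for a block
        print('torchvision missing - dependent doctests would be skipped')
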
@@ -1,3 +1,7 @@
.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer

Debugging
=========
The following are flags that make debugging much easier.

@@ -11,9 +15,9 @@ a full epoch to crash.
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.fast_dev_run`
argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)

.. code-block:: python
.. testcode::

    trainer = pl.Trainer(fast_dev_run=True)
    trainer = Trainer(fast_dev_run=True)

Inspect gradient norms
----------------------

@@ -22,10 +26,10 @@ Logs (to a logger), the norm of each weight matrix.
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.track_grad_norm`
argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)

.. code-block:: python
.. testcode::

    # the 2-norm
    trainer = pl.Trainer(track_grad_norm=2)
    trainer = Trainer(track_grad_norm=2)

Log GPU usage
-------------

@@ -34,9 +38,9 @@ Logs (to a logger) the GPU usage for each GPU on the master machine.
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.log_gpu_memory`
argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)

.. code-block:: python
.. testcode::

    trainer = pl.Trainer(log_gpu_memory=True)
    trainer = Trainer(log_gpu_memory=True)

Make model overfit on subset of data
------------------------------------

@@ -47,9 +51,9 @@ and try to get your model to overfit. If it can't, it's a sign it won't work wit
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.overfit_pct`
argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)

.. code-block:: python
.. testcode::

    trainer = pl.Trainer(overfit_pct=0.01)
    trainer = Trainer(overfit_pct=0.01)

Print the parameter count by layer
----------------------------------

@@ -59,9 +63,9 @@ To disable this behavior, turn off this flag:
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_summary`
argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)

.. code-block:: python
.. testcode::

    trainer = pl.Trainer(weights_summary=None)
    trainer = Trainer(weights_summary=None)


Set the number of validation sanity steps

@@ -72,7 +76,7 @@ This avoids crashing in the validation loop sometime deep into a lengthy trainin
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.num_sanity_val_steps`
argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)

.. code-block:: python
.. testcode::

    # DEFAULT
    trainer = Trainer(num_sanity_val_steps=5)

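Taken together, the flags touched in this file can be combined on a single Trainer. A hedged sketch only — argument names as they existed at the time of this commit, several of which were renamed in later releases:

.. code-block:: python

    from pytorch_lightning import Trainer

    trainer = Trainer(
        fast_dev_run=False,       # True: run a single train/val batch to smoke-test the loops
        track_grad_norm=2,        # log the 2-norm of the gradients
        log_gpu_memory=True,      # log per-GPU memory usage on the master machine
        overfit_pct=0.01,         # train on 1% of the data to check the model can overfit
        weights_summary=None,     # disable the parameter-count summary printed at start
        num_sanity_val_steps=5,   # validation batches to run before training starts
    )
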
@@ -1,3 +1,9 @@
.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.callbacks.early_stopping import EarlyStopping


Early stopping
==============

@@ -17,23 +23,25 @@ Enable Early Stopping using Callbacks on epoch end
--------------------------------------------------
There are two ways to enable early stopping using callbacks on epoch end.

.. doctest::
- Set early_stop_callback to True. Will look for 'val_loss' in validation_epoch_end() return dict.
  If it is not found an error is raised.

    >>> from pytorch_lightning import Trainer
    >>> from pytorch_lightning.callbacks import EarlyStopping
.. testcode::

    # A) Set early_stop_callback to True. Will look for 'val_loss'
    # in validation_epoch_end() return dict. If it is not found an error is raised.
    >>> trainer = Trainer(early_stop_callback=True)
    # B) Or configure your own callback
    >>> early_stop_callback = EarlyStopping(
    ...    monitor='val_loss',
    ...    min_delta=0.00,
    ...    patience=3,
    ...    verbose=False,
    ...    mode='min'
    ... )
    >>> trainer = Trainer(early_stop_callback=early_stop_callback)
    trainer = Trainer(early_stop_callback=True)

- Or configure your own callback

.. testcode::

    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.00,
        patience=3,
        verbose=False,
        mode='min'
    )
    trainer = Trainer(early_stop_callback=early_stop_callback)

In any case, the callback will fall back to the training metrics (returned in
:meth:`~pytorch_lightning.core.lightning.LightningModule.training_step`,

@@ -43,7 +51,8 @@ looking for a key to monitor if validation is disabled or
is not defined.

.. seealso::
    :class:`~pytorch_lightning.trainer.trainer.Trainer`
    - :class:`~pytorch_lightning.trainer.trainer.Trainer`
    - :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping`

Disable Early Stopping with callbacks on epoch end
--------------------------------------------------

@@ -53,4 +62,5 @@ Note that ``None`` will not disable early stopping but will lead to the
default behaviour.

.. seealso::
    :class:`~pytorch_lightning.trainer.trainer.Trainer`
    - :class:`~pytorch_lightning.trainer.trainer.Trainer`
    - :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping`

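As a plain-Python illustration of the rule the `EarlyStopping` callback configured above implements (for ``mode='min'``): training stops once the monitored value has failed to improve by at least `min_delta` for more than `patience` consecutive checks. The exact boundary handling in the real callback may differ slightly; this is only a sketch:

.. code-block:: python

    def should_stop(history, patience=3, min_delta=0.0):
        """history: monitored values (e.g. val_loss) in the order they were produced."""
        best = float('inf')
        bad_checks = 0
        for value in history:
            if value < best - min_delta:
                best = value          # improvement: reset the counter
                bad_checks = 0
            else:
                bad_checks += 1       # no improvement at this check
                if bad_checks > patience:
                    return True
        return False

    print(should_stop([0.9, 0.8, 0.79, 0.79, 0.79, 0.79, 0.79]))  # True
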
@ -1,3 +1,9 @@
|
|||
.. testsetup:: *
|
||||
|
||||
from pytorch_lightning.trainer.trainer import Trainer
|
||||
from pytorch_lightning.core.lightning import LightningModule
|
||||
|
||||
|
||||
Experiment Logging
|
||||
==================
|
||||
|
||||
|
@ -14,31 +20,29 @@ First, install the package:
|
|||
|
||||
Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`:
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> import os
|
||||
>>> from pytorch_lightning import Trainer
|
||||
>>> from pytorch_lightning.loggers import CometLogger
|
||||
>>> comet_logger = CometLogger(
|
||||
... api_key=os.environ.get('COMET_API_KEY'),
|
||||
... workspace=os.environ.get('COMET_WORKSPACE'), # Optional
|
||||
... save_dir='.', # Optional
|
||||
... project_name='default_project', # Optional
|
||||
... rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional
|
||||
... experiment_name='default' # Optional
|
||||
... )
|
||||
>>> trainer = Trainer(logger=comet_logger)
|
||||
import os
|
||||
from pytorch_lightning.loggers import CometLogger
|
||||
comet_logger = CometLogger(
|
||||
api_key=os.environ.get('COMET_API_KEY'),
|
||||
workspace=os.environ.get('COMET_WORKSPACE'), # Optional
|
||||
save_dir='.', # Optional
|
||||
project_name='default_project', # Optional
|
||||
rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional
|
||||
experiment_name='default' # Optional
|
||||
)
|
||||
trainer = Trainer(logger=comet_logger)
|
||||
|
||||
The :class:`~pytorch_lightning.loggers.CometLogger` is available anywhere except ``__init__`` in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def any_lightning_module_function_or_hook(self):
|
||||
... some_img = fake_image()
|
||||
... self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
class MyModule(LightningModule):
|
||||
def any_lightning_module_function_or_hook(self):
|
||||
some_img = fake_image()
|
||||
self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.CometLogger` docs.
|
||||
|
@ -56,15 +60,14 @@ First, install the package:
|
|||
|
||||
Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`:
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import Trainer
|
||||
>>> from pytorch_lightning.loggers import MLFlowLogger
|
||||
>>> mlf_logger = MLFlowLogger(
|
||||
... experiment_name="default",
|
||||
... tracking_uri="file:/."
|
||||
... )
|
||||
>>> trainer = Trainer(logger=mlf_logger)
|
||||
from pytorch_lightning.loggers import MLFlowLogger
|
||||
mlf_logger = MLFlowLogger(
|
||||
experiment_name="default",
|
||||
tracking_uri="file:./ml-runs"
|
||||
)
|
||||
trainer = Trainer(logger=mlf_logger)
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.MLFlowLogger` docs.
|
||||
|
@ -82,29 +85,27 @@ First, install the package:
|
|||
|
||||
Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`:
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import Trainer
|
||||
>>> from pytorch_lightning.loggers import NeptuneLogger
|
||||
>>> neptune_logger = NeptuneLogger(
|
||||
... api_key='ANONYMOUS', # replace with your own
|
||||
... project_name='shared/pytorch-lightning-integration',
|
||||
... experiment_name='default', # Optional,
|
||||
... params={'max_epochs': 10}, # Optional,
|
||||
... tags=['pytorch-lightning', 'mlp'], # Optional,
|
||||
... )
|
||||
>>> trainer = Trainer(logger=neptune_logger)
|
||||
from pytorch_lightning.loggers import NeptuneLogger
|
||||
neptune_logger = NeptuneLogger(
|
||||
api_key='ANONYMOUS', # replace with your own
|
||||
project_name='shared/pytorch-lightning-integration',
|
||||
experiment_name='default', # Optional,
|
||||
params={'max_epochs': 10}, # Optional,
|
||||
tags=['pytorch-lightning', 'mlp'], # Optional,
|
||||
)
|
||||
trainer = Trainer(logger=neptune_logger)
|
||||
|
||||
The :class:`~pytorch_lightning.loggers.NeptuneLogger` is available anywhere except ``__init__`` in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def any_lightning_module_function_or_hook(self):
|
||||
... some_img = fake_image()
|
||||
... self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
class MyModule(LightningModule):
|
||||
def any_lightning_module_function_or_hook(self):
|
||||
some_img = fake_image()
|
||||
self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.NeptuneLogger` docs.
|
||||
|
@ -122,28 +123,31 @@ First, install the package:
|
|||
|
||||
Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`:
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
from pytorch_lightning.loggers import TrainsLogger
|
||||
trains_logger = TrainsLogger(
|
||||
project_name='examples',
|
||||
task_name='pytorch lightning test',
|
||||
)
|
||||
trainer = Trainer(logger=trains_logger)
|
||||
|
||||
.. testoutput::
|
||||
:options: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||
:hide:
|
||||
|
||||
>>> from pytorch_lightning import Trainer
|
||||
>>> from pytorch_lightning.loggers import TrainsLogger
|
||||
>>> trains_logger = TrainsLogger(
|
||||
... project_name='examples',
|
||||
... task_name='pytorch lightning test',
|
||||
... ) # doctest: +ELLIPSIS
|
||||
TRAINS Task: ...
|
||||
TRAINS results page: ...
|
||||
>>> trainer = Trainer(logger=trains_logger)
|
||||
|
||||
The :class:`~pytorch_lightning.loggers.TrainsLogger` is available anywhere in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def __init__(self):
|
||||
... some_img = fake_image()
|
||||
... self.logger.experiment.log_image('debug', 'generated_image_0', some_img, 0)
|
||||
class MyModule(LightningModule):
|
||||
def __init__(self):
|
||||
some_img = fake_image()
|
||||
self.logger.experiment.log_image('debug', 'generated_image_0', some_img, 0)
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.TrainsLogger` docs.
|
||||
|
@ -153,23 +157,21 @@ Tensorboard
|
|||
|
||||
To use `TensorBoard <https://pytorch.org/docs/stable/tensorboard.html>`_ as your logger do the following.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import Trainer
|
||||
>>> from pytorch_lightning.loggers import TensorBoardLogger
|
||||
>>> logger = TensorBoardLogger('tb_logs', name='my_model')
|
||||
>>> trainer = Trainer(logger=logger)
|
||||
from pytorch_lightning.loggers import TensorBoardLogger
|
||||
logger = TensorBoardLogger('tb_logs', name='my_model')
|
||||
trainer = Trainer(logger=logger)
|
||||
|
||||
The :class:`~pytorch_lightning.loggers.TensorBoardLogger` is available anywhere except ``__init__`` in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def any_lightning_module_function_or_hook(self):
|
||||
... some_img = fake_image()
|
||||
... self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
class MyModule(LightningModule):
|
||||
def any_lightning_module_function_or_hook(self):
|
||||
some_img = fake_image()
|
||||
self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.TensorBoardLogger` docs.
|
||||
|
@ -188,22 +190,21 @@ First, install the package:
|
|||
|
||||
Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`:
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning.loggers import TestTubeLogger
|
||||
>>> logger = TestTubeLogger('tb_logs', name='my_model')
|
||||
>>> trainer = Trainer(logger=logger)
|
||||
from pytorch_lightning.loggers import TestTubeLogger
|
||||
logger = TestTubeLogger('tb_logs', name='my_model')
|
||||
trainer = Trainer(logger=logger)
|
||||
|
||||
The :class:`~pytorch_lightning.loggers.TestTubeLogger` is available anywhere except ``__init__`` in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def any_lightning_module_function_or_hook(self):
|
||||
... some_img = fake_image()
|
||||
... self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
class MyModule(LightningModule):
|
||||
def any_lightning_module_function_or_hook(self):
|
||||
some_img = fake_image()
|
||||
self.logger.experiment.add_image('generated_images', some_img, 0)
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.TestTubeLogger` docs.
|
||||
|
@ -221,24 +222,23 @@ First, install the package:
|
|||
|
||||
Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`:
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning.loggers import WandbLogger
|
||||
>>> wandb_logger = WandbLogger()
|
||||
>>> trainer = Trainer(logger=wandb_logger)
|
||||
from pytorch_lightning.loggers import WandbLogger
|
||||
wandb_logger = WandbLogger()
|
||||
trainer = Trainer(logger=wandb_logger)
|
||||
|
||||
The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def any_lightning_module_function_or_hook(self):
|
||||
... some_img = fake_image()
|
||||
... self.logger.experiment.log({
|
||||
... "generated_images": [wandb.Image(some_img, caption="...")]
|
||||
... })
|
||||
class MyModule(LightningModule):
|
||||
def any_lightning_module_function_or_hook(self):
|
||||
some_img = fake_image()
|
||||
self.logger.experiment.log({
|
||||
"generated_images": [wandb.Image(some_img, caption="...")]
|
||||
})
|
||||
|
||||
.. seealso::
|
||||
:class:`~pytorch_lightning.loggers.WandbLogger` docs.
|
||||
|
@ -249,23 +249,22 @@ Multiple Loggers
|
|||
Lightning supports the use of multiple loggers, just pass a list to the
|
||||
:class:`~pytorch_lightning.trainer.trainer.Trainer`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger
|
||||
>>> logger1 = TensorBoardLogger('tb_logs', name='my_model')
|
||||
>>> logger2 = TestTubeLogger('tb_logs', name='my_model')
|
||||
>>> trainer = Trainer(logger=[logger1, logger2])
|
||||
from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger
|
||||
logger1 = TensorBoardLogger('tb_logs', name='my_model')
|
||||
logger2 = TestTubeLogger('tb_logs', name='my_model')
|
||||
trainer = Trainer(logger=[logger1, logger2])
|
||||
|
||||
The loggers are available as a list anywhere except ``__init__`` in your
|
||||
:class:`~pytorch_lightning.core.lightning.LightningModule`.
|
||||
|
||||
.. doctest::
|
||||
.. testcode::
|
||||
|
||||
>>> from pytorch_lightning import LightningModule
|
||||
>>> class MyModule(LightningModule):
|
||||
... def any_lightning_module_function_or_hook(self):
|
||||
... some_img = fake_image()
|
||||
... # Option 1
|
||||
... self.logger.experiment[0].add_image('generated_images', some_img, 0)
|
||||
... # Option 2
|
||||
... self.logger[0].experiment.add_image('generated_images', some_img, 0)
|
||||
class MyModule(LightningModule):
|
||||
def any_lightning_module_function_or_hook(self):
|
||||
some_img = fake_image()
|
||||
# Option 1
|
||||
self.logger.experiment[0].add_image('generated_images', some_img, 0)
|
||||
# Option 2
|
||||
self.logger[0].experiment.add_image('generated_images', some_img, 0)
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
.. testsetup:: *
|
||||
|
||||
from pytorch_lightning.trainer.trainer import Trainer
|
||||
|
||||
|
||||
Experiment Reporting
|
||||
=====================
|
||||
|
||||
|
@ -11,10 +16,10 @@ Control logging frequency
|
|||
|
||||
It may slow training down to log every single batch. Trainer has an option to log every k batches instead.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
# k = 10
|
||||
Trainer(row_log_interval=10)
|
||||
k = 10
|
||||
trainer = Trainer(row_log_interval=k)
|
||||
|
||||
Control log writing frequency
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
@ -25,10 +30,10 @@ want to log using this trainer flag.
|
|||
.. seealso::
|
||||
:class:`~pytorch_lightning.trainer.trainer.Trainer`
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
k = 100
|
||||
Trainer(log_save_interval=k)
|
||||
k = 100
|
||||
trainer = Trainer(log_save_interval=k)
|
||||
|
||||
Log metrics
|
||||
^^^^^^^^^^^
|
||||
|
@ -37,46 +42,47 @@ To plot metrics into whatever logger you passed in (tensorboard, comet, neptune,
|
|||
|
||||
1. training_epoch_end, validation_epoch_end, test_epoch_end will all log anything in the "log" key of the return dict.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
def training_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
def training_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
|
||||
logs = {'train_loss': loss}
|
||||
results = {'log': logs}
|
||||
return results
|
||||
logs = {'train_loss': loss}
|
||||
results = {'log': logs}
|
||||
return results
|
||||
|
||||
def validation_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
def validation_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
|
||||
logs = {'val_loss': loss}
|
||||
results = {'log': logs}
|
||||
return results
|
||||
logs = {'val_loss': loss}
|
||||
results = {'log': logs}
|
||||
return results
|
||||
|
||||
def test_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
def test_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
|
||||
logs = {'test_loss': loss}
|
||||
results = {'log': logs}
|
||||
return results
|
||||
logs = {'test_loss': loss}
|
||||
results = {'log': logs}
|
||||
return results
|
||||
|
||||
2. In addition, you can also use any arbitrary functionality from a particular logger from within your LightningModule.
|
||||
For instance, here we log images using tensorboard.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
:skipif: not TORCHVISION_AVAILABLE
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
self.generated_imgs = self.decoder.generate()
|
||||
def training_step(self, batch, batch_idx):
|
||||
self.generated_imgs = self.decoder.generate()
|
||||
|
||||
sample_imgs = self.generated_imgs[:6]
|
||||
grid = torchvision.utils.make_grid(sample_imgs)
|
||||
self.logger.experiment.add_image('generated_images', grid, 0)
|
||||
sample_imgs = self.generated_imgs[:6]
|
||||
grid = torchvision.utils.make_grid(sample_imgs)
|
||||
self.logger.experiment.add_image('generated_images', grid, 0)
|
||||
|
||||
...
|
||||
return results
|
||||
...
|
||||
return results
|
||||
|
||||
Modify progress bar
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
@ -86,15 +92,15 @@ a key called "progress_bar".
|
|||
|
||||
Here we show the validation loss in the progress bar
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
def validation_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
def validation_epoch_end(self, outputs):
|
||||
loss = some_loss()
|
||||
...
|
||||
|
||||
logs = {'val_loss': loss}
|
||||
results = {'progress_bar': logs}
|
||||
return results
|
||||
logs = {'val_loss': loss}
|
||||
results = {'progress_bar': logs}
|
||||
return results
|
||||
|
||||
Snapshot hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
@ -103,8 +109,8 @@ When Lightning creates a checkpoint, it stores a key "hparams" with the hyperpar
|
|||
|
||||
.. code-block:: python
|
||||
|
||||
lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
|
||||
hyperparams = lightning_checkpoint['hparams']
|
||||
lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
|
||||
hyperparams = lightning_checkpoint['hparams']
|
||||
|
||||
Some loggers also allow logging the hyperparams used in the experiment. For instance,
|
||||
when using the TestTubeLogger or the TensorBoardLogger, all hyperparams will show
|
||||
|
@ -115,8 +121,7 @@ Snapshot code
|
|||
Loggers also allow you to snapshot a copy of the code used in this experiment.
|
||||
For example, TestTubeLogger does this with a flag:
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
from pytorch_lightning.loggers import TestTubeLogger
|
||||
|
||||
logger = TestTubeLogger(create_git_tag=True)
|
||||
from pytorch_lightning.loggers import TestTubeLogger
|
||||
logger = TestTubeLogger('.', create_git_tag=True)
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
.. testsetup:: *
|
||||
|
||||
from pytorch_lightning.trainer.trainer import Trainer
|
||||
|
||||
|
||||
Fast Training
|
||||
=============
|
||||
There are multiple options to speed up different parts of the training by choosing to train
|
||||
|
@ -7,7 +12,7 @@ Check validation every n epochs
|
|||
-------------------------------
|
||||
If you have a small dataset you might want to check validation every n epochs
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
# DEFAULT
|
||||
trainer = Trainer(check_val_every_n_epoch=1)
|
||||
|
@ -19,7 +24,7 @@ It can be useful to force training for a minimum number of epochs or limit to a
|
|||
.. seealso::
|
||||
:class:`~pytorch_lightning.trainer.trainer.Trainer`
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
# DEFAULT
|
||||
trainer = Trainer(min_epochs=1, max_epochs=1000)
|
||||
|
@ -31,7 +36,7 @@ For large datasets it's often desirable to check validation multiple times withi
|
|||
Pass in a float to check that often within 1 training epoch. Pass in an int k to check every k training batches.
|
||||
Must use an int if using an IterableDataset.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
# DEFAULT
|
||||
trainer = Trainer(val_check_interval=0.95)
|
||||
|
@ -46,21 +51,21 @@ Use data subset for training, validation and test
|
|||
-------------------------------------------------
|
||||
If you don't want to check 100% of the training/validation/test set (for debugging or if it's huge), set these flags.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
# DEFAULT
|
||||
trainer = Trainer(
|
||||
train_percent_check=1.0,
|
||||
val_percent_check=1.0,
|
||||
test_percent_check=1.0
|
||||
)
|
||||
# DEFAULT
|
||||
trainer = Trainer(
|
||||
train_percent_check=1.0,
|
||||
val_percent_check=1.0,
|
||||
test_percent_check=1.0
|
||||
)
|
||||
|
||||
# check 10%, 20%, 30% only, respectively for training, validation and test set
|
||||
trainer = Trainer(
|
||||
train_percent_check=0.1,
|
||||
val_percent_check=0.2,
|
||||
test_percent_check=0.3
|
||||
)
|
||||
# check 10%, 20%, 30% only, respectively for training, validation and test set
|
||||
trainer = Trainer(
|
||||
train_percent_check=0.1,
|
||||
val_percent_check=0.2,
|
||||
test_percent_check=0.3
|
||||
)
|
||||
|
||||
.. note:: ``train_percent_check``, ``val_percent_check`` and ``test_percent_check`` will be overwritten by ``overfit_pct`` if ``overfit_pct`` > 0. ``val_percent_check`` will be ignored if ``fast_dev_run=True``.
|
||||
|
||||
|
|
|
@ -1,3 +1,13 @@
|
|||
.. testsetup:: *
|
||||
|
||||
import torch
|
||||
from argparse import ArgumentParser, Namespace
|
||||
from pytorch_lightning.trainer.trainer import Trainer
|
||||
from pytorch_lightning.core.lightning import LightningModule
|
||||
import sys
|
||||
sys.argv = ['foo']
|
||||
|
||||
|
||||
Hyperparameters
|
||||
---------------
|
||||
Lightning has utilities to interact seamlessly with the command line ArgumentParser
|
||||
|
@ -7,13 +17,11 @@ ArgumentParser
|
|||
^^^^^^^^^^^^^^
|
||||
Lightning is designed to augment a lot of the functionality of the built-in Python ArgumentParser
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument('--layer_1_dim', type=int, default=128)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
This allows you to call your program like so:
|
||||
|
@ -35,9 +43,9 @@ We can do this as follows. First, in your LightningModule, define the arguments
|
|||
specific to that module. Remember that data splits or data paths may also be specific to
|
||||
a module (ie: if your project has a model that trains on Imagenet and another on CIFAR-10).
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitModel(LightningModule):
|
||||
class LitModel(LightningModule):
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
|
@ -48,13 +56,12 @@ a module (ie: if your project has a model that trains on Imagenet and another on
|
|||
|
||||
Now in your main trainer file, add the Trainer args, the program args, and add the model args
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
# ----------------
|
||||
# trainer_main.py
|
||||
# ----------------
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser()
|
||||
|
||||
# add PROGRAM level args
|
||||
|
@ -66,7 +73,7 @@ Now in your main trainer file, add the Trainer args, the program args, and add t
|
|||
|
||||
# add all the available trainer options to argparse
|
||||
# ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
|
||||
parser = pl.Trainer.add_argparse_args(parser)
|
||||
parser = Trainer.add_argparse_args(parser)
|
||||
|
||||
hparams = parser.parse_args()
|
||||
|
||||
|
@ -78,9 +85,7 @@ Now you can call run your program like so
|
|||
|
||||
Finally, make sure to start the training like so:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
hparams = parser.parse_args()
|
||||
.. code-block:: python
|
||||
|
||||
# YES
|
||||
model = LitModel(hparams)
|
||||
|
@ -88,59 +93,56 @@ Finally, make sure to start the training like so:
|
|||
|
||||
# NO
|
||||
# model = LitModel(learning_rate=hparams.learning_rate, ...)
|
||||
#trainer = Trainer(gpus=hparams.gpus, ...)
|
||||
# trainer = Trainer(gpus=hparams.gpus, ...)
|
||||
|
||||
|
||||
LightiningModule hparams
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
LightningModule hparams
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Normally, we don't hard-code the values to a model. We usually use the command line to
|
||||
modify the network and read those values in the LightningModule
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
def __init__(self, hparams):
|
||||
super().__init__()
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
# do this to save all arguments in any logger (tensorboard)
|
||||
self.hparams = hparams
|
||||
def __init__(self, hparams):
|
||||
super().__init__()
|
||||
|
||||
self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
|
||||
self.layer_2 = torch.nn.Linear(hparams.layer_1_dim, hparams.layer_2_dim)
|
||||
self.layer_3 = torch.nn.Linear(hparams.layer_2_dim, 10)
|
||||
# do this to save all arguments in any logger (tensorboard)
|
||||
self.hparams = hparams
|
||||
|
||||
def forward(self, x):
|
||||
...
|
||||
self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
|
||||
self.layer_2 = torch.nn.Linear(hparams.layer_1_dim, hparams.layer_2_dim)
|
||||
self.layer_3 = torch.nn.Linear(hparams.layer_2_dim, 10)
|
||||
|
||||
def train_dataloader(self):
|
||||
...
|
||||
return DataLoader(mnist_train, batch_size=self.hparams.batch_size)
|
||||
def train_dataloader(self):
|
||||
return DataLoader(mnist_train, batch_size=self.hparams.batch_size)
|
||||
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.parameters(), lr=self.hparams.learning_rate)
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.parameters(), lr=self.hparams.learning_rate)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
parser = ArgumentParser(parents=[parent_parser], add_help=False)
|
||||
|
||||
parser.add_argument('--layer_1_dim', type=int, default=128)
|
||||
parser.add_argument('--layer_2_dim', type=int, default=256)
|
||||
parser.add_argument('--batch_size', type=int, default=64)
|
||||
parser.add_argument('--learning_rate', type=float, default=0.002)
|
||||
return parser
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
parser = ArgumentParser(parents=[parent_parser], add_help=False)
|
||||
parser.add_argument('--layer_1_dim', type=int, default=128)
|
||||
parser.add_argument('--layer_2_dim', type=int, default=256)
|
||||
parser.add_argument('--batch_size', type=int, default=64)
|
||||
parser.add_argument('--learning_rate', type=float, default=0.002)
|
||||
return parser
|
||||
|
||||
Now pass in the params when you init your model
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
hparams = parse_args()
|
||||
parser = ArgumentParser()
|
||||
parser = LitMNIST.add_model_specific_args(parser)
|
||||
hparams = parser.parse_args()
|
||||
model = LitMNIST(hparams)
|
||||
|
||||
The line `self.hparams = hparams` is very special. This line assigns your hparams to the LightningModule.
|
||||
This does two things:
|
||||
|
||||
1. It adds them automatically to tensorboard logs under the hparams tab.
|
||||
1. It adds them automatically to TensorBoard logs under the hparams tab.
|
||||
2. Lightning will save those hparams to the checkpoint and use them to restore the module correctly.
|
||||
|
||||
Trainer args
|
||||
|
@ -165,9 +167,10 @@ Multiple Lightning Modules
|
|||
We often have multiple Lightning Modules where each one has different arguments. Instead of
|
||||
polluting the main.py file, the LightningModule lets you define arguments for each one.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
def __init__(self, hparams):
|
||||
super().__init__()
|
||||
self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
|
||||
|
@ -178,7 +181,10 @@ polluting the main.py file, the LightningModule lets you define arguments for ea
|
|||
parser.add_argument('--layer_1_dim', type=int, default=128)
|
||||
return parser
|
||||
|
||||
class GoodGAN(pl.LightningModule):
|
||||
.. testcode::
|
||||
|
||||
class GoodGAN(LightningModule):
|
||||
|
||||
def __init__(self, hparams):
|
||||
super().__init__()
|
||||
self.encoder = Encoder(layers=hparams.encoder_layers)
|
||||
|
@ -189,7 +195,8 @@ polluting the main.py file, the LightningModule lets you define arguments for ea
|
|||
parser.add_argument('--encoder_layers', type=int, default=12)
|
||||
return parser
|
||||
|
||||
Now we can allow each model to inject the arguments it needs in the main.py
|
||||
|
||||
Now we can allow each model to inject the arguments it needs in the ``main.py``
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -226,7 +233,7 @@ Now we can allow each model to inject the arguments it needs in the main.py
|
|||
# train
|
||||
main(args)
|
||||
|
||||
and now we can train MNIST or the gan using the command line interface!
|
||||
and now we can train MNIST or the GAN using the command line interface!
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
|
|
@ -1,3 +1,9 @@
|
|||
.. testsetup:: *
|
||||
|
||||
from pytorch_lightning.core.lightning import LightningModule
|
||||
from pytorch_lightning.trainer.trainer import Trainer
|
||||
|
||||
|
||||
Introduction Guide
|
||||
==================
|
||||
PyTorch Lightning provides a very simple template for organizing your PyTorch code. Once
|
||||
|
@ -126,14 +132,14 @@ The LightningModule provides the structure on how to organize these 5 ingredient
|
|||
Let's first start with the model. In this case we'll design
|
||||
a 3-layer neural network.
|
||||
|
||||
.. code-block:: default
|
||||
.. testcode::
|
||||
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
from torch import nn
|
||||
import pytorch_lightning as pl
|
||||
from pytorch_lightning.core.lightning import LightningModule
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
@ -169,7 +175,7 @@ Notice this is a `LightningModule` instead of a `torch.nn.Module`. A LightningMo
|
|||
equivalent to a PyTorch Module except it has added functionality. However, you can use it
|
||||
EXACTLY the same as you would a PyTorch Module.
|
||||
|
||||
.. code-block:: default
|
||||
.. testcode::
|
||||
|
||||
net = LitMNIST()
|
||||
x = torch.Tensor(1, 1, 28, 28)
|
||||
|
@ -189,14 +195,14 @@ Data
|
|||
The Lightning Module organizes your dataloaders and data processing as well.
|
||||
Here's the PyTorch code for loading MNIST
|
||||
|
||||
.. code-block:: default
|
||||
.. testcode::
|
||||
:skipif: not TORCHVISION_AVAILABLE
|
||||
|
||||
from torch.utils.data import DataLoader, random_split
|
||||
from torchvision.datasets import MNIST
|
||||
import os
|
||||
from torchvision import datasets, transforms
|
||||
|
||||
|
||||
# transforms
|
||||
# prepare transforms standard to MNIST
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
|
@ -206,24 +212,38 @@ Here's the PyTorch code for loading MNIST
|
|||
mnist_train = MNIST(os.getcwd(), train=True, download=True)
|
||||
mnist_train = DataLoader(mnist_train, batch_size=64)
|
||||
|
||||
.. testoutput::
|
||||
:hide:
|
||||
:skipif: os.path.isdir(os.path.join(os.getcwd(), 'MNIST')) or not TORCHVISION_AVAILABLE
|
||||
|
||||
Downloading ...
|
||||
Extracting ...
|
||||
Downloading ...
|
||||
Extracting ...
|
||||
Downloading ...
|
||||
Extracting ...
|
||||
Processing...
|
||||
Done!
|
||||
|
||||
When using PyTorch Lightning, we use the exact same code except we organize it into
|
||||
the LightningModule
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
:skipif: not TORCHVISION_AVAILABLE
|
||||
|
||||
from torch.utils.data import DataLoader, random_split
|
||||
from torchvision.datasets import MNIST
|
||||
import os
|
||||
from torchvision import datasets, transforms
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def train_dataloader(self):
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
transforms.Normalize((0.1307,), (0.3081,))])
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=False,
|
||||
transform=transform)
|
||||
return DataLoader(mnist_train, batch_size=64)
|
||||
def train_dataloader(self):
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
transforms.Normalize((0.1307,), (0.3081,))])
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=False,
|
||||
transform=transform)
|
||||
return DataLoader(mnist_train, batch_size=64)
|
||||
|
||||
Notice the code is exactly the same, except now the training dataloading has been organized by the LightningModule
|
||||
under the `train_dataloader` method. This is great because if you run into a project that uses Lightning and want
|
||||
|
@ -232,21 +252,21 @@ to figure out how they prepare their training data you can just look in the `tra
|
|||
Usually though, we want to separate the things that write to disk in data-processing from
|
||||
things like transforms which happen in memory.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def prepare_data(self):
|
||||
# download only
|
||||
MNIST(os.getcwd(), train=True, download=True)
|
||||
def prepare_data(self):
|
||||
# download only
|
||||
MNIST(os.getcwd(), train=True, download=True)
|
||||
|
||||
def train_dataloader(self):
|
||||
# no download, just transform
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
transforms.Normalize((0.1307,), (0.3081,))])
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=False,
|
||||
transform=transform)
|
||||
return DataLoader(mnist_train, batch_size=64)
|
||||
def train_dataloader(self):
|
||||
# no download, just transform
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
transforms.Normalize((0.1307,), (0.3081,))])
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=False,
|
||||
transform=transform)
|
||||
return DataLoader(mnist_train, batch_size=64)
|
||||
|
||||
Doing it in the `prepare_data` method ensures that when you have
|
||||
multiple GPUs you won't overwrite the data. This is a contrived example
|
||||
|
@ -254,24 +274,24 @@ but it gets more complicated with things like NLP or Imagenet.
|
|||
|
||||
In general fill these methods with the following:
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def prepare_data(self):
|
||||
# stuff here is done once at the very beginning of training
|
||||
# before any distributed training starts
|
||||
|
||||
# download stuff
|
||||
# save to disk
|
||||
# etc...
|
||||
|
||||
def train_dataloader(self):
|
||||
# data transforms
|
||||
# dataset creation
|
||||
# return a DataLoader
|
||||
def prepare_data(self):
|
||||
# stuff here is done once at the very beginning of training
|
||||
# before any distributed training starts
|
||||
|
||||
# download stuff
|
||||
# save to disk
|
||||
# etc...
|
||||
...
|
||||
|
||||
def train_dataloader(self):
|
||||
# data transforms
|
||||
# dataset creation
|
||||
# return a DataLoader
|
||||
...
|
||||
|
||||
Optimizer
|
||||
^^^^^^^^^
|
||||
|
@ -287,20 +307,20 @@ In PyTorch we do it as follows:
|
|||
|
||||
In Lightning we do the same but organize it under the configure_optimizers method.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.parameters(), lr=1e-3)
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.parameters(), lr=1e-3)
|
||||
|
||||
.. note:: The LightningModule itself has the parameters, so pass in self.parameters()
|
||||
|
||||
However, if you have multiple optimizers use the matching parameters
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.generator(), lr=1e-3), Adam(self.discriminator(), lr=1e-3)
|
||||
|
@ -340,16 +360,16 @@ In the case of MNIST we do the following
|
|||
In Lightning, everything that is in the training step gets organized under the `training_step` function
|
||||
in the LightningModule
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
class LitMNIST(LightningModule):
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = F.nll_loss(logits, y)
|
||||
return {'loss': loss}
|
||||
# return loss (also works)
|
||||
def training_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = F.nll_loss(logits, y)
|
||||
return {'loss': loss}
|
||||
# return loss (also works)
|
||||
|
||||
Again, this is the same PyTorch code except that it has been organized by the LightningModule.
|
||||
This code is not restricted which means it can be as complicated as a full seq-2-seq, RL loop, GAN, etc...
|
||||
|
@ -367,43 +387,43 @@ So far we defined 4 key ingredients in pure PyTorch but organized the code insid
|
|||
|
||||
For clarity, we'll recall that the full LightningModule now looks like this.
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer_1 = torch.nn.Linear(28 * 28, 128)
|
||||
self.layer_2 = torch.nn.Linear(128, 256)
|
||||
self.layer_3 = torch.nn.Linear(256, 10)
|
||||
class LitMNIST(LightningModule):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer_1 = torch.nn.Linear(28 * 28, 128)
|
||||
self.layer_2 = torch.nn.Linear(128, 256)
|
||||
self.layer_3 = torch.nn.Linear(256, 10)
|
||||
|
||||
def forward(self, x):
|
||||
batch_size, channels, width, height = x.size()
|
||||
x = x.view(batch_size, -1)
|
||||
x = self.layer_1(x)
|
||||
x = torch.relu(x)
|
||||
x = self.layer_2(x)
|
||||
x = torch.relu(x)
|
||||
x = self.layer_3(x)
|
||||
x = torch.log_softmax(x, dim=1)
|
||||
return x
|
||||
def forward(self, x):
|
||||
batch_size, channels, width, height = x.size()
|
||||
x = x.view(batch_size, -1)
|
||||
x = self.layer_1(x)
|
||||
x = torch.relu(x)
|
||||
x = self.layer_2(x)
|
||||
x = torch.relu(x)
|
||||
x = self.layer_3(x)
|
||||
x = torch.log_softmax(x, dim=1)
|
||||
return x
|
||||
|
||||
def train_dataloader(self):
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
transforms.Normalize((0.1307,), (0.3081,))])
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform)
|
||||
return DataLoader(mnist_train, batch_size=64)
|
||||
def train_dataloader(self):
|
||||
transform=transforms.Compose([transforms.ToTensor(),
|
||||
transforms.Normalize((0.1307,), (0.3081,))])
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform)
|
||||
return DataLoader(mnist_train, batch_size=64)
|
||||
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.parameters(), lr=1e-3)
|
||||
def configure_optimizers(self):
|
||||
return Adam(self.parameters(), lr=1e-3)
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = F.nll_loss(logits, y)
|
||||
def training_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = F.nll_loss(logits, y)
|
||||
|
||||
# add logging
|
||||
logs = {'loss': loss}
|
||||
return {'loss': loss, 'log': logs}
|
||||
# add logging
|
||||
logs = {'loss': loss}
|
||||
return {'loss': loss, 'log': logs}
|
||||
|
||||
Again, this is the same PyTorch code, except that it's organized
|
||||
by the LightningModule. This organization now lets us train this model
|
||||
|
@ -551,33 +571,33 @@ will cause all sorts of issues.
|
|||
To solve this problem, move the download code to the `prepare_data` method in the LightningModule.
|
||||
In this method we do all the preparation we need to do once (instead of on every gpu).
|
||||
|
||||
.. code-block:: python
|
||||
.. testcode::
|
||||
|
||||
class LitMNIST(pl.LightningModule):
|
||||
def prepare_data(self):
|
||||
# transform
|
||||
transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
|
||||
class LitMNIST(LightningModule):
|
||||
def prepare_data(self):
|
||||
# transform
|
||||
transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
|
||||
|
||||
# download
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
|
||||
mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform)
|
||||
# download
|
||||
mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
|
||||
mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform)
|
||||
|
||||
# train/val split
|
||||
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
|
||||
# train/val split
|
||||
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
|
||||
|
||||
# assign to use in dataloaders
|
||||
self.train_dataset = mnist_train
|
||||
self.val_dataset = mnist_val
|
||||
self.test_dataset = mnist_test
|
||||
# assign to use in dataloaders
|
||||
self.train_dataset = mnist_train
|
||||
self.val_dataset = mnist_val
|
||||
self.test_dataset = mnist_test
|
||||
|
||||
def train_dataloader(self):
|
||||
return DataLoader(self.train_dataset, batch_size=64)
|
||||
def train_dataloader(self):
|
||||
return DataLoader(self.train_dataset, batch_size=64)
|
||||
|
||||
def val_dataloader(self):
|
||||
return DataLoader(self.val_dataset, batch_size=64)
|
||||
def val_dataloader(self):
|
||||
return DataLoader(self.val_dataset, batch_size=64)
|
||||
|
||||
def test_dataloader(self):
|
||||
return DataLoader(self.test_dataset, batch_size=64)
|
||||
def test_dataloader(self):
|
||||
return DataLoader(self.test_dataset, batch_size=64)
|
||||
|
||||
The `prepare_data` method is also a good place to do any data processing that needs to be done only
|
||||
once (ie: download or tokenize, etc...).
|
||||
|
@@ -642,28 +662,28 @@ In addition, we define a `val_dataloader` method which tells the trainer what da

Notice we split the train split of MNIST into train, validation. We also have to make sure to do the
same split in the `train_dataloader` method.

.. testcode::

    class LitMNIST(LightningModule):

        def validation_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            loss = F.nll_loss(logits, y)
            return {'val_loss': loss}

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
            tensorboard_logs = {'val_loss': avg_loss}
            return {'val_loss': avg_loss, 'log': tensorboard_logs}

        def val_dataloader(self):
            transform = transforms.Compose([transforms.ToTensor(),
                                            transforms.Normalize((0.1307,), (0.3081,))])
            mnist_train = MNIST(os.getcwd(), train=True, download=False,
                                transform=transform)
            _, mnist_val = random_split(mnist_train, [55000, 5000])
            mnist_val = DataLoader(mnist_val, batch_size=64)
            return mnist_val

Again, we've just organized the regular PyTorch code into two steps, the `validation_step` method which
operates on a single batch and the `validation_epoch_end` method to compute statistics on all batches.
@@ -698,26 +718,26 @@ Just like the validation loop, we define exactly the same steps for testing:

- test_epoch_end
- test_dataloader

.. testcode::

    class LitMNIST(LightningModule):

        def test_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            loss = F.nll_loss(logits, y)
            return {'val_loss': loss}

        def test_epoch_end(self, outputs):
            avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
            tensorboard_logs = {'val_loss': avg_loss}
            return {'val_loss': avg_loss, 'log': tensorboard_logs}

        def test_dataloader(self):
            transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
            # the MNIST test split has only 10,000 samples, so no 55,000/5,000 split here
            mnist_test = MNIST(os.getcwd(), train=False, download=False, transform=transform)
            return DataLoader(mnist_test, batch_size=64)

However, to make sure the test set isn't used inadvertently, Lightning has a separate API to run tests.
Once you train your model simply call `.test()`.
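For orientation, the call sequence might look like the sketch below (an illustrative example that assumes a
`LitMNIST` instance with default `Trainer` arguments; the exact usage is shown in the test-set docs):

.. code-block:: python

    model = LitMNIST()
    trainer = Trainer()
    trainer.fit(model)

    # runs the test_step / test_epoch_end loop defined above
    trainer.test()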
@@ -773,26 +793,26 @@ On the surface, it looks like `forward` and `training_step` are similar. General

what we want the model to do is what happens in the `forward`, whereas the `training_step` likely calls forward from
within it.

.. testcode::

    class MNISTClassifier(LightningModule):

        def forward(self, x):
            batch_size, channels, width, height = x.size()
            x = x.view(batch_size, -1)
            x = self.layer_1(x)
            x = torch.relu(x)
            x = self.layer_2(x)
            x = torch.relu(x)
            x = self.layer_3(x)
            x = torch.log_softmax(x, dim=1)
            return x

        def training_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            loss = F.nll_loss(logits, y)
            return loss

.. code-block:: python
@@ -802,27 +822,27 @@ within it.

In this case, we've set this LightningModule to predict logits. But we could also have it predict feature maps:

.. testcode::

    class MNISTRepresentator(LightningModule):

        def forward(self, x):
            batch_size, channels, width, height = x.size()
            x = x.view(batch_size, -1)
            x = self.layer_1(x)
            x1 = torch.relu(x)
            x = self.layer_2(x1)
            x2 = torch.relu(x)
            x3 = self.layer_3(x2)
            return [x, x1, x2, x3]

        def training_step(self, batch, batch_idx):
            x, y = batch
            out, l1_feats, l2_feats, l3_feats = self(x)
            logits = torch.log_softmax(out, dim=1)
            ce_loss = F.nll_loss(logits, y)
            loss = perceptual_loss(l1_feats, l2_feats, l3_feats) + ce_loss
            return loss

.. code-block:: python
@@ -832,21 +852,21 @@ In this case, we've set this LightningModel to predict logits. But we could also

Or maybe we have a model that we use to do generation

.. testcode::

    class LitMNISTDreamer(LightningModule):

        def forward(self, z):
            imgs = self.decoder(z)
            return imgs

        def training_step(self, batch, batch_idx):
            x, y = batch
            representation = self.encoder(x)
            imgs = self(representation)

            loss = perceptual_loss(imgs, x)
            return loss

.. code-block:: python
@@ -871,7 +891,7 @@ Any part of the training, validation and testing loop can be modified.

For instance, if you wanted to do your own backward pass, you would override the
default implementation

.. testcode::

    def backward(self, use_amp, loss, optimizer):
        if use_amp:
@@ -882,9 +902,9 @@ default implementation

With your own

.. testcode::

    class LitMNIST(LightningModule):

        def backward(self, use_amp, loss, optimizer):
            # do a custom way of backward
@@ -892,7 +912,7 @@ With your own

Or if you wanted to initialize ddp in a different way than the default one

.. testcode::

    def configure_ddp(self, model, device_ids):
        # Lightning DDP simply routes to test_step, val_step, etc...
@@ -905,9 +925,9 @@ Or if you wanted to initialize ddp in a different way than the default one

you could do your own:

.. testcode::

    class LitMNIST(LightningModule):

        def configure_ddp(self, model, device_ids):
@@ -916,7 +936,7 @@ you could do your own:

            return model

Every single part of training is configurable this way.
For a full list look at `LightningModule <lightning-module.rst>`_.

---------
@@ -925,26 +945,32 @@ Callbacks

Another way to add arbitrary functionality is to add a custom callback
for hooks that you might care about

.. testcode::

    from pytorch_lightning.callbacks import Callback

    class MyPrintingCallback(Callback):

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is init now')

        def on_train_end(self, trainer, pl_module):
            print('do something when training ends')

And pass the callbacks into the trainer

.. testcode::

    trainer = Trainer(callbacks=[MyPrintingCallback()])

.. testoutput::
    :hide:

    Starting to init trainer!
    Trainer is init now

.. note::
    See full list of 12+ hooks in the :ref:`callbacks`.

@@ -1,3 +1,8 @@

.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.core.lightning import LightningModule

Learning Rate Finder
--------------------
@@ -24,17 +29,18 @@ will automatically be run before any training is done. The ``lr`` that is found

and used will be written to the console and logged together with all other
hyperparameters of the model.

.. testcode::

    # default: no automatic learning rate finder
    trainer = Trainer(auto_lr_find=False)

When the ``lr`` or ``learning_rate`` key in hparams exists, this flag sets your learning_rate.
In both cases, if the respective fields are not found, an error will be thrown.

.. testcode::

    class LitModel(LightningModule):

        def __init__(self, hparams):
            self.hparams = hparams
@@ -43,14 +49,14 @@ In both cases, if the respective fields are not found, an error will be thrown.

    # finds learning rate automatically
    # sets hparams.lr or hparams.learning_rate to that learning rate
    trainer = Trainer(auto_lr_find=True)

To use an arbitrary value set it in the parameter.

.. testcode::

    # to set to your own hparams.my_value
    trainer = Trainer(auto_lr_find='my_value')

Under the hood, when you call fit, this is what happens.
@@ -72,7 +78,7 @@ of this would look like

.. code-block:: python

    model = MyModelClass(hparams)
    trainer = Trainer()

    # Run learning rate finder
    lr_finder = trainer.lr_find(model)

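To round out the snippet above, the finder object is then typically queried for a suggested rate (a hedged
sketch; the ``suggestion()`` call is assumed to be available on the returned ``lr_finder`` object in this version):

.. code-block:: python

    # pick a learning rate from the finder results
    new_lr = lr_finder.suggestion()

    # update the model and train with it
    model.hparams.lr = new_lr
    trainer.fit(model)
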
@@ -1,3 +1,9 @@

.. testsetup:: *

    import torch
    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.core.lightning import LightningModule

.. _multi-gpu-training:

Multi-GPU training
@@ -13,7 +19,7 @@ Delete .cuda() or .to() calls

Delete any calls to .cuda() or .to(device).

.. testcode::

    # before lightning
    def forward(self, x):
@@ -30,7 +36,7 @@ Init using type_as

When you need to create a new tensor, use `type_as`.
This will make your code scale to any arbitrary number of GPUs or TPUs with Lightning

.. testcode::

    # before lightning
    def forward(self, x):
@@ -47,7 +53,7 @@ Remove samplers

For multi-node or TPU training, in PyTorch we must use `torch.utils.data.distributed.DistributedSampler`. The
sampler makes sure each GPU sees the appropriate part of your data.

.. testcode::

    # without lightning
    def train_dataloader(self):
@@ -62,7 +68,7 @@ sampler makes sure each GPU sees the appropriate part of your data.

With Lightning, you don't need to do this because it takes care of adding the correct samplers
when needed.

.. testcode::

    # with lightning
    def train_dataloader(self):
@@ -131,10 +137,11 @@ each GPU will process 16 samples, after which the root node will aggregate the r

.. warning:: DP use is discouraged by PyTorch and Lightning. Use ddp which is more stable and at least 3x faster

.. testcode::
    :skipif: torch.cuda.device_count() < 2

    # train on 2 GPUs (using dp mode)
    trainer = Trainer(gpus=2, distributed_backend='dp')

Distributed Data Parallel
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -157,10 +164,10 @@ Distributed Data Parallel

.. code-block:: python

    # train on 8 GPUs (same machine (ie: node))
    trainer = Trainer(gpus=8, distributed_backend='ddp')

    # train on 32 GPUs (4 nodes)
    trainer = Trainer(gpus=8, distributed_backend='ddp', num_nodes=4)

Distributed Data Parallel 2
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -182,7 +189,7 @@ In this case, we can use ddp2 which behaves like dp in a machine and ddp across

.. code-block:: python

    # train on 32 GPUs (4 nodes)
    trainer = Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4)

Horovod
^^^^^^^
@@ -202,15 +209,15 @@ Horovod can be configured in the training script to run with any number of GPUs

.. code-block:: python

    # train Horovod on GPU (number of GPUs / machines provided on command-line)
    trainer = Trainer(distributed_backend='horovod', gpus=1)

    # train Horovod on CPU (number of processes / machines provided on command-line)
    trainer = Trainer(distributed_backend='horovod')

When starting the training job, the driver application will then be used to specify the total
number of worker processes:

.. code-block:: bash

    # run training with 4 GPUs on a single machine
    horovodrun -np 4 python train.py
@@ -226,7 +233,7 @@ DP/DDP2 caveats

In DP and DDP2 each GPU within a machine sees a portion of a batch.
DP and ddp2 roughly do the following:

.. testcode::

    def distributed_forward(batch, model):
        batch = torch.Tensor(32, 8)
@@ -245,7 +252,7 @@ DP and ddp2 roughly do the following:

So, when Lightning calls any of the `training_step`, `validation_step`, `test_step`
you will only be operating on one of those pieces.

.. testcode::

    # the batch here is a portion of the FULL batch
    def training_step(self, batch, batch_idx):
@@ -255,7 +262,7 @@ For most metrics, this doesn't really matter. However, if you want

to add something to your computational graph (like softmax)
using all batch parts you can use the `training_step_end` step.

.. testcode::

    def training_step_end(self, outputs):
        # only use when on dp
@@ -288,7 +295,7 @@ In pseudocode, the full sequence is:

To illustrate why this is needed, let's look at DataParallel

.. testcode::

    def training_step(self, batch, batch_idx):
        x, y = batch
@@ -313,13 +320,13 @@ it will behave the same no matter the backend.

Validation and test step also have the same option when using dp

.. testcode::

    def validation_step_end(self, batch_parts_outputs):
        ...

    def test_step_end(self, batch_parts_outputs):
        ...

Implement Your Own Distributed (DDP) training
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -335,7 +342,7 @@ batch size.

Let's say you have a batch size of 7 in your dataloader.

.. testcode::

    class LitModel(LightningModule):
@@ -344,7 +351,7 @@ Let's say you have a batch size of 7 in your dataloader.

In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes.

.. code-block:: python

    # effective batch size = 7 * 8
    Trainer(gpus=8, distributed_backend='ddp|horovod')
@@ -356,7 +363,7 @@ In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes.

In DDP2, your effective batch size will be 7 * num_nodes.
The reason is that the full batch is visible to all GPUs on the node when using DDP2.

.. code-block:: python

    # effective batch size = 7
    Trainer(gpus=8, distributed_backend='ddp2')

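As a quick illustration of the arithmetic above (plain Python, not Lightning API; the numbers are examples):

.. code-block:: python

    batch_size, gpus, num_nodes = 7, 8, 4

    # ddp / horovod: every process gets its own batch of 7
    effective_ddp = batch_size * gpus * num_nodes   # 7 * 8 * 4 = 224

    # ddp2: the full batch is shared by all GPUs on a node
    effective_ddp2 = batch_size * num_nodes         # 7 * 4 = 28
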
@@ -1,3 +1,7 @@

.. testsetup:: *

    from pytorch_lightning.core.lightning import LightningModule

Multiple Datasets
=================
Lightning supports multiple dataloaders in a few ways.
@@ -14,7 +18,7 @@ dataloaders).

(`reference <https://discuss.pytorch.org/t/train-simultaneously-on-two-datasets/649/2>`_)

.. testcode::

    class ConcatDataset(torch.utils.data.Dataset):
        def __init__(self, *datasets):
@@ -27,6 +31,7 @@ dataloaders).

            return min(len(d) for d in self.datasets)

    class LitModel(LightningModule):

        def train_dataloader(self):
            concat_dataset = ConcatDataset(
                datasets.ImageFolder(traindir_A),
@@ -44,9 +49,11 @@ dataloaders).

        def val_dataloader(self):
            # SAME
            ...

        def test_dataloader(self):
            # SAME
            ...

Test/Val dataloaders
--------------------
@@ -58,7 +65,7 @@ See the following for more details:

- :meth:`~pytorch_lightning.core.LightningModule.val_dataloader`
- :meth:`~pytorch_lightning.core.LightningModule.test_dataloader`

.. testcode::

    def val_dataloader(self):
        loader_1 = DataLoader()

@@ -1,3 +1,10 @@

.. testsetup:: *

    from pytorch_lightning.core.lightning import LightningModule
    from pytorch_lightning.trainer.trainer import Trainer


Quick Start
===========
@@ -13,7 +20,8 @@ To illustrate, here's the typical PyTorch project structure organized in a Light

Step 1: Define a LightningModule
---------------------------------

.. testcode::
    :skipif: not TORCHVISION_AVAILABLE

    import os
@@ -22,10 +30,9 @@ Step 1: Define a LightningModule

    from torch.utils.data import DataLoader
    from torchvision.datasets import MNIST
    from torchvision import transforms
    from pytorch_lightning.core.lightning import LightningModule

    class LitModel(LightningModule):

        def __init__(self):
            super().__init__()
@@ -53,7 +60,8 @@ Step 1: Define a LightningModule

Step 2: Fit with a Trainer
--------------------------

.. testcode::
    :skipif: torch.cuda.device_count() < 8

    from pytorch_lightning import Trainer
@@ -68,13 +76,13 @@ Under the hood, lightning does (in high-level pseudocode):

.. code-block:: python

    model = LitModel()
    train_dataloader = model.train_dataloader()
    optimizer = model.configure_optimizers()

    for epoch in epochs:
        train_outs = []
        for batch in train_dataloader:
            loss = model.training_step(batch)
            loss.backward()
            train_outs.append(loss.detach())
@@ -88,9 +96,9 @@ Validation loop

---------------
To also add a validation loop add the following functions

.. testcode::

    class LitModel(LightningModule):

        def validation_step(self, batch, batch_idx):
            x, y = batch
@@ -118,7 +126,11 @@ And now the trainer will call the validation loop automatically

Under the hood in pseudocode, lightning does the following:

.. testsetup:: *

    train_dataloader = []

.. testcode::

    # ...
    for batch in train_dataloader:
@@ -145,9 +157,9 @@ Test loop

---------
You might also need a test loop

.. testcode::

    class LitModel(LightningModule):

        def test_step(self, batch, batch_idx):
            x, y = batch

@@ -5,7 +5,7 @@ Learning rate scheduling

-------------------------------------
Every optimizer you use can be paired with any `LearningRateScheduler <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.

.. testcode::

    # no LR scheduler
    def configure_optimizers(self):
@@ -44,7 +44,7 @@ Use multiple optimizers (like GANs)

-------------------------------------
To use multiple optimizers return > 1 optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`

.. testcode::

    # one optimizer
    def configure_optimizers(self):
@@ -79,7 +79,7 @@ override the :meth:`optimizer_step` function.

For example, here step optimizer A every 2 batches and optimizer B every 4 batches

.. testcode::

    def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None):
        optimizer.step()
@@ -104,7 +104,7 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch

Here we add a learning-rate warm up

.. testcode::

    # learning rate warm-up
    def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None):

@@ -1,3 +1,8 @@

.. testsetup:: *

    from torch.utils.data import IterableDataset
    from pytorch_lightning.trainer.trainer import Trainer

Sequential Data
================
Lightning has built in support for dealing with sequential data.
@@ -10,9 +15,9 @@ When using PackedSequence, do 2 things:

1. Return either a padded tensor in dataset or a list of variable length tensors in the dataloader collate_fn (example above shows the list implementation).
2. Pack the sequence in forward or training and validation steps depending on use case.

.. testcode::

    # For use in dataloader
    def collate_fn(batch):
        x = [item[0] for item in batch]
        y = [item[1] for item in batch]
@@ -30,7 +35,7 @@ For example, it may save memory to use Truncated Backpropagation Through Time wh

Lightning can handle TBTT automatically via this flag.

.. testcode::

    # DEFAULT (single backwards pass per batch)
    trainer = Trainer(truncated_bptt_steps=None)
@@ -54,7 +59,7 @@ option when using sequential data.

This is because an IterableDataset does not have a ``__len__``, which Lightning needs to calculate
the validation interval when val_check_interval is less than one.

.. testcode::

    # IterableDataset
    class CustomDataset(IterableDataset):
@@ -73,5 +78,7 @@ option when using sequential data.

        dataloader = DataLoader(dataset=iterable_dataset, batch_size=5)
        return dataloader

.. testcode::

    # Set val_check_interval
    trainer = Trainer(val_check_interval=100)

@@ -1,9 +1,14 @@

.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer

Single GPU Training
====================
Make sure you are running on a machine that has at least one GPU. Lightning handles all the NVIDIA flags for you,
there's no need to set them yourself.

.. testcode::
    :skipif: torch.cuda.device_count() < 1

    # train on 1 GPU (using dp mode)
    trainer = Trainer(gpus=1)

@@ -1,103 +1,107 @@

.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer

Computing cluster (SLURM)
=========================

Lightning automates the details behind training on a SLURM-powered cluster.

.. _multi-node:

Multi-node training
-------------------
To train a model using multiple nodes, do the following:

1.  Design your LightningModule.

2.  Enable ddp in the trainer

    .. code-block:: python

        # train on 32 GPUs across 4 nodes
        trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp')

3.  It's a good idea to structure your train.py file like this:

    .. testcode::

        # train.py
        def main(hparams):
            model = LightningTemplateModel(hparams)

            trainer = pl.Trainer(
                gpus=8,
                num_nodes=4,
                distributed_backend='ddp'
            )

            trainer.fit(model)


        if __name__ == '__main__':
            root_dir = os.path.dirname(os.path.realpath(__file__))
            parent_parser = ArgumentParser(add_help=False)
            hyperparams = parent_parser.parse_args()

            # TRAIN
            main(hyperparams)

4.  Create the appropriate SLURM job

    .. code-block:: bash

        # (submit.sh)
        #!/bin/bash -l

        # SLURM SUBMIT SCRIPT
        #SBATCH --nodes=4
        #SBATCH --gres=gpu:8
        #SBATCH --ntasks-per-node=8
        #SBATCH --mem=0
        #SBATCH --time=0-02:00:00

        # activate conda env
        source activate $1

        # -------------------------
        # debugging flags (optional)
        export NCCL_DEBUG=INFO
        export PYTHONFAULTHANDLER=1

        # on your cluster you might need these:
        # set the network interface
        # export NCCL_SOCKET_IFNAME=^docker0,lo

        # might need the latest cuda
        # module load NCCL/2.4.7-1-cuda.10.0
        # -------------------------

        # run script from above
        srun python3 train.py

5.  If you want auto-resubmit (read below), add this line to the submit.sh script

    .. code-block:: bash

        #SBATCH --signal=SIGUSR1@90

6.  Submit the SLURM job

    .. code-block:: bash

        sbatch submit.sh

.. note:: Using :class:`~torch.utils.data.distributed.DistributedSampler` is already handled by Lightning.

Walltime auto-resubmit
----------------------
When you use Lightning in a SLURM cluster, Lightning automatically detects when it is about
to run into the walltime, and it does the following:

1.  Saves a temporary checkpoint.
2.  Requeues the job.
3.  When the job starts, it loads the temporary checkpoint.

To get this behavior, make sure to add the correct signal to your SLURM script

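For reference, the signal line mentioned here is the same one shown in step 5 above; it belongs in the
``#SBATCH`` header of ``submit.sh``:

.. code-block:: bash

    # lets Lightning save a checkpoint and requeue before the walltime hits
    #SBATCH --signal=SIGUSR1@90
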
@@ -1,10 +1,10 @@

Test set
========
Lightning forces the user to run the test set separately to make sure it isn't evaluated by mistake


Test after fit
--------------
To run the test set after training completes, use this method

.. code-block:: python
@@ -15,10 +15,9 @@ To run the test set after training completes, use this method

    # run test set
    trainer.test()


Test pre-trained model
----------------------
To run the test set on a pre-trained model, use this method.

.. code-block:: python
@@ -36,4 +35,4 @@ To run the test set on a pretrained model, use this method.

    trainer.test(model)

In this case, the options you pass to trainer will be used when
running the test set (ie: 16-bit, dp, ddp, etc...)

@@ -1,3 +1,8 @@

.. testsetup:: *

    from pytorch_lightning.trainer.trainer import Trainer


Training Tricks
================
Lightning implements various tricks to help during training
@@ -9,7 +14,7 @@ The effect is a large effective batch size of size KxN.

.. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer`

.. testcode::

    # DEFAULT (ie: no accumulated grads)
    trainer = Trainer(accumulate_grad_batches=1)

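For instance, turning accumulation on could look like the following (the value 4 is only an illustration;
with K=4 and a per-batch size of N, the effective batch size becomes 4xN):

.. code-block:: python

    # accumulate gradients over 4 batches before each optimizer step
    trainer = Trainer(accumulate_grad_batches=4)
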
@@ -22,7 +27,7 @@ norm <https://pytorch.org/docs/stable/nn.html#torch.nn.utils.clip_grad_norm_>`_

.. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer`

.. testcode::

    # DEFAULT (ie: don't clip)
    trainer = Trainer(gradient_clip_val=0)

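Clipping is enabled by passing a non-zero value, for example (0.5 is just an illustrative threshold):

.. code-block:: python

    # clip gradients to a maximum gradient norm of 0.5
    trainer = Trainer(gradient_clip_val=0.5)
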
@@ -1,3 +1,7 @@

.. testsetup:: *

    from pytorch_lightning.core.lightning import LightningModule

Transfer Learning
-----------------
@@ -7,22 +11,22 @@ Using Pretrained Models

Sometimes we want to use a LightningModule as a pretrained model. This is fine because
a LightningModule is just a `torch.nn.Module`!

.. note:: Remember that a LightningModule is EXACTLY a torch.nn.Module but with more capabilities.

Let's use the `AutoEncoder` as a feature extractor in a separate model.

.. testcode::

    class Encoder(torch.nn.Module):
        ...

    class AutoEncoder(LightningModule):
        def __init__(self):
            self.encoder = Encoder()
            self.decoder = Decoder()

    class CIFAR10Classifier(LightningModule):
        def __init__(self):
            # init the pretrained LightningModule
            self.feature_extractor = AutoEncoder.load_from_checkpoint(PATH)
@@ -41,15 +45,16 @@ We used our pretrained Autoencoder (a LightningModule) for transfer learning!

Example: ImageNet (Computer Vision)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. testcode::
    :skipif: not TORCHVISION_AVAILABLE

    import torchvision.models as models

    class ImagenetTransferLearning(LightningModule):
        def __init__(self):
            # init a pretrained resnet
            num_target_classes = 10
            self.feature_extractor = models.resnet50(
                pretrained=True,
                num_classes=num_target_classes)
            self.feature_extractor.eval()
@@ -66,7 +71,7 @@ Finetune

.. code-block:: python

    model = ImagenetTransferLearning()
    trainer = Trainer()
    trainer.fit(model)
@@ -74,7 +79,7 @@ And use it to predict your data of interest

.. code-block:: python

    model = ImagenetTransferLearning.load_from_checkpoint(PATH)
    model.freeze()

    x = some_images_from_cifar10()
@@ -90,26 +95,24 @@ as it is a `torch.nn.Module` subclass.

Here's a model that uses `Huggingface transformers <https://github.com/huggingface/transformers>`_.

.. testcode::

    from transformers import BertModel

    class BertMNLIFinetuner(LightningModule):

        def __init__(self):
            super().__init__()

            self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True)
            self.W = nn.Linear(self.bert.config.hidden_size, 3)
            self.num_classes = 3

        def forward(self, input_ids, attention_mask, token_type_ids):

            h, _, attn = self.bert(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   token_type_ids=token_type_ids)

            h_cls = h[:, 0]
            logits = self.W(h_cls)
            return logits, attn

@@ -1,3 +1,10 @@

.. testsetup:: *

    import os
    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.core.lightning import LightningModule


Saving and loading weights
==========================
@@ -22,13 +29,13 @@ Automatic saving

Checkpointing is enabled by default to the current working directory.
To change the checkpoint path pass in:

.. testcode::

    trainer = Trainer(default_save_path='/your/path/to/save/checkpoints')

To modify the behavior of checkpointing pass in your own callback.

.. testcode::

    from pytorch_lightning.callbacks import ModelCheckpoint
@@ -47,17 +54,16 @@ To modify the behavior of checkpointing pass in your own callback.

Or disable it by passing

.. testcode::

    trainer = Trainer(checkpoint_callback=False)


The Lightning checkpoint also saves the hparams (hyperparams) passed into the LightningModule init.

.. note:: hparams is a `Namespace <https://docs.python.org/2/library/argparse.html#argparse.Namespace>`_.

.. testcode::

    from argparse import Namespace
@@ -67,9 +73,9 @@ The Lightning checkpoint also saves the hparams (hyperparams) passed into the Li

    # define your module to have hparams as the first arg
    # this means your checkpoint will have everything that went into making
    # this model (in this case, learning rate)
    class MyLightningModule(LightningModule):

        def __init__(self, hparams, *args, **kwargs):
            self.hparams = hparams

Manual saving
@@ -78,7 +84,7 @@ You can manually save checkpoints and restore your model from the checkpointed s

.. code-block:: python

    model = MyLightningModule(hparams)
    trainer.fit(model)
    trainer.save_checkpoint("example.ckpt")
    new_model = MyLightningModule.load_from_checkpoint(checkpoint_path="example.ckpt")
@@ -96,9 +102,9 @@ To load a model along with its weights, biases and hyperparameters use following

The above only works if you used `hparams` in your model definition

.. testcode::

    class LitModel(LightningModule):

        def __init__(self, hparams):
            self.hparams = hparams
@@ -106,9 +112,9 @@ The above only works if you used `hparams` in your model definition

But if you don't and instead pass individual parameters

.. testcode::

    class LitModel(LightningModule):

        def __init__(self, in_dim, out_dim):
            self.l1 = nn.Linear(in_dim, out_dim)
@@ -117,7 +123,7 @@ you can restore the model like this

.. code-block:: python

    model = LitModel.load_from_checkpoint(PATH, in_dim=128, out_dim=10)


Restoring Training State