Drop all result docs. Make the separation between flow and logging clear (#3744)
* remove results docs. separate flow from log
This commit is contained in:
parent
9405c880af
commit
00ba2b24b7
|
@ -50,27 +50,25 @@ The only things that change in the `Autoencoder` model are the init, forward, tr
|
|||
def training_step(self, batch, batch_idx):
|
||||
x, _ = batch
|
||||
|
||||
representation = self(x)
|
||||
representation = self.encoder(x)
|
||||
x_hat = self.decoder(representation)
|
||||
|
||||
loss = self.metric(x, x_hat)
|
||||
return loss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
return self._shared_eval(batch, batch_idx, 'val')
|
||||
self._shared_eval(batch, batch_idx, 'val')
|
||||
|
||||
def test_step(self, batch, batch_idx):
|
||||
return self._shared_eval(batch, batch_idx, 'test')
|
||||
self._shared_eval(batch, batch_idx, 'test')
|
||||
|
||||
def _shared_eval(self, batch, batch_idx, prefix):
|
||||
x, _ = batch
|
||||
representation = self(x)
|
||||
representation = self.encoder(x)
|
||||
x_hat = self.decoder(representation)
|
||||
|
||||
loss = self.metric(x, x_hat)
|
||||
result = pl.EvalResult()
|
||||
result.log(f'{prefix}_loss', loss)
|
||||
return result
|
||||
self.log(f'{prefix}_loss', loss)
|
||||
|
||||
|
||||
and we can train this using the same trainer
|
||||
|
|
|
@ -14,15 +14,6 @@ You can stop an epoch early by overriding :meth:`~pytorch_lightning.core.lightni
|
|||
|
||||
If you do this repeatedly, for every epoch you had originally requested, then this will stop your entire run.
|
||||
|
||||
----------
|
||||
|
||||
Default Epoch End Callback Behavior
|
||||
-----------------------------------
|
||||
By default early stopping will be enabled if the `early_stop_on` key in the EvalResult object is used
|
||||
in either the :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_step` method or
|
||||
the :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end` method.
|
||||
|
||||
|
||||
----------
|
||||
|
||||
Enable Early Stopping using the EarlyStopping Callback
|
||||
|
@ -31,24 +22,17 @@ The
|
|||
:class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping`
|
||||
callback can be used to monitor a validation metric and stop the training when no improvement is observed.
|
||||
|
||||
There are two ways to enable the EarlyStopping callback:
|
||||
To enable it:
|
||||
|
||||
- Set `early_stop_callback=True`.
|
||||
If a dict is returned by
|
||||
:meth:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end`,
|
||||
the callback will look for `val_loss` in the dict
|
||||
and display a warning if `val_loss` is not present.
|
||||
Otherwise, if a :class:`~pytorch_lightning.core.step_result.Result` is returned by
|
||||
:meth:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end`,
|
||||
:meth:`~pytorch_lightning.core.lightning.LightningModule.validation_step` or
|
||||
:meth:`~pytorch_lightning.core.lightning.LightningModule.training_step`,
|
||||
the `early_stop_on` metric, specified in the initialization of the
|
||||
:class:`~pytorch_lightning.core.step_result.Result` object is used
|
||||
and display a warning if it was not specified.
|
||||
- Set `early_stop_callback=True`.
|
||||
- Set `monitor` to the logged metric of your choice
|
||||
|
||||
.. testcode::
|
||||
.. code-block:: python
|
||||
|
||||
trainer = Trainer(early_stop_callback=True)
|
||||
def validation_step(...):
|
||||
self.log('val_loss', loss)
|
||||
|
||||
trainer = Trainer(early_stop_callback=EarlyStopping(monitor='val_loss'))
|
||||
|
||||
- Create the callback object and pass it to the trainer.
|
||||
This allows for further customization.
|
||||
|
|
|
@ -39,7 +39,6 @@ PyTorch Lightning Documentation
|
|||
datamodules
|
||||
loggers
|
||||
metrics
|
||||
results
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
|
|
@ -395,46 +395,6 @@ in the LightningModule
|
|||
Again, this is the same PyTorch code except that it has been organized by the LightningModule.
|
||||
This code is not restricted which means it can be as complicated as a full seq-2-seq, RL loop, GAN, etc...
|
||||
|
||||
TrainResult
|
||||
^^^^^^^^^^^
|
||||
Whenever you'd like to log, or sync values across GPUs use `TrainResult`.
|
||||
|
||||
- log to Tensorboard or the other logger of your choice.
|
||||
- log to the progress-bar.
|
||||
- log on every step.
|
||||
- log aggregate epoch metrics.
|
||||
- average values across GPUs/TPU cores
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(...):
|
||||
return loss
|
||||
|
||||
# equivalent
|
||||
return pl.TrainResult(loss)
|
||||
|
||||
# log a metric
|
||||
result = pl.TrainResult(loss)
|
||||
result.log('train_loss', loss)
|
||||
|
||||
# equivalent
|
||||
result.log('train_loss', loss, on_step=True, on_epoch=False, prog_bar=False, logger=True, reduce_fx=torch.mean)
|
||||
|
||||
When training across accelerators (GPUs/TPUs) you can sync a metric if needed.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# sync across GPUs / TPUs, etc...
|
||||
result.log('train_loss', loss, sync_dist=True)
|
||||
|
||||
If you are only using a training_loop (`training_step`) without a
|
||||
validation or test loop (`validation_step`, `test_step`), you can still use EarlyStopping or automatic checkpointing
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
result = pl.TrainResult(loss, checkpoint_on=loss, early_stop_on=loss)
|
||||
return result
|
||||
|
||||
----------------
|
||||
|
||||
The engineering
|
||||
|
@ -477,30 +437,52 @@ For clarity, we'll recall that the full LightningModule now looks like this.
|
|||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = F.nll_loss(logits, y)
|
||||
|
||||
# using TrainResult to enable logging
|
||||
result = pl.TrainResult(loss)
|
||||
result.log('train_loss', loss)
|
||||
|
||||
return result
|
||||
return loss
|
||||
|
||||
Again, this is the same PyTorch code, except that it's organized by the LightningModule.
|
||||
|
||||
Auto Logging
|
||||
^^^^^^^^^^^^
|
||||
When we added the `TrainResult` in the return dictionary it went into the built-in tensorboard logger.
|
||||
But you could have also logged by calling:
|
||||
Logging
|
||||
^^^^^^^
|
||||
To log to Tensorboard, your favorite logger, and/or the progress bar, use the
|
||||
:func:`~pytorch_lightning.core.lightning.LightningModule.log` method which can be called from
|
||||
any method in the LightningModule.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
# ...
|
||||
loss = ...
|
||||
self.logger.summary.scalar('loss', loss, step=self.global_step)
|
||||
self.log('my_metric', x)
|
||||
|
||||
The :func:`~pytorch_lightning.core.lightning.LightningModule.log` method has a few options:
|
||||
|
||||
- on_step (logs the metric at that step in training)
|
||||
- on_epoch (automatically accumulates and logs at the end of the epoch)
|
||||
- prog_bar (logs to the progress bar)
|
||||
- logger (logs to the logger like Tensorboard)
|
||||
|
||||
Depending on where log is called from, Lightning auto-determines the correct mode for you. But of course
|
||||
you can override the default behavior by manually setting the flags
|
||||
|
||||
.. note:: Setting on_epoch=True will accumulate your logged values over the full training epoch.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
|
||||
|
||||
You can also use any method of your logger directly:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
tensorboard = self.logger.experiment
|
||||
tensorboard.any_summary_writer_method_you_want()
|
||||
|
||||
Once your training starts, you can view the logs by using your favorite logger or booting up the Tensorboard logs:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
tensorboard --logdir ./lightning_logs
|
||||
|
||||
# equivalent
|
||||
result = TrainResult()
|
||||
result.log('loss', loss)
|
||||
|
||||
Which will generate automatic tensorboard logs.
|
||||
|
||||
|
@ -678,20 +660,11 @@ split of the data reaches a minimum.
|
|||
Just like the `training_step`, we can define a `validation_step` to check whatever
|
||||
metrics we care about, generate samples or add more to our logs.
|
||||
|
||||
Since the `validation_step` processes a single batch, use the `EvalResult` to log metrics for the full epoch.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
loss = MSE_loss(...)
|
||||
|
||||
# loss is a tensor. The Checkpoint Callback is monitoring 'checkpoint_on'
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('val_loss', loss)
|
||||
|
||||
# equivalent
|
||||
result.log('val_loss', loss, prog_bar=False, logger=True, on_step=False, on_epoch=True, reduce_fx=torch.mean)
|
||||
return result
|
||||
self.log('val_loss', loss)
|
||||
|
||||
Now we can train with a validation loop as well.
|
||||
|
||||
|
@ -744,13 +717,12 @@ If you still need even more fine-grain control, define the other optional method
|
|||
.. code-block:: python
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
result = pl.EvalResult()
|
||||
result.prediction = some_prediction
|
||||
return result
|
||||
preds = ...
|
||||
return preds
|
||||
|
||||
def validation_epoch_end(self, val_step_outputs):
|
||||
# do something with all the predictions from each validation_step
|
||||
all_predictions = val_step_outputs.prediction
|
||||
for pred in val_step_outputs:
|
||||
# do something with all the predictions from each validation_step
|
||||
|
||||
----------------
|
||||
|
||||
|
@ -768,9 +740,7 @@ Just like the validation loop, we define a test loop
|
|||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = F.nll_loss(logits, y)
|
||||
result = pl.EvalResult()
|
||||
result.log('test_loss', loss)
|
||||
return result
|
||||
self.log('test_loss', loss)
|
||||
|
||||
|
||||
However, to make sure the test set isn't used inadvertently, Lightning has a separate API to run tests.
|
||||
|
|
|
@ -105,7 +105,7 @@ Here are the only required methods.
|
|||
... x, y = batch
|
||||
... y_hat = self(x)
|
||||
... loss = F.cross_entropy(y_hat, y)
|
||||
... return pl.TrainResult(loss)
|
||||
... return loss
|
||||
...
|
||||
... def configure_optimizers(self):
|
||||
... return torch.optim.Adam(self.parameters(), lr=0.02)
|
||||
|
@ -162,7 +162,7 @@ A LightningModule is best used to define a complex system:
|
|||
|
||||
# reconstruction
|
||||
reconstruction_loss = nn.functional.mse_loss(recons, x)
|
||||
return pl.TrainResult(reconstruction_loss)
|
||||
return reconstruction_loss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
x, _ = batch
|
||||
|
@ -170,9 +170,7 @@ A LightningModule is best used to define a complex system:
|
|||
z = self.encoder(x)
|
||||
recons = self.decoder(z)
|
||||
reconstruction_loss = nn.functional.mse_loss(recons, x)
|
||||
|
||||
result = pl.EvalResult(checkpoint_on=reconstruction_loss)
|
||||
return result
|
||||
self.log('val_reconstruction', reconstruction_loss)
|
||||
|
||||
def configure_optimizers(self):
|
||||
return torch.optim.Adam(self.parameters(), lr=0.0002)
|
||||
|
@ -210,12 +208,12 @@ Note that in this case, the train loop and val loop are exactly the same. We can
|
|||
|
||||
def training_step(self, batch, batch_idx):
|
||||
loss = self.shared_step(batch)
|
||||
return pl.TrainResult(loss)
|
||||
|
||||
return loss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
loss = self.shared_step(batch)
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
return result
|
||||
self.log('val_loss', loss)
|
||||
|
||||
def shared_step(self, batch):
|
||||
x, _ = batch
|
||||
|
@ -281,7 +279,7 @@ For cases like production, you might want to iterate different models inside a L
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
return pl.TrainResult(loss)
|
||||
return loss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
|
@ -290,14 +288,14 @@ For cases like production, you might want to iterate different models inside a L
|
|||
acc = FM.accuracy(y_hat, y)
|
||||
|
||||
# loss is tensor. The Checkpoint Callback is monitoring 'checkpoint_on'
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log_dict({'val_acc': acc, 'val_loss': loss})
|
||||
return result
|
||||
metrics = {'val_acc': acc, 'val_loss': loss}
|
||||
self.log_dict(metrics)
|
||||
return metrics
|
||||
|
||||
def test_step(self, batch, batch_idx):
|
||||
result = self.validation_step(batch, batch_idx)
|
||||
result.rename_keys({'val_acc': 'test_acc', 'val_loss': 'test_loss'})
|
||||
return result
|
||||
metrics = self.validation_step(batch, batch_idx)
|
||||
metrics = {'test_acc': metrics['val_acc'], 'test_loss': metrics['val_loss']}
|
||||
self.log_dict(metrics)
|
||||
|
||||
def configure_optimizers(self):
|
||||
return torch.optim.Adam(self.model.parameters(), lr=0.02)
|
||||
|
@ -361,7 +359,7 @@ To add a training loop use the `training_step` method
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
return pl.TrainResult(loss)
|
||||
return loss
|
||||
|
||||
Under the hood, Lightning does the following (pseudocode):
|
||||
|
||||
|
@ -385,7 +383,7 @@ Under the hood, Lightning does the following (pseudocode):
|
|||
|
||||
Training epoch-level metrics
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
If you want to calculate epoch-level metrics and log them, use the `TrainResult.log` method
|
||||
If you want to calculate epoch-level metrics and log them, use the `.log` method
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -393,13 +391,12 @@ If you want to calculate epoch-level metrics and log them, use the `TrainResult.
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.TrainResult(loss)
|
||||
|
||||
# logs metrics for each training_step, and the average across the epoch, to the progress bar and logger
|
||||
result.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
|
||||
return result
|
||||
return loss
|
||||
|
||||
The `TrainResult.log` object automatically reduces the requested metrics across the full epoch.
|
||||
The `.log` method automatically reduces the requested metrics across the full epoch.
|
||||
Here's the pseudocode of what it does under the hood:
|
||||
|
||||
.. code-block:: python
|
||||
|
@ -428,13 +425,12 @@ If you need to do something with all the outputs of each `training_step`, overri
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.TrainResult(loss)
|
||||
result.prediction = some_prediction
|
||||
preds = ...
|
||||
return {'loss': loss, 'other_stuff': preds}
|
||||
|
||||
def training_epoch_end(self, training_step_outputs):
|
||||
all_predictions = training_step_outputs.prediction
|
||||
...
|
||||
return result
|
||||
for pred in training_step_outputs:
|
||||
# do something
|
||||
|
||||
The matching pseudocode is:
|
||||
|
||||
|
@ -467,20 +463,19 @@ In this case, implement the `training_step_end` method
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.TrainResult(loss)
|
||||
result.prediction = some_prediction
|
||||
pred = ...
|
||||
return {'loss': loss, 'pred': pred}
|
||||
|
||||
def training_step_end(self, batch_parts):
|
||||
gpu_0_prediction = batch_parts.prediction[0]
|
||||
gpu_1_prediction = batch_parts.prediction[1]
|
||||
gpu_0_prediction = batch_parts.pred[0]['pred']
|
||||
gpu_1_prediction = batch_parts.pred[1]['pred']
|
||||
|
||||
# do something with both outputs
|
||||
return result
|
||||
return (batch_parts[0]['loss'] + batch_parts[1]['loss']) / 2
|
||||
|
||||
def training_epoch_end(self, training_step_outputs):
|
||||
all_predictions = training_step_outputs.prediction
|
||||
...
|
||||
return result
|
||||
for out in training_step_outputs:
|
||||
# do something with preds
|
||||
|
||||
The full pseudocode that Lightning does under the hood is:
|
||||
|
||||
|
@ -516,8 +511,7 @@ To add a validation loop, override the `validation_step` method of the :class:`~
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
return result
|
||||
self.log('val_loss', loss)
|
||||
|
||||
Under the hood, Lightning does the following:
|
||||
|
||||
|
@ -553,13 +547,12 @@ If you need to do something with all the outputs of each `validation_step`, over
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(loss)
|
||||
result.prediction = some_prediction
|
||||
pred = ...
|
||||
return pred
|
||||
|
||||
def validation_epoch_end(self, validation_step_outputs):
|
||||
all_predictions = validation_step_outputs.prediction
|
||||
...
|
||||
return result
|
||||
for pred in validation_step_outputs:
|
||||
# do something with a pred
|
||||
|
||||
Validating with DataParallel
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
@ -574,20 +567,19 @@ In this case, implement the `validation_step_end` method
|
|||
x, y = batch
|
||||
y_hat = self.model(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(loss)
|
||||
result.prediction = some_prediction
|
||||
pred = ...
|
||||
return {'loss': loss, 'pred': pred}
|
||||
|
||||
def validation_step_end(self, batch_parts):
|
||||
gpu_0_prediction = batch_parts.prediction[0]
|
||||
gpu_1_prediction = batch_parts.prediction[1]
|
||||
gpu_0_prediction = batch_parts.pred[0]['pred']
|
||||
gpu_1_prediction = batch_parts.pred[1]['pred']
|
||||
|
||||
# do something with both outputs
|
||||
return result
|
||||
return (batch_parts[0]['loss'] + batch_parts[1]['loss']) / 2
|
||||
|
||||
def validation_epoch_end(self, validation_step_outputs):
|
||||
all_predictions = validation_step_outputs.prediction
|
||||
...
|
||||
return result
|
||||
for out in validation_step_outputs:
|
||||
# do something with preds
|
||||
|
||||
The full pseudocode that Lightning does under the hood is:
|
||||
|
||||
|
@ -751,6 +743,24 @@ save_hyperparameters
|
|||
|
||||
------------
|
||||
|
||||
Logging methods
|
||||
^^^^^^^^^^^^^^^
|
||||
Use these methods to interact with the loggers
|
||||
|
||||
log
|
||||
~~~
|
||||
|
||||
.. autofunction:: pytorch_lightning.core.lightning.LightningModule.log
|
||||
:noindex:
|
||||
|
||||
log_dict
|
||||
~~~~~~~~
|
||||
|
||||
.. autofunction:: pytorch_lightning.core.lightning.LightningModule.log_dict
|
||||
:noindex:
|
||||
|
||||
------------
|
||||
|
||||
Inference methods
|
||||
^^^^^^^^^^^^^^^^^
|
||||
Use these hooks for inference with a lightning module
|
||||
|
|
|
@ -46,90 +46,40 @@ To use multiple loggers, simply pass in a ``list`` or ``tuple`` of loggers ...
|
|||
|
||||
Logging from a LightningModule
|
||||
------------------------------
|
||||
Use the Result objects to log from any lightning module.
|
||||
Interact with loggers in two ways, automatically and/or manually.
|
||||
|
||||
Training loop logging
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
To log in the training loop use the :class:`TrainResult`.
|
||||
Automatic logging
|
||||
^^^^^^^^^^^^^^^^^
|
||||
Use the :func:`~pytorch_lightning.core.lightning.LightningModule.log` method to log from anywhere in a LightningModule.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
loss = ...
|
||||
self.log('my_metric', x)
|
||||
|
||||
result = pl.TrainResult(minimize=loss)
|
||||
result.log('train_loss', loss)
|
||||
return result
|
||||
The :func:`~pytorch_lightning.core.lightning.LightningModule.log` method has a few options:
|
||||
|
||||
The `Result` object is simply a dictionary that gives you added methods like `log` and `write`
|
||||
and automatically detaches tensors (except for the minimize value).
|
||||
- on_step (logs the metric at that step in training)
|
||||
- on_epoch (automatically accumulates and logs at the end of the epoch)
|
||||
- prog_bar (logs to the progress bar)
|
||||
- logger (logs to the logger like Tensorboard)
|
||||
|
||||
Depending on where log is called from, Lightning auto-determines the correct mode for you. But of course
|
||||
you can override the default behavior by manually setting the flags
|
||||
|
||||
.. note:: Setting on_epoch=True will accumulate your logged values over the full training epoch.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
result = pl.TrainResult(minimize=loss)
|
||||
result.log('train_loss', loss)
|
||||
print(result)
|
||||
def training_step(self, batch, batch_idx):
|
||||
self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
|
||||
|
||||
{'train_loss': tensor([0.2262])}
|
||||
Once your training starts, you can view the logs by using your favorite logger or booting up the Tensorboard logs:
|
||||
|
||||
The `TrainResult` can log at two places in the training, on each step (`TrainResult(on_step=True)`) and
|
||||
the aggregate at the end of the epoch (`TrainResult(on_epoch=True)`).
|
||||
.. code-block:: bash
|
||||
|
||||
.. code-block:: python
|
||||
tensorboard --logdir ./lightning_logs
|
||||
|
||||
for epoch in epochs:
|
||||
epoch_outs = []
|
||||
for batch in train_dataloader():
|
||||
# ......
|
||||
out = training_step(batch)
|
||||
# < ----------- log (on_step=True)
|
||||
epoch_outs.append(out)
|
||||
|
||||
# < -------------- log (on_epoch=True)
|
||||
auto_reduce_log(epoch_outs)
|
||||
|
||||
Validation loop logging
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
To log in the validation loop use the :class:`EvalResult`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
loss = ...
|
||||
|
||||
result = pl.EvalResult()
|
||||
result.log('val_loss', loss)
|
||||
return result
|
||||
|
||||
The `EvalResult` object is simply a dictionary that gives you added methods like `log` and `write`
|
||||
and automatically detaches tensors (except for the minimize value).
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
result = pl.EvalResult()
|
||||
result.log('val_loss', loss)
|
||||
print(result)
|
||||
|
||||
{'val_loss': tensor([0.2262])}
|
||||
|
||||
The `EvalResult` can log at two places in the validation loop, on each step (`EvalResult(on_step=True)`) and
|
||||
the aggregate at the end of the epoch (`EvalResult(on_epoch=True)`).
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def run_val_loop():
|
||||
epoch_outs = []
|
||||
for batch in val_dataloader():
|
||||
out = validation_step(batch)
|
||||
# < ----------- log (on_step=True)
|
||||
epoch_outs.append(out)
|
||||
|
||||
# < -------------- log (on_epoch=True)
|
||||
auto_reduce_log(epoch_outs)
|
||||
|
||||
Test loop logging
|
||||
^^^^^^^^^^^^^^^^^
|
||||
See the previous section.
|
||||
|
||||
Manual logging
|
||||
^^^^^^^^^^^^^^
|
||||
|
@ -144,14 +94,21 @@ For certain things like histograms, text, images, etc... you may need to use the
|
|||
tensorboard.add_histogram(...)
|
||||
tensorboard.add_figure(...)
|
||||
|
||||
This also applies to Callbacks
|
||||
|
||||
|
||||
----------
|
||||
|
||||
Logging from a Callback
|
||||
-----------------------
|
||||
To log from a callback, access the logger object directly
|
||||
To log from a callback, use the :func:`~pytorch_lightning.core.lightning.LightningModule.log`
|
||||
method of the LightningModule.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class MyCallback(Callback):
|
||||
|
||||
def on_train_epoch_end(self, trainer, pl_module):
|
||||
pl_module.log('something', x)
|
||||
|
||||
or access the logger object directly
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
|
|
@ -281,6 +281,7 @@ a forward method or trace only the sub-models you need.
|
|||
autoencoder.to_onnx(tmpfile.name, input_sample, export_params=True)
|
||||
os.path.isfile(tmpfile.name)
|
||||
|
||||
--------------------
|
||||
|
||||
********************
|
||||
Using CPUs/GPUs/TPUs
|
||||
|
@ -347,6 +348,7 @@ Without changing a SINGLE line of your code, you can now do the following with t
|
|||
val_check_interval=0.25
|
||||
)
|
||||
|
||||
-----------
|
||||
|
||||
***********
|
||||
Checkpoints
|
||||
|
@ -369,89 +371,123 @@ If you prefer to do it manually, here's the equivalent
|
|||
model = LitModel()
|
||||
model.load_state_dict(ckpt['state_dict'])
|
||||
|
||||
*****************
|
||||
Optional features
|
||||
*****************
|
||||
---------
|
||||
|
||||
*********
|
||||
Data flow
|
||||
*********
|
||||
Each loop (training, validation, test) has three hooks you can implement:
|
||||
- x_step
|
||||
- x_step_end
|
||||
- x_epoch_end
|
||||
|
||||
To illustrate how data flows, we'll use the training loop (ie: x=training)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
outs = []
|
||||
for batch in data:
|
||||
out = training_step(batch)
|
||||
outs.append(out)
|
||||
training_epoch_end(outs)
|
||||
|
||||
The equivalent in Lightning is:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
prediction = ...
|
||||
return prediction
|
||||
|
||||
def training_epoch_end(self, training_step_outputs):
|
||||
for prediction in training_step_outputs:
|
||||
# do something with these
|
||||
|
||||
In the event that you use DP or DDP2 distributed modes (ie: split a batch across GPUs),
|
||||
use the x_step_end to manually aggregate (or don't implement it to let lightning auto-aggregate for you).
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
for batch in data:
|
||||
model_copies = copy_model_per_gpu(model, num_gpus)
|
||||
batch_split = split_batch_per_gpu(batch, num_gpus)
|
||||
|
||||
gpu_outs = []
|
||||
for model, batch_part in zip(model_copies, batch_split):
|
||||
# LightningModule hook
|
||||
gpu_out = model.training_step(batch_part)
|
||||
gpu_outs.append(gpu_out)
|
||||
|
||||
# LightningModule hook
|
||||
out = training_step_end(gpu_outs)
|
||||
|
||||
The lightning equivalent is:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
loss = ...
|
||||
return loss
|
||||
|
||||
def training_step_end(self, losses):
|
||||
gpu_0_loss = losses[0]
|
||||
gpu_1_loss = losses[1]
|
||||
return (gpu_0_loss + gpu_1_loss) * 1/2
|
||||
|
||||
The validation and test loops have the same structure.
|
||||
|
||||
-----------------
|
||||
|
||||
*****************
|
||||
Logging
|
||||
=======
|
||||
If you want to log to Tensorboard or your favorite logger, and/or the progress bar, use the
|
||||
:func:`~pytorch_lightning.core.lightning.LightningModule.log` method. You can call :func:`~pytorch_lightning.core.lightning.LightningModule.log` from any part of your code, and
|
||||
have full control over how and when the logs are aggregated.
|
||||
*****************
|
||||
To log to Tensorboard, your favorite logger, and/or the progress bar, use the
|
||||
:func:`~pytorch_lightning.core.lightning.LightningModule.log` method which can be called from
|
||||
any method in the LightningModule.
|
||||
|
||||
To enable logging in the training loop:
|
||||
.. code-block:: python
|
||||
|
||||
.. code-block::
|
||||
def training_step(self, batch, batch_idx):
|
||||
self.log('my_metric', x)
|
||||
|
||||
class LitModel(pl.LightningModule):
|
||||
The :func:`~pytorch_lightning.core.lightning.LightningModule.log` method has a few options:
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
...
|
||||
loss = F.mse_loss(x_hat, x)
|
||||
- on_step (logs the metric at that step in training)
|
||||
- on_epoch (automatically accumulates and logs at the end of the epoch)
|
||||
- prog_bar (logs to the progress bar)
|
||||
- logger (logs to the logger like Tensorboard)
|
||||
|
||||
# .log sends to tensorboard/logger, prog_bar also sends to the progress bar
|
||||
self.log('my_train_loss', loss, prog_bar=True)
|
||||
return loss
|
||||
Depending on where log is called from, Lightning auto-determines the correct mode for you. But of course
|
||||
you can override the default behavior by manually setting the flags
|
||||
|
||||
Lightning can aggregate your logs for each epoch by specifying `on_epoch=True`.
|
||||
.. note:: Setting on_epoch=True will accumulate your logged values over the full training epoch.
|
||||
|
||||
.. code-block::
|
||||
.. code-block:: python
|
||||
|
||||
class LitModel(pl.LightningModule):
|
||||
def training_step(self, batch, batch_idx):
|
||||
self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
...
|
||||
loss = F.mse_loss(x_hat, x)
|
||||
You can also use any method of your logger directly:
|
||||
|
||||
# Lightning will compute the mean of `my_train_loss` at the end of each epoch
|
||||
self.log('my_train_loss', loss, on_epoch=True)
|
||||
return loss
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
tensorboard = self.logger.experiment
|
||||
tensorboard.any_summary_writer_method_you_want()
|
||||
|
||||
Anything you log in the validation loop will by default be logged at the end of each epoch:
|
||||
Once your training starts, you can view the logs by using your favorite logger or booting up the Tensorboard logs:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
.. code-block::
|
||||
|
||||
class LitModel(pl.LightningModule):
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
...
|
||||
loss = F.mse_loss(x_hat, x)
|
||||
|
||||
# Lightning will compute the mean of `my_val_loss` across the epoch
|
||||
self.log('my_val_loss', loss)
|
||||
|
||||
You can always override Lightning defaults to customize any behaviour. If you would like to aggregate manually, you can pass data from
|
||||
your :func:`~pytorch_lightning.core.lightning.LightningModule.validation_step` to :func:`~pytorch_lightning.core.lightning.LightningModule.validation_step_end` or :func:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end` by returning a tensor or a dictionary, and you can manually decide what to log in :func:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end`:
|
||||
|
||||
|
||||
.. code-block::
|
||||
|
||||
class LitModel(pl.LightningModule):
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
...
|
||||
|
||||
loss = F.mse_loss(x_hat, x)
|
||||
self.log('val_loss', loss, on_step=False, on_epoch=False)
|
||||
# anything you return will be available in validation_step_end and validation_epoch_end
|
||||
return {'a': gpu_idx}
|
||||
|
||||
def validation_step_end(self, validation_step_output):
|
||||
# {'a': [0, 1, 2, 3]}
|
||||
gpu_0_se = validation_step_output[0]
|
||||
gpu_1_se = validation_step_output[1]
|
||||
gpu_2_se = validation_step_output[2]
|
||||
gpu_3_se = validation_step_output[3]
|
||||
# anything you return will be available in validation_epoch_end
|
||||
return gpu_0_se + gpu_1_se + gpu_2_se + gpu_3_se
|
||||
|
||||
def validation_epoch_end(self, validation_step_outputs):
|
||||
# you can compute your own reduction of metrics or compute anything on values from your validation loop
|
||||
tensorboard --logdir ./lightning_logs
|
||||
|
||||
Read more about :ref:`loggers`.
|
||||
|
||||
----------------
|
||||
|
||||
*****************
|
||||
Optional features
|
||||
*****************
|
||||
|
||||
Callbacks
|
||||
=========
|
||||
|
@ -498,8 +534,8 @@ Things you can do with a callback:
|
|||
:ref:`Learn more about custom callbacks <callbacks>`.
|
||||
|
||||
|
||||
Datamodules
|
||||
===========
|
||||
LightningDataModules
|
||||
====================
|
||||
DataLoaders and data processing code tends to end up scattered around.
|
||||
Make your data code reusable by organizing it into a :class:`~pytorch_lightning.core.datamodule.LightningDataModule`.
|
||||
|
||||
|
@ -630,21 +666,14 @@ Or read our :ref:`introduction_guide` to learn more!
|
|||
-------------
|
||||
|
||||
**********
|
||||
Learn more
|
||||
Community
|
||||
**********
|
||||
|
||||
That's it! Once you build your module, data, and call trainer.fit(), Lightning trainer calls each loop at the correct time as needed.
|
||||
|
||||
You can then boot up your logger or tensorboard instance to view training logs
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
tensorboard --logdir ./lightning_logs
|
||||
Our community of core maintainers and thousands of expert researchers is active on our Slack and Forum. Drop by to
|
||||
hang out, ask Lightning questions or even discuss research!
|
||||
|
||||
Masterclass
|
||||
===========
|
||||
|
||||
Go pro by tuning in to our Masterclass! New episodes every week.
|
||||
We also offer a Masterclass to teach you the advanced uses of Lightning.
|
||||
|
||||
.. image:: _images/general/PTL101_youtube_thumbnail.jpg
|
||||
:width: 500
|
||||
|
|
|
@ -1,248 +0,0 @@
|
|||
.. _results:
|
||||
|
||||
Result
|
||||
======
|
||||
Lightning has two results objects `TrainResult` and `EvalResult`.
|
||||
|
||||
Use these to control:
|
||||
|
||||
- When to log (each step and/or epoch aggregate).
|
||||
- Where to log (progress bar or a logger).
|
||||
- How to sync across accelerators.
|
||||
|
||||
------------------
|
||||
|
||||
Training loop example
|
||||
---------------------
|
||||
Return a `TrainResult` from the Training loop.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch_subset, batch_idx):
|
||||
loss = ...
|
||||
result = pl.TrainResult(minimize=loss)
|
||||
result.log('train_loss', loss, prog_bar=True)
|
||||
return result
|
||||
|
||||
If you'd like to do something special with the outputs other than logging, implement `__epoch_end`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
result = pl.TrainResult(loss)
|
||||
result.some_prediction = some_prediction
|
||||
return result
|
||||
|
||||
def training_epoch_end(self, training_step_output_result):
|
||||
all_train_predictions = training_step_output_result.some_prediction
|
||||
|
||||
training_step_output_result.some_new_prediction = some_new_prediction
|
||||
return training_step_output_result
|
||||
|
||||
--------------------
|
||||
|
||||
Validation/Test loop example
|
||||
-----------------------------
|
||||
Return an `EvalResult` object from a validation/test loop
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
some_metric = ...
|
||||
result = pl.EvalResult(checkpoint_on=some_metric)
|
||||
result.log('some_metric', some_metric, prog_bar=True)
|
||||
return result
|
||||
|
||||
If you'd like to do something special with the outputs other than logging, implement `__epoch_end`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
result = pl.EvalResult(checkpoint_on=some_metric)
|
||||
result.a_prediction = some_prediction
|
||||
return result
|
||||
|
||||
def validation_epoch_end(self, validation_step_output_result):
|
||||
all_validation_step_predictions = validation_step_output_result.a_prediction
|
||||
# do something with the predictions from all validation_steps
|
||||
|
||||
return validation_step_output_result
|
||||
|
||||
|
||||
With the equivalent using the `EvalResult` syntax
|
||||
|
||||
|
||||
------------------
|
||||
|
||||
TrainResult
|
||||
-----------
|
||||
The `TrainResult` basic usage is this:
|
||||
|
||||
minimize
|
||||
^^^^^^^^
|
||||
When using TrainResult, the metric that needs to be minimized is passed to this
|
||||
argument. Internally, the tensor is verified to contain gradients and `.backward()`
|
||||
is called on it.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(...):
|
||||
return TrainResult(some_metric)
|
||||
|
||||
|
||||
checkpoint/early_stop
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
If you are only using a training loop (no val), you can also specify what to monitor for
|
||||
checkpointing or early stopping:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(...):
|
||||
return TrainResult(some_metric, checkpoint_on=metric_a, early_stop_on=metric_b)
|
||||
|
||||
|
||||
|
||||
logging
|
||||
^^^^^^^
|
||||
The main benefit of the `TrainResult` is automatic logging at whatever level you want.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
result = TrainResult(loss)
|
||||
result.log('train_loss', loss)
|
||||
|
||||
# equivalent
|
||||
result.log('train_loss', loss, on_step=True, on_epoch=False, logger=True, prog_bar=False, reduce_fx=torch.mean)
|
||||
|
||||
By default, any log calls will log only that step's metrics to the logger. To change when and where to log
|
||||
update the defaults as needed.
|
||||
|
||||
Change where to log:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# to logger only (default)
|
||||
result.log('train_loss', loss)
|
||||
|
||||
# logger + progress bar
|
||||
result.log('train_loss', loss, prog_bar=True)
|
||||
|
||||
# progress bar only
|
||||
result.log('train_loss', loss, prog_bar=True, logger=False)
|
||||
|
||||
Sometimes you may also want to get epoch level statistics:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# loss at this step
|
||||
result.log('train_loss', loss)
|
||||
|
||||
# loss for the epoch
|
||||
result.log('train_loss', loss, on_step=False, on_epoch=True)
|
||||
|
||||
# loss for the epoch AND step
|
||||
# the logger will show 2 charts: step_train_loss, epoch_train_loss
|
||||
result.log('train_loss', loss, on_epoch=True)
|
||||
|
||||
Finally, you can use your own reduction function instead:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# the total sum for all batches of an epoch
|
||||
result.log('train_loss', loss, on_epoch=True, reduce_fx=torch.sum)
|
||||
|
||||
def my_reduce_fx(all_train_loss):
|
||||
# reduce somehow
|
||||
return result
|
||||
|
||||
result.log('train_loss', loss, on_epoch=True, reduce_fx=my_reduce_fx)
|
||||
|
||||
.. note:: Use this ONLY in the case where your loop is simple and simply logs.
|
||||
|
||||
Finally, you may need more esoteric logging such as something specific to your logger like images:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(...):
|
||||
result = TrainResult(some_metric)
|
||||
result.log('train_loss', loss)
|
||||
|
||||
# also log images (if tensorboard for example)
|
||||
self.logger.experiment.log_figure(...)
|
||||
|
||||
Sync across devices
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
When training on multiple GPUs/CPUs/TPU cores, calculate the global mean of a logged metric as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
result.log('train_loss', loss, sync_dist=True)
|
||||
|
||||
TrainResult API
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
.. autoclass:: pytorch_lightning.core.step_result.TrainResult
|
||||
:noindex:
|
||||
|
||||
------------------
|
||||
|
||||
EvalResult
|
||||
----------
|
||||
The `EvalResult` object has the same usage as the `TrainResult` object.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def validation_step(...):
|
||||
return EvalResult()
|
||||
|
||||
def test_step(...):
|
||||
return EvalResult()
|
||||
|
||||
However, there are some differences:
|
||||
|
||||
Eval minimize
|
||||
^^^^^^^^^^^^^
|
||||
- There is no `minimize` argument (since we don't learn during validation)
|
||||
|
||||
Eval checkpoint/early_stopping
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
If defined in both the `TrainResult` and the `EvalResult` the one in the `EvalResult` will take precedence.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def training_step(...):
|
||||
return TrainResult(loss, checkpoint_on=metric, early_stop_on=metric)
|
||||
|
||||
# metric_a and metric_b will be used for the callbacks and NOT metric
|
||||
def validation_step(...):
|
||||
return EvalResult(checkpoint_on=metric_a, early_stop_on=metric_b)
|
||||
|
||||
Eval logging
|
||||
^^^^^^^^^^^^
|
||||
Logging has the same behavior as `TrainResult` but the logging defaults are different:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# TrainResult logs by default at each step only
|
||||
TrainResult().log('val', val, on_step=True, on_epoch=False, logger=True, prog_bar=False, reduce_fx=torch.mean)
|
||||
|
||||
# EvalResult logs by default at the end of an epoch only
|
||||
EvalResult().log('val', val, on_step=False, on_epoch=True, logger=True, prog_bar=False, reduce_fx=torch.mean)
|
||||
|
||||
Val/Test loop
|
||||
^^^^^^^^^^^^^
|
||||
Eval result can be used in both `test_step` and `validation_step`.
|
||||
|
||||
Sync across devices (v)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
When training on multiple GPUs/CPUs/TPU cores, calculate the global mean of a logged metric as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
result.log('val_loss', loss, sync_dist=True)
|
||||
|
||||
EvalResult API
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
.. autoclass:: pytorch_lightning.core.step_result.EvalResult
|
||||
:noindex:
|
|
@ -53,7 +53,7 @@ class LitAutoEncoder(pl.LightningModule):
|
|||
z = self.encoder(x)
|
||||
x_hat = self.decoder(z)
|
||||
loss = F.mse_loss(x_hat, x)
|
||||
return pl.TrainResult(loss, checkpoint_on=loss)
|
||||
return loss
|
||||
|
||||
def configure_optimizers(self):
|
||||
optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
|
||||
|
|
|
@ -60,17 +60,13 @@ class LitClassifier(pl.LightningModule):
|
|||
x, y = batch
|
||||
y_hat = self.backbone(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('valid_loss', loss)
|
||||
return result
|
||||
self.log('valid_loss', loss)
|
||||
|
||||
def test_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
y_hat = self.backbone(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('test_loss', loss)
|
||||
return result
|
||||
self.log('test_loss', loss)
|
||||
|
||||
def configure_optimizers(self):
|
||||
# self.hparams available because we called self.save_hyperparameters()
|
||||
|
|
|
@ -49,17 +49,13 @@ class LitClassifier(pl.LightningModule):
|
|||
x, y = batch
|
||||
y_hat = self(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('valid_loss', loss)
|
||||
return result
|
||||
self.log('valid_loss', loss)
|
||||
|
||||
def test_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
y_hat = self(x)
|
||||
loss = F.cross_entropy(y_hat, y)
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('test_loss', loss)
|
||||
return result
|
||||
self.log('test_loss', loss)
|
||||
|
||||
def configure_optimizers(self):
|
||||
return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
|
||||
|
|
|
@ -20,7 +20,6 @@ from torch.utils.data import DataLoader
|
|||
from torchvision.datasets import MNIST
|
||||
|
||||
from pytorch_lightning.core import LightningModule
|
||||
from pytorch_lightning import TrainResult
|
||||
from pytorch_lightning.trainer import Trainer
|
||||
|
||||
|
||||
|
@ -128,13 +127,8 @@ class GAN(LightningModule):
|
|||
# adversarial loss is binary cross-entropy
|
||||
g_loss = self.adversarial_loss(self.discriminator(self(z)), valid)
|
||||
tqdm_dict = {'g_loss': g_loss}
|
||||
result = TrainResult(
|
||||
minimize=g_loss,
|
||||
checkpoint_on=True
|
||||
)
|
||||
result.log_dict(tqdm_dict)
|
||||
|
||||
return result
|
||||
self.log_dict(tqdm_dict)
|
||||
return g_loss
|
||||
|
||||
# train discriminator
|
||||
if optimizer_idx == 1:
|
||||
|
@ -156,13 +150,9 @@ class GAN(LightningModule):
|
|||
# discriminator loss is the average of these
|
||||
d_loss = (real_loss + fake_loss) / 2
|
||||
tqdm_dict = {'d_loss': d_loss}
|
||||
result = TrainResult(
|
||||
minimize=d_loss,
|
||||
checkpoint_on=True
|
||||
)
|
||||
result.log_dict(tqdm_dict)
|
||||
self.log_dict(tqdm_dict)
|
||||
|
||||
return result
|
||||
return d_loss
|
||||
|
||||
def configure_optimizers(self):
|
||||
lr = self.lr
|
||||
|
|
|
@ -28,7 +28,6 @@ from pytorch_lightning.core.grads import GradInformation
|
|||
from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks
|
||||
from pytorch_lightning.core.memory import ModelSummary
|
||||
from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, PRIMITIVE_TYPES, ModelIO
|
||||
from pytorch_lightning.core.step_result import EvalResult, TrainResult
|
||||
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
|
||||
from pytorch_lightning.utilities import rank_zero_warn
|
||||
from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
|
||||
|
@ -117,7 +116,7 @@ class LightningModule(
|
|||
# optionally can be set by user
|
||||
self._example_input_array = None
|
||||
self._datamodule = None
|
||||
self._results = None
|
||||
self._results: Result = None
|
||||
self._current_fx_name = ''
|
||||
|
||||
@property
|
||||
|
@ -184,28 +183,28 @@ class LightningModule(
|
|||
|
||||
Example::
|
||||
|
||||
result.log('train_loss', loss)
|
||||
self.log('train_loss', loss)
|
||||
|
||||
# defaults used
|
||||
result.log(
|
||||
name,
|
||||
value,
|
||||
on_step=False,
|
||||
on_epoch=False,
|
||||
logger=True,
|
||||
prog_bar=False,
|
||||
reduce_fx=torch.mean,
|
||||
enable_graph=False
|
||||
)
|
||||
The default behavior per hook is as follows
|
||||
|
||||
.. csv-table:: ``*`` also applies to the test loop
|
||||
:header: "LightningModule Hook", "on_step", "on_epoch", "prog_bar", "logger"
|
||||
:widths: 20, 10, 10, 10, 10
|
||||
|
||||
"training_step", "T", "F", "F", "T"
|
||||
"training_step_end", "T", "F", "F", "T"
|
||||
"training_epoch_end", "F", "T", "F", "T"
|
||||
"validation_step*", "F", "T", "F", "T"
|
||||
"validation_step_end*", "F", "T", "F", "T"
|
||||
"validation_epoch_end*", "F", "T", "F", "T"
|
||||
|
||||
Args:
|
||||
name: key name
|
||||
value: value name
|
||||
prog_bar: if True logs to the progress base
|
||||
prog_bar: if True logs to the progress bar
|
||||
logger: if True logs to the logger
|
||||
on_step: if True logs at this step. None auto-logs for training_step but not validation/test_step
|
||||
on_epoch: if True logs epoch accumulated metrics. None auto-logs for val/test step but not training_step
|
||||
on_step: if True logs at this step. None auto-logs at the training_step but not validation/test_step
|
||||
on_epoch: if True logs epoch accumulated metrics. None auto-logs at the val/test step but not training_step
|
||||
reduce_fx: torch.mean by default
|
||||
tbptt_reduce_fx: function to reduce on truncated back prop
|
||||
tbptt_pad_token: token to use for padding
|
||||
|
@ -392,10 +391,7 @@ class LightningModule(
|
|||
:paramref:`~pytorch_lightning.trainer.trainer.Trainer.truncated_bptt_steps` > 0.
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.TrainResult`
|
||||
|
||||
.. note:: :class:`~pytorch_lightning.core.step_result.TrainResult` is simply a Dict with convenient
|
||||
functions for logging, distributed sync and error checking.
|
||||
torch.Tensor or a dictionary with anything you want (must include the keyword 'loss')
|
||||
|
||||
In this step you'd normally do the forward pass and calculate the loss for a batch.
|
||||
You can also do fancier things like multiple forward passes or something model specific.
|
||||
|
@ -404,31 +400,9 @@ class LightningModule(
|
|||
|
||||
def training_step(self, batch, batch_idx):
|
||||
x, y, z = batch
|
||||
|
||||
# implement your own
|
||||
out = self(x)
|
||||
out = self.encoder(x)
|
||||
loss = self.loss(out, x)
|
||||
|
||||
# TrainResult auto-detaches the loss after the optimization steps are complete
|
||||
result = pl.TrainResult(minimize=loss)
|
||||
|
||||
The return object :class:`~pytorch_lightning.core.step_result.TrainResult` controls where to log,
|
||||
when to log (step or epoch) and syncing with multiple GPUs.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# log to progress bar and logger
|
||||
result.log('train_loss', loss, prog_bar=True, logger=True)
|
||||
|
||||
# sync metric value across GPUs in distributed training
|
||||
result.log('train_loss_2', loss, sync_dist=True)
|
||||
|
||||
# log to progress bar as well
|
||||
result.log('train_loss_2', loss, prog_bar=True)
|
||||
|
||||
# assign arbitrary values
|
||||
result.predictions = predictions
|
||||
result.some_value = 'some_value'
|
||||
return loss
|
||||
|
||||
If you define multiple optimizers, this step will be called with an additional
|
||||
``optimizer_idx`` parameter.
|
||||
|
@ -454,10 +428,7 @@ class LightningModule(
|
|||
...
|
||||
out, hiddens = self.lstm(data, hiddens)
|
||||
...
|
||||
|
||||
# TrainResult auto-detaches hiddens
|
||||
result = pl.TrainResult(minimize=loss, hiddens=hiddens)
|
||||
return result
|
||||
return {'loss': loss, 'hiddens': hiddens}
|
||||
|
||||
Notes:
|
||||
The loss value shown in the progress bar is smoothed (averaged) over the last values,
|
||||
|
@ -488,10 +459,7 @@ class LightningModule(
|
|||
batch_parts_outputs: What you return in `training_step` for each batch part.
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.TrainResult`
|
||||
|
||||
.. note:: :class:`~pytorch_lightning.core.step_result.TrainResult` is simply a Dict with convenient
|
||||
functions for logging, distributed sync and error checking.
|
||||
Anything
|
||||
|
||||
When using dp/ddp2 distributed backends, only a portion of the batch is inside the training_step:
|
||||
|
||||
|
@ -506,7 +474,7 @@ class LightningModule(
|
|||
# softmax uses only a portion of the batch in the denominator
|
||||
loss = self.softmax(out)
|
||||
loss = nce_loss(loss)
|
||||
return pl.TrainResult(loss)
|
||||
return loss
|
||||
|
||||
If you wish to do something with all the parts of the batch, then use this method to do it:
|
||||
|
||||
|
@ -516,24 +484,23 @@ class LightningModule(
|
|||
# batch is 1/num_gpus big
|
||||
x, y = batch
|
||||
|
||||
out = self(x)
|
||||
result = pl.TrainResult()
|
||||
result.out = out
|
||||
out = self.encoder(x)
|
||||
return {'pred': out}
|
||||
|
||||
def training_step_end(self, training_step_outputs):
|
||||
# this out is now the full size of the batch
|
||||
all_outs = training_step_outputs.out
|
||||
gpu_0_pred = training_step_outputs[0]['pred']
|
||||
gpu_1_pred = training_step_outputs[1]['pred']
|
||||
gpu_n_pred = training_step_outputs[n]['pred']
|
||||
|
||||
# this softmax now uses the full batch
|
||||
loss = nce_loss(all_outs)
|
||||
result = pl.TrainResult(loss)
|
||||
return result
|
||||
loss = nce_loss([gpu_0_pred, gpu_1_pred, gpu_n_pred])
|
||||
return loss
|
||||
|
||||
See Also:
|
||||
See the :ref:`multi_gpu` guide for more details.
|
||||
"""
|
||||
|
||||
def training_epoch_end(self, outputs: Union[TrainResult, List[TrainResult]]):
|
||||
def training_epoch_end(self, outputs: List[Any]):
|
||||
"""
|
||||
Called at the end of the training epoch with the outputs of all training steps.
|
||||
Use this in case you need to do something with all the outputs for every training_step.
|
||||
|
@ -552,10 +519,7 @@ class LightningModule(
|
|||
multiple dataloaders, a list containing a list of outputs for each dataloader.
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.TrainResult`
|
||||
|
||||
.. note:: :class:`~pytorch_lightning.core.step_result.TrainResult` is simply a Dict with convenient
|
||||
functions for logging, distributed sync and error checking.
|
||||
None
|
||||
|
||||
Note:
|
||||
If this method is not overridden, this won't be called.
|
||||
|
@ -572,15 +536,12 @@ class LightningModule(
|
|||
|
||||
.. code-block:: python
|
||||
|
||||
def training_epoch_end(self, outputs):
|
||||
epoch_result = pl.TrainResult()
|
||||
for train_result in outputs:
|
||||
all_losses = train_result.minimize
|
||||
# do something with all losses
|
||||
return results
|
||||
def training_epoch_end(self, training_step_outputs):
|
||||
for out in training_step_outputs:
|
||||
# do something here
|
||||
"""
|
||||
|
||||
def validation_step(self, *args, **kwargs) -> EvalResult:
|
||||
def validation_step(self, *args, **kwargs):
|
||||
r"""
|
||||
Operates on a single batch of data from the validation set.
|
||||
In this step you might generate examples or calculate anything of interest like accuracy.
|
||||
|
@ -602,7 +563,7 @@ class LightningModule(
|
|||
(only if multiple val datasets used)
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.EvalResult`
|
||||
None or whatever you want
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -643,9 +604,7 @@ class LightningModule(
|
|||
val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
|
||||
|
||||
# log the outputs!
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log_dict({'val_loss': loss, 'val_acc': val_acc})
|
||||
return result
|
||||
self.log_dict({'val_loss': loss, 'val_acc': val_acc})
|
||||
|
||||
If you pass in multiple val datasets, validation_step will have an additional argument.
|
||||
|
||||
|
@ -664,7 +623,7 @@ class LightningModule(
|
|||
the model goes back to training mode and gradients are enabled.
|
||||
"""
|
||||
|
||||
def validation_step_end(self, *args, **kwargs) -> EvalResult:
|
||||
def validation_step_end(self, *args, **kwargs):
|
||||
"""
|
||||
Use this when validating with dp or ddp2 because :meth:`validation_step`
|
||||
will operate on only part of the batch. However, this is still optional
|
||||
|
@ -686,7 +645,7 @@ class LightningModule(
|
|||
for each batch part.
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.EvalResult`
|
||||
None or anything
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -696,12 +655,10 @@ class LightningModule(
|
|||
# batch is 1/num_gpus big
|
||||
x, y = batch
|
||||
|
||||
out = self(x)
|
||||
out = self.encoder(x)
|
||||
loss = self.softmax(out)
|
||||
loss = nce_loss(loss)
|
||||
result = pl.EvalResult()
|
||||
result.log('val_loss', loss)
|
||||
return result
|
||||
self.log('val_loss', loss)
|
||||
|
||||
# --------------
|
||||
# with validation_step_end to do softmax over the full batch
|
||||
|
@ -710,18 +667,11 @@ class LightningModule(
|
|||
x, y = batch
|
||||
|
||||
out = self(x)
|
||||
result = pl.EvalResult()
|
||||
result.out = out
|
||||
return result
|
||||
return out
|
||||
|
||||
def validation_epoch_end(self, output_results):
|
||||
# this out is now the full size of the batch
|
||||
all_val_step_outs = output_results.out
|
||||
loss = nce_loss(all_val_step_outs)
|
||||
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('val_loss', loss)
|
||||
return result
|
||||
def validation_epoch_end(self, val_step_outputs):
|
||||
for out in val_step_outputs:
|
||||
# do something with these
|
||||
|
||||
See Also:
|
||||
See the :ref:`multi_gpu` guide for more details.
|
||||
|
@ -735,8 +685,8 @@ class LightningModule(
|
|||
"""
|
||||
|
||||
def validation_epoch_end(
|
||||
self, outputs: Union[EvalResult, List[EvalResult]]
|
||||
) -> EvalResult:
|
||||
self, outputs: List[Any]
|
||||
):
|
||||
"""
|
||||
Called at the end of the validation epoch with the outputs of all validation steps.
|
||||
|
||||
|
@ -754,25 +704,19 @@ class LightningModule(
|
|||
are multiple dataloaders, a list containing a list of outputs for each dataloader.
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.EvalResult`
|
||||
None
|
||||
|
||||
Note:
|
||||
If you didn't define a :meth:`validation_step`, this won't be called.
|
||||
|
||||
- The outputs here are strictly for logging or progress bar.
|
||||
- If you don't need to display anything, don't return anything.
|
||||
|
||||
Examples:
|
||||
With a single dataloader:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def validation_epoch_end(self, val_step_outputs):
|
||||
# do something with the outputs of all val batches
|
||||
all_val_preds = val_step_outputs.predictions
|
||||
|
||||
val_step_outputs.some_result = calc_all_results(all_val_preds)
|
||||
return val_step_outputs
|
||||
for out in val_step_outputs:
|
||||
# do something
|
||||
|
||||
With multiple dataloaders, `outputs` will be a list of lists. The outer list contains
|
||||
one entry per dataloader, while the inner list contains the individual outputs of
|
||||
|
@ -784,12 +728,10 @@ class LightningModule(
|
|||
for dataloader_output_result in outputs:
|
||||
dataloader_outs = dataloader_output_result.dataloader_i_outputs
|
||||
|
||||
result = pl.EvalResult()
|
||||
result.log('final_metric', final_value)
|
||||
return result
|
||||
self.log('final_metric', final_value)
|
||||
"""
|
||||
|
||||
def test_step(self, *args, **kwargs) -> EvalResult:
|
||||
def test_step(self, *args, **kwargs):
|
||||
r"""
|
||||
Operates on a single batch of data from the test set.
|
||||
In this step you'd normally generate examples or calculate anything of interest
|
||||
|
@ -812,7 +754,7 @@ class LightningModule(
|
|||
(only if multiple test datasets used).
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.EvalResult`
|
||||
None or anything
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -844,9 +786,7 @@ class LightningModule(
|
|||
test_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
|
||||
|
||||
# log the outputs!
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log_dict({'test_loss': loss, 'test_acc': test_acc})
|
||||
return resultt
|
||||
self.log_dict({'test_loss': loss, 'test_acc': test_acc})
|
||||
|
||||
If you pass in multiple test datasets, :meth:`test_step` will have an additional
|
||||
argument.
|
||||
|
@ -866,7 +806,7 @@ class LightningModule(
|
|||
to training mode and gradients are enabled.
|
||||
"""
|
||||
|
||||
def test_step_end(self, *args, **kwargs) -> EvalResult:
|
||||
def test_step_end(self, *args, **kwargs):
|
||||
"""
|
||||
Use this when testing with dp or ddp2 because :meth:`test_step` will operate
|
||||
on only part of the batch. However, this is still optional
|
||||
|
@ -887,7 +827,7 @@ class LightningModule(
|
|||
batch_parts_outputs: What you return in :meth:`test_step` for each batch part.
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.EvalResult`
|
||||
None or anything
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -899,10 +839,7 @@ class LightningModule(
|
|||
|
||||
out = self(x)
|
||||
loss = self.softmax(out)
|
||||
loss = nce_loss(loss)
|
||||
result = pl.EvalResult()
|
||||
result.log('test_loss', loss)
|
||||
return result
|
||||
|
||||
# --------------
|
||||
# with test_step_end to do softmax over the full batch
|
||||
|
@ -910,19 +847,14 @@ class LightningModule(
|
|||
# batch is 1/num_gpus big
|
||||
x, y = batch
|
||||
|
||||
out = self(x)
|
||||
result = pl.EvalResult()
|
||||
result.out = out
|
||||
return result
|
||||
out = self.encoder(x)
|
||||
return out
|
||||
|
||||
def test_epoch_end(self, output_results):
|
||||
# this out is now the full size of the batch
|
||||
all_test_step_outs = output_results.out
|
||||
loss = nce_loss(all_test_step_outs)
|
||||
|
||||
result = pl.EvalResult(checkpoint_on=loss)
|
||||
result.log('test_loss', loss)
|
||||
return result
|
||||
self.log('test_loss', loss)
|
||||
|
||||
See Also:
|
||||
See the :ref:`multi_gpu` guide for more details.
|
||||
|
@ -936,8 +868,8 @@ class LightningModule(
|
|||
"""
|
||||
|
||||
def test_epoch_end(
|
||||
self, outputs: Union[EvalResult, List[EvalResult]]
|
||||
) -> EvalResult:
|
||||
self, outputs: List[Any]
|
||||
):
|
||||
"""
|
||||
Called at the end of a test epoch with the output of all test steps.
|
||||
|
||||
|
@ -955,14 +887,11 @@ class LightningModule(
|
|||
are multiple dataloaders, a list containing a list of outputs for each dataloader
|
||||
|
||||
Return:
|
||||
:class:`~pytorch_lightning.core.step_result.EvalResult`
|
||||
None
|
||||
|
||||
Note:
|
||||
If you didn't define a :meth:`test_step`, this won't be called.
|
||||
|
||||
- The outputs here are strictly for logging or progress bar.
|
||||
- If you don't need to display anything, don't return anything.
|
||||
|
||||
Examples:
|
||||
With a single dataloader:
|
||||
|
||||
|
@ -972,8 +901,8 @@ class LightningModule(
|
|||
# do something with the outputs of all test batches
|
||||
all_test_preds = test_step_outputs.predictions
|
||||
|
||||
test_step_outputs.some_result = calc_all_results(all_test_preds)
|
||||
return test_step_outputs
|
||||
some_result = calc_all_results(all_test_preds)
|
||||
self.log(some_result)
|
||||
|
||||
With multiple dataloaders, `outputs` will be a list of lists. The outer list contains
|
||||
one entry per dataloader, while the inner list contains the individual outputs of
|
||||
|
@ -982,12 +911,13 @@ class LightningModule(
|
|||
.. code-block:: python
|
||||
|
||||
def test_epoch_end(self, outputs):
|
||||
for dataloader_output_result in outputs:
|
||||
dataloader_outs = dataloader_output_result.dataloader_i_outputs
|
||||
final_value = 0
|
||||
for dataloader_outputs in outputs:
|
||||
for test_step_out in dataloader_outputs:
|
||||
# do something
|
||||
final_value += test_step_out
|
||||
|
||||
result = pl.EvalResult()
|
||||
result.log('final_metric', final_value)
|
||||
return results
|
||||
self.log('final_metric', final_value)
|
||||
"""
|
||||
|
||||
def configure_ddp(
|
||||
|
|
Loading…
Reference in New Issue