deprecate enable_pl_optimizer as it is not restored properly (#5244)
* update
* clean test
* still in progress
* update test
* update
* update
* resolve flake
* add test for zero_grad
* update
* works without accumulated_grad
* update
* update
* resolve amp
* revert back to True
* update
* clean tests
* cleaned out
* typo
* update test
* repair git bug
* remove print
* update
* Fix formatting/optimizer imports
* Refactor the test for cleanliness
* Add vanilla model to the test, better var names
* Fixed var names, let's clean up these mock tests
* repair test
* update test
* resolve flake8
* add manual_optimization
* update tests
* resolve flake8
* add random accumulate_grad_batches
* improve test
* Update tests/trainer/optimization/test_parity_automatic_optimization.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* Update tests/trainer/optimization/test_parity_automatic_optimization.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* update
* clean tests
* correct bug
* Apply suggestions from code review
* format
* address comments
* update on comments
* wip
* typo
* deprecate enable_pl_optimizer
* resolve latest bugs
* update
* resolve merge
* add comment
* Update pytorch_lightning/core/lightning.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* Update tests/deprecated_api/test_remove_1-3.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* Update pytorch_lightning/trainer/connectors/optimizer_connector.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* Update pytorch_lightning/trainer/trainer.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* Update pytorch_lightning/trainer/trainer.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* Update tests/trainer/optimization/test_parity_automatic_optimization.py
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
* update on comments
* update restore
* add a property
* remove setstate as not needed anymore
* update test
* provide optimizer to on_before_zero_grad
* update on comments
* update on comments
* Update pytorch_lightning/trainer/trainer.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* Update tests/trainer/optimization/test_parity_automatic_optimization.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* Update tests/trainer/optimization/test_parity_automatic_optimization.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* Update tests/trainer/optimization/test_parity_automatic_optimization.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* modify import
* update changelog
* resolve flake8
* update
* update
* clean doc
Co-authored-by: SeanNaren <sean@grid.ai>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
(cherry picked from commit f2e99d617f)

Parent: cccbcc5420
Commit: 0435e23a64
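At a glance, the patch below turns `enable_pl_optimizer` into a deprecated no-op: passing it only emits a `DeprecationWarning`, `trainer.optimizers` stays unwrapped, and `LightningOptimizer` wrappers are cached separately so they can be rebuilt after a trainer is restored. A minimal sketch of the user-facing effect (illustrative only, not part of the patch):

```python
import warnings

import pytorch_lightning as pl

# Old style: opting in explicitly now only triggers a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pl.Trainer(enable_pl_optimizer=True)
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# New style: omit the flag entirely; wrapping happens on demand either way.
trainer = pl.Trainer(max_epochs=1)
```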
@@ -130,6 +130,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244))


### Deprecated

@@ -224,7 +224,8 @@ with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile:

```python
class LitAutoEncoder(pl.LightningModule):
    def training_step(self, batch, batch_idx, opt_idx):
        (opt_a, opt_b) = self.optimizers()
        # access your optimizers with use_pl_optimizer=False. Default is True
        (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)

        loss_a = ...
        self.manual_backward(loss_a, opt_a)

@@ -175,7 +175,8 @@ class SeedTrainLoaderModel(BoringModel):

class SeedTrainLoaderManualModel(SeedTrainLoaderModel):
    def training_step(self, batch, batch_idx, optimizer_idx):
        # manual
        (opt_a, opt_b) = self.optimizers()
        # access your optimizers with use_pl_optimizer=False. Default is True
        (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
        loss_1 = self.step(batch)

        self.manual_backward(loss_1, opt_a)

@@ -269,7 +269,8 @@ Now you own the train loop!

.. code-block:: python

    def training_step(self, batch, batch_idx, opt_idx):
        (opt_a, opt_b, opt_c) = self.optimizers()
        # access your optimizers with use_pl_optimizer=False. Default is True
        (opt_a, opt_b, opt_c) = self.optimizers(use_pl_optimizer=True)

        loss_a = self.generator(batch[0])

@@ -28,8 +28,15 @@ to manually manage the optimization process. To do so, do the following:

.. code-block:: python

    def training_step(self, batch, batch_idx, optimizer_idx):
        # ignore optimizer_idx
        (opt_g, opt_d) = self.optimizers()

        # 1. ignore optimizer_idx
        # 2. `use_pl_optimizer=True` means `opt_g` and `opt_d` will be of type `LightningOptimizer`
        # `LightningOptimizer` simply wraps your optimizer and behaves the same way!
        # When calling `optimizer.step`, `LightningOptimizer` handles TPU, AMP, accumulate_grad_batches, etc. for you.

        # access your optimizers with `use_pl_optimizer=False` or `optimizer.optimizer` when using use_pl_optimizer=True
        # use_pl_optimizer=True is the default
        (opt_g, opt_d) = self.optimizers(use_pl_optimizer=True)

        # do anything you want
        loss_a = ...

@@ -242,19 +249,29 @@ Here we add a learning-rate warm up

        # update params
        optimizer.step(closure=closure)

The default ``optimizer_step`` relies on the internal ``LightningOptimizer`` to properly perform a step.
.. note:: The default ``optimizer_step`` relies on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches, zero_grad, and much more ...

.. testcode::

    from pytorch_lightning.core.optimizer import LightningOptimizer

    # function hook in LightningModule
    def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False):
        optimizer.step(closure=closure)

.. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follows.

.. testcode::

    # function hook in LightningModule
    def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False):
        if not isinstance(optimizer, LightningOptimizer):
            # wraps into LightningOptimizer only for running step
            optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer)

        # `optimizer` is a ``LightningOptimizer`` wrapping the optimizer.
        # To access it, do as follows:
        optimizer = optimizer.optimizer

        # run step. However, it won't work on TPU, AMP, etc...
        optimizer.step(closure=closure)


----------

Using the closure functions for optimization

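The docs section this hunk edits (per its header, "Here we add a learning-rate warm up") pairs the note above with an ``optimizer_step`` override. A hedged sketch of that warm-up pattern, where the 500-step length and the 0.001 base learning rate are arbitrary placeholders:

```python
# function hook in a LightningModule (signature as in the docs above)
def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure,
                   on_tpu=False, using_native_amp=False, using_lbfgs=False):
    # linearly warm up the learning rate over the first 500 optimizer steps
    if self.trainer.global_step < 500:
        lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
        for pg in optimizer.param_groups:
            pg["lr"] = lr_scale * 0.001  # placeholder base learning rate

    # the (wrapped) optimizer performs the actual update
    optimizer.step(closure=closure)
```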
@@ -335,7 +335,8 @@ optimizer behavior

Example::

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        # access your optimizers with use_pl_optimizer=False. Default is True
        opt = self.optimizers(use_pl_optimizer=True)

        loss = ...
        self.manual_backward(loss, opt)

@@ -350,7 +351,8 @@ In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizer

Example::

    def training_step(self, batch, batch_idx, optimizer_idx):
        (opt_a, opt_b) = self.optimizers()
        # access your optimizers with use_pl_optimizer=False. Default is True
        (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)

        gen_loss = ...
        self.manual_backward(gen_loss, opt_a)

@@ -48,8 +48,6 @@ class CPUAccelerator(Accelerator):
        # allow for lr schedulers as well
        self.setup_optimizers(model)

        self.trainer.convert_to_lightning_optimizers()

        self.trainer.model = model

    def train(self):

@@ -189,8 +189,6 @@ class DDP2Accelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
from os.path import abspath
import subprocess
import sys
from os.path import abspath
from time import sleep
from typing import Any, List, Optional, Union

@@ -292,8 +292,6 @@ class DDPAccelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

@@ -148,8 +148,6 @@ class DDPCPUSpawnAccelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # DDP spawn already spawned off each process... no need to do anything
        device_ids = self.get_device_ids()

@@ -14,8 +14,8 @@
from typing import Any, List, Optional, Union

import torch
import torch.distributed as torch_distrib
import torch.distributed as dist
import torch.distributed as torch_distrib
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log

@@ -177,8 +177,6 @@ class DDPHPCAccelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

@@ -163,8 +163,6 @@ class DDPSpawnAccelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

@@ -64,8 +64,6 @@ class DataParallelAccelerator(Accelerator):
        if self.trainer.amp_backend:
            model = self.__init_half_precision(model)

        self.trainer.convert_to_lightning_optimizers()

        self.trainer.model = model

    def __init_torch_data_parallel(self, model):

@@ -53,8 +53,6 @@ class GPUAccelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        self.trainer.model = model

    def train(self):

@@ -90,8 +90,6 @@ class HorovodAccelerator(Accelerator):
        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # Update logger rank info from Horovod to avoid race conditions from different ranks
        # creating directories / writing files in the same locations.
        self.trainer.global_rank = hvd.rank()

@@ -30,6 +30,7 @@ from pytorch_lightning.utilities import (
    rank_zero_info,
    rank_zero_only,
    rank_zero_warn,
    TPU_AVAILABLE,
)
from pytorch_lightning.utilities.cloud_io import atomic_save
from pytorch_lightning.utilities.exceptions import MisconfigurationException

@@ -229,8 +230,6 @@ class TPUAccelerator(Accelerator):
            f' global rank: {trainer.tpu_global_core_rank}'
            f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}')

        self.trainer.convert_to_lightning_optimizers()

    def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
        # do backward pass
        if self.trainer.train_loop.automatic_optimization:

@@ -104,8 +104,11 @@ class LightningModule(
        self._current_dataloader_idx = None
        self._automatic_optimization: bool = True

    def optimizers(self):
        opts = self.trainer.optimizers
    def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]:
        if use_pl_optimizer:
            opts = list(self.trainer.lightning_optimizers.values())
        else:
            opts = self.trainer.optimizers

        # single optimizer
        if isinstance(opts, list) and len(opts) == 1 and isinstance(opts[0], Optimizer):

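For illustration (not part of the patch), a minimal manual-optimization module exercising the new ``optimizers(use_pl_optimizer=...)`` signature; the class name, layer sizes, and Adam settings are placeholders, and the access patterns mirror the doc snippets earlier in the diff:

```python
import torch
import pytorch_lightning as pl


class TwoOptModel(pl.LightningModule):  # hypothetical model, for illustration only
    def __init__(self):
        super().__init__()
        self.gen = torch.nn.Linear(32, 2)
        self.disc = torch.nn.Linear(32, 2)

    @property
    def automatic_optimization(self) -> bool:
        return False  # manual optimization, as in the docs above

    def training_step(self, batch, batch_idx, optimizer_idx):
        # wrapped optimizers (the default); step() takes care of AMP/TPU/accumulation
        (opt_g, opt_d) = self.optimizers(use_pl_optimizer=True)

        gen_loss = self.gen(batch).sum()
        self.manual_backward(gen_loss, opt_g)
        opt_g.step()
        opt_g.optimizer.zero_grad()  # `.optimizer` exposes the raw torch.optim.Adam

        # raw optimizers, if you really need them
        raw_g, raw_d = self.optimizers(use_pl_optimizer=False)
        assert isinstance(raw_g, torch.optim.Adam)

    def configure_optimizers(self):
        return (torch.optim.Adam(self.gen.parameters(), lr=1e-3),
                torch.optim.Adam(self.disc.parameters(), lr=1e-3))
```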
@@ -63,6 +63,10 @@ class LightningOptimizer:
        self._accumulate_grad_batches = accumulate_grad_batches
        self._optimizer_idx = None

    @property
    def optimizer(self):
        return self._optimizer

    @property
    def defaults(self):
        return self._optimizer.defaults

@@ -103,9 +107,13 @@ class LightningOptimizer:
                break

    @classmethod
    def to_lightning_optimizer(cls, optimizer, trainer):
        optimizer = cls(optimizer)
        optimizer._on_trainer_init(trainer)
    def _to_lightning_optimizer(cls, optimizer, trainer, opt_idx):
        # apex overrides the .step function and needs to be wrapped on each step
        if trainer.amp_backend == AMPType.APEX:
            optimizer = cls(optimizer)
            optimizer._on_trainer_init(trainer)
        else:
            optimizer = trainer.lightning_optimizers[opt_idx]
        return optimizer

    def _accumulated_batches_reached(self):

@@ -147,7 +155,7 @@ class LightningOptimizer:
                **kwargs
            )

            trainer.train_loop.on_before_zero_grad(self)
            trainer.train_loop.on_before_zero_grad(optimizer)

            model.optimizer_zero_grad(
                trainer.current_epoch,

@@ -15,8 +15,8 @@ import os
from typing import Any, List, Optional

import torch
import torch.distributed as torch_distrib
from torch import nn
import torch.distributed as torch_distrib
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log

@@ -28,6 +28,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException
if _FAIRSCALE_PIPE_AVAILABLE:
    import fairscale.nn.model_parallel as mpu
    from fairscale.nn import PipeRPCWrapper
    import fairscale.nn.model_parallel as mpu
    from fairscale.nn.pipe import balance as pipe_balance
    from fairscale.nn.pipe import rpc as rpc_pipe
    from fairscale.nn.pipe.pipeline import PipelineStyle

@@ -383,7 +384,6 @@ def register_optimizers(ctx, model):
    model.trainer.optimizers = optimizers
    model.trainer.lr_schedulers = lr_schedulers
    model.trainer.optimizer_frequencies = optimizer_frequencies
    model.trainer.convert_to_lightning_optimizers()


def run_optimizer(ctx, model):

@@ -52,7 +52,10 @@ class NativeAMPPlugin(PrecisionPlugin):

        # unscale gradient to allow analyze within `on_after_backward`
        if not self.trainer.train_loop.should_accumulate() and automatic_optimization:
            self.trainer.scaler.unscale_(optimizer)
            if isinstance(optimizer, LightningOptimizer):
                self.trainer.scaler.unscale_(optimizer.optimizer)
            else:
                self.trainer.scaler.unscale_(optimizer)

        return closure_loss

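The unscale branch above matters because the ``GradScaler`` does not track the wrapper object itself. A hedged sketch of the underlying idea in plain PyTorch (the helper name is invented for illustration):

```python
import torch

scaler = torch.cuda.amp.GradScaler()


def unscale_for_inspection(scaler, optimizer):
    # GradScaler keeps per-optimizer state keyed by the optimizer instance,
    # so hand it the underlying torch.optim object, never a wrapper.
    raw = getattr(optimizer, "optimizer", optimizer)  # LightningOptimizer exposes .optimizer
    scaler.unscale_(raw)
```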
@@ -63,7 +63,7 @@ class DDPShardedPlugin(DDPPlugin):
        optimizers = trainer.optimizers
        for x, optimizer in enumerate(optimizers):
            if is_lightning_optimizer(optimizer):
                optimizer = optimizer._optimizer
                optimizer = optimizer.optimizer
            if not isinstance(optimizer, OSS):
                optim_class = type(optimizer)
                zero_optimizer = OSS(

@@ -73,7 +73,6 @@ class DDPShardedPlugin(DDPPlugin):
                )
                optimizers[x] = zero_optimizer
                del optimizer
        trainer.convert_to_lightning_optimizers()

    def get_model_from_plugin(
        self,

@@ -72,17 +72,7 @@ class ConfigValidator(object):

        trainer.overriden_optimizer_step = is_overridden('optimizer_step', model)
        trainer.overriden_optimizer_zero_grad = is_overridden('optimizer_zero_grad', model)

        enable_pl_optimizer = trainer._enable_pl_optimizer
        automatic_optimization = trainer.train_loop.automatic_optimization
        if trainer.overriden_optimizer_step and not enable_pl_optimizer and automatic_optimization:
            rank_zero_warn(
                "When overriding `LightningModule` optimizer_step with"
                " `Trainer(..., enable_pl_optimizer=False, ...)`,"
                " we won't be calling `.zero_grad` we can't assume when you call your `optimizer.step()`."
                " For Lightning to take care of it, please use `Trainer(enable_pl_optimizer=True)`."
            )

        going_to_accumulate_grad_batches = trainer.accumulation_scheduler.going_to_accumulate_grad_batches()

        has_overriden_optimization_functions = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad

@@ -93,13 +83,6 @@ class ConfigValidator(object):
                ' It ensures optimizer_step or optimizer_zero_grad are called on every batch.'
            )

        if (enable_pl_optimizer) and trainer.overriden_optimizer_zero_grad and not automatic_optimization:
            raise MisconfigurationException(
                'When overriding `LightningModule` optimizer_zero_grad'
                ' and preserving model property `automatic_optimization` as True with'
                ' `Trainer(enable_pl_optimizer=True, ...) is not supported'
            )

    def __verify_eval_loop_configuration(self, model, eval_loop_name):
        step_name = f'{eval_loop_name}_step'

@@ -20,7 +20,11 @@ class OptimizerConnector:
        self.trainer = trainer

    def on_trainer_init(self, enable_pl_optimizer):
        self.trainer._enable_pl_optimizer = enable_pl_optimizer
        if enable_pl_optimizer is not None:
            rank_zero_warn(
                "Trainer argument `enable_pl_optimizer` is deprecated in v1.1.3. It will be removed in v1.3.0",
                DeprecationWarning
            )
        self.trainer.lr_schedulers = []
        self.trainer.optimizers = []
        self.trainer.optimizer_frequencies = []

@@ -67,7 +67,6 @@ class PrecisionConnector:
            self.trainer.amp_backend = AMPType.APEX
            self.backend = ApexPlugin(self.trainer)
            log.warn("LightningOptimizer doesn't support Apex")
            self.trainer._enable_pl_optimizer = False

        if not self.trainer.amp_backend:
            raise ModuleNotFoundError(

@@ -13,6 +13,7 @@
# limitations under the License.

from abc import ABC
from collections import OrderedDict
from typing import List, Optional, Tuple

import torch

@@ -88,8 +89,10 @@ class TrainerOptimizersMixin(ABC):
            optimizer._on_trainer_init(trainer)
            return optimizer

        if self._enable_pl_optimizer:
            self.optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in self.optimizers]
        self._lightning_optimizers = {
            opt_idx: _convert_to_lightning_optimizer(self, opt)
            for opt_idx, opt in enumerate(self.optimizers)
        }

    def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None):
        # Convert each scheduler into dict structure with relevant information

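Since the wrappers are now cached in a dict keyed by optimizer index instead of replacing `trainer.optimizers`, the user-visible effect can be checked roughly as below. A sketch under stated assumptions: `TinyModel`, its layer sizes, and the random dataloader are invented placeholders standing in for the test suite's `BoringModel`:

```python
import torch
import pytorch_lightning as pl
from pytorch_lightning.core.optimizer import LightningOptimizer


class TinyModel(pl.LightningModule):  # hypothetical stand-in for BoringModel
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def train_dataloader(self):
        return torch.utils.data.DataLoader(torch.randn(64, 32), batch_size=8)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


trainer = pl.Trainer(fast_dev_run=True)
trainer.fit(TinyModel())

# wrappers are built unconditionally and cached per optimizer index
assert isinstance(trainer.lightning_optimizers[0], LightningOptimizer)
# the raw optimizer list stays unwrapped now
assert not isinstance(trainer.optimizers[0], LightningOptimizer)
```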
@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import os
from abc import ABC
from argparse import ArgumentParser, Namespace
import inspect
import os
from typing import cast, List, Optional, Type, TypeVar, Union

from pytorch_lightning.accelerators.accelerator import Accelerator

@@ -66,6 +66,7 @@ class TrainerProperties(ABC):
    callbacks: List[Callback]
    num_nodes: int
    num_processes: int
    _lightning_optimizers = None

    @property
    def log_dir(self):

@@ -267,15 +268,16 @@ class TrainerProperties(ABC):
    def get_model(self):
        return self.model_connector.get_model()

    def __getstate__(self):
        # unwrap optimizer
        self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers]
        return self.__dict__
    @property
    def lightning_optimizers(self):
        if self._lightning_optimizers is None:
            self.convert_to_lightning_optimizers()
        return self._lightning_optimizers

    def __setstate__(self, d):
        self.__dict__ = d
        # wrap optimizers if enable_pl_optimizer is True
        self.convert_to_lightning_optimizers()
    def __getstate__(self):
        # remove lightning_optimizers
        self._lightning_optimizers = None
        return self.__dict__

    @property
    def require_distributed_sampler(self):

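This reworked ``__getstate__``/``lightning_optimizers`` pair is what the commit title refers to ("not restored properly"): the cached wrappers are dropped before pickling and rebuilt lazily afterwards. Continuing the `TinyModel` sketch above, a hedged illustration that mirrors the repo's `test_model_torch_save` (the temp path is a placeholder):

```python
import os
import tempfile

import torch

path = os.path.join(tempfile.mkdtemp(), "trainer.pt")  # placeholder scratch location
torch.save(trainer, path)  # __getstate__ clears the wrapper cache before pickling

restored = torch.load(path)
# the cache starts out empty after unpickling and is rebuilt on first access
assert isinstance(restored.lightning_optimizers[0], LightningOptimizer)
```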
@@ -134,7 +134,7 @@ class Trainer(
        distributed_backend: Optional[str] = None,
        automatic_optimization: Optional[bool] = None,
        move_metrics_to_cpu: bool = False,
        enable_pl_optimizer: bool = False,
        enable_pl_optimizer: bool = None,  # todo: remove in v1.3
        multiple_trainloader_mode: str = 'max_size_cycle',
    ):
        r"""

@@ -285,7 +285,8 @@ class Trainer(
            enable_pl_optimizer: If True, each optimizer will be wrapped by
                `pytorch_lightning.core.optimizer.LightningOptimizer`. It allows Lightning to
                handle AMP, TPU, accumulated_gradients, etc..
                handle AMP, TPU, accumulated_gradients, etc.
                .. warning:: Currently deprecated and it will be removed in v1.3

            multiple_trainloader_mode: How to loop over the datasets when there are multiple train loaders.
                In 'max_size_cycle' mode, the trainer ends one epoch when the largest dataset is traversed,

@@ -499,6 +499,9 @@ class TrainLoop:
                'native PyTorch amp and lbfgs are not compatible.'
                ' To request, please file a Github issue in PyTorch and tag @mcarilli')

        # wraps into LightningOptimizer only for running step
        optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx)

        # model hook
        model_ref.optimizer_step(
            self.trainer.current_epoch,

@@ -33,8 +33,6 @@ def test_trainer_callback_system(torch_save):
        limit_train_batches=3,
        limit_test_batches=2,
        progress_bar_refresh_rate=0,
        # todo: enabled since internally we wrap the model for optimizer step, this should be fixed
        enable_pl_optimizer=True
    )

    # no call yet

@@ -561,8 +561,7 @@ def test_checkpointing_with_nan_as_first(tmpdir, mode):


@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir):
def test_checkpoint_repeated_strategy(tmpdir):
    """
    This test validates that the checkpoint can be called when provided to callbacks list
    """

@@ -582,7 +581,6 @@ def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir):
        limit_val_batches=2,
        limit_test_batches=2,
        callbacks=[checkpoint_callback],
        enable_pl_optimizer=enable_pl_optimizer,
        weights_summary=None,
        progress_bar_refresh_rate=0,
    )

@@ -599,7 +597,6 @@ def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir):
        limit_val_batches=2,
        limit_test_batches=2,
        resume_from_checkpoint=checkpoint_callback.best_model_path,
        enable_pl_optimizer=enable_pl_optimizer,
        weights_summary=None,
        progress_bar_refresh_rate=0,
    )

@@ -610,8 +607,7 @@ def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir):


@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir):
def test_checkpoint_repeated_strategy_extended(tmpdir):
    """
    This test validates checkpoint can be called several times without
    increasing internally its global step if nothing run.

@@ -656,7 +652,6 @@ def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir):
        limit_train_batches=limit_train_batches,
        limit_val_batches=3,
        limit_test_batches=4,
        enable_pl_optimizer=enable_pl_optimizer,
        callbacks=[checkpoint_cb],
    )
    trainer = pl.Trainer(**trainer_config)

@@ -22,15 +22,13 @@ from pytorch_lightning.core.optimizer import LightningOptimizer
from tests.base import BoringModel


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_model_torch_save(tmpdir, enable_pl_optimizer):
def test_model_torch_save(tmpdir):
    """Test to ensure torch save does not fail for model and trainer."""
    model = BoringModel()
    num_epochs = 1
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=num_epochs,
        enable_pl_optimizer=enable_pl_optimizer,
    )
    temp_path = os.path.join(tmpdir, 'temp.pt')
    trainer.fit(model)

@@ -39,8 +37,6 @@ def test_model_torch_save(tmpdir, enable_pl_optimizer):
    torch.save(trainer.model, temp_path)
    torch.save(trainer, temp_path)
    trainer = torch.load(temp_path)
    is_lightning_optimizer = isinstance(trainer.optimizers[0], LightningOptimizer)
    assert is_lightning_optimizer if enable_pl_optimizer else not is_lightning_optimizer


@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")

@@ -41,8 +41,7 @@ def test_automatic_optimization(tmpdir):
    trainer.fit(model)


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir):
def test_automatic_optimization_num_calls(tmpdir):

    with patch("torch.optim.SGD.step") as sgd_step, \
            patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \

@@ -71,16 +70,12 @@ def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir):
                if batch_idx % 2 == 0:
                    assert isinstance(optimizer, SGD)
                    optimizer.step(closure=optimizer_closure)
                    if not enable_pl_optimizer:
                        optimizer.zero_grad()

            # update discriminator opt every 4 steps
            if optimizer_idx == 1:
                if batch_idx % 4 == 0:
                    assert isinstance(optimizer, Adam)
                    optimizer.step(closure=optimizer_closure)
                    if not enable_pl_optimizer:
                        optimizer.zero_grad()

    model = TestModel()
    model.training_epoch_end = None

@@ -91,7 +86,6 @@ def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir):
        limit_train_batches=8,
        limit_val_batches=1,
        accumulate_grad_batches=1,
        enable_pl_optimizer=enable_pl_optimizer
    )

    trainer.fit(model)

@@ -102,8 +96,7 @@ def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir):
    assert adam_zero_grad.call_count == 2


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_params_groups_and_state_are_accessible(enable_pl_optimizer, tmpdir):
def test_params_groups_and_state_are_accessible(tmpdir):

    class TestModel(BoringModel):

@@ -136,7 +129,6 @@ def test_params_groups_and_state_are_accessible(enable_pl_optimizer, tmpdir):
        limit_train_batches=8,
        limit_val_batches=1,
        accumulate_grad_batches=1,
        enable_pl_optimizer=enable_pl_optimizer
    )

    trainer.fit(model)

@@ -40,13 +40,12 @@ def test_lightning_optimizer(tmpdir):
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

    groups = "{'dampening': 0, 'initial_lr': 0.1, 'lr': 0.01, 'momentum': 0, 'nesterov': False, 'weight_decay': 0}"
    expected = f"LightningSGD(groups=[{groups}])"
    assert trainer.optimizers[0].__repr__() == expected
    assert trainer._lightning_optimizers[0].__repr__() == expected


def test_lightning_optimizer_from_user(tmpdir):

@@ -68,13 +67,12 @@ def test_lightning_optimizer_from_user(tmpdir):
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

    groups = "{'amsgrad': False, 'betas': (0.9, 0.999), 'eps': 1e-08, 'initial_lr': 0.1, 'lr': 0.01, 'weight_decay': 0}"
    expected = f"LightningAdam(groups=[{groups}])"
    assert trainer.optimizers[0].__repr__() == expected
    assert trainer._lightning_optimizers[0].__repr__() == expected


@patch("torch.optim.Adam.step", autospec=True)

@@ -122,7 +120,6 @@ def test_lightning_optimizer_manual_optimization(mock_sgd_step, mock_adam_step,
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -176,7 +173,6 @@ def test_lightning_optimizer_manual_optimization_and_accumulated_gradients(mock_
        max_epochs=1,
        weights_summary=None,
        accumulate_grad_batches=2,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -259,7 +255,6 @@ def test_lightning_optimizer_automatic_optimization(tmpdir):
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -312,7 +307,6 @@ def test_lightning_optimizer_automatic_optimization_optimizer_zero_grad(tmpdir):
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -372,7 +366,6 @@ def test_lightning_optimizer_automatic_optimization_optimizer_zero_grad_make_opt
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -425,7 +418,6 @@ def test_lightning_optimizer_automatic_optimization_make_optimizer_step_2(tmpdir
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -134,3 +134,8 @@ def test_v1_3_0_trainer_cli_profiler(cli_args, expected_parsed_arg, expected_pro
    assert getattr(args, "profiler") == expected_parsed_arg
    trainer = Trainer.from_argparse_args(args)
    assert isinstance(trainer.profiler, expected_profiler)


def test_trainer_enable_pl_optimizer(tmpdir):
    with pytest.deprecated_call(match='will be removed in v1.3'):
        Trainer(enable_pl_optimizer=True)

@@ -17,13 +17,13 @@ from unittest import mock
import pytest
import torch

import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _APEX_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base import EvalModelTemplate
import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils


@pytest.mark.skip(reason='dp + amp not supported currently')  # TODO

@@ -144,8 +144,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir):
    assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23'


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir):
def test_cpu_model_with_amp(tmpdir):
    """Make sure model trains on CPU."""
    trainer_options = dict(
        default_root_dir=tmpdir,

@@ -154,7 +153,6 @@ def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir):
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        precision=16,
        enable_pl_optimizer=enable_pl_optimizer,
    )

    model = EvalModelTemplate()

@@ -11,23 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from distutils.version import LooseVersion
import os
import platform
from distutils.version import LooseVersion

import pytest
import torch

import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint
from pytorch_lightning.trainer.states import TrainerState
from tests.base import BoringModel
from tests.base import EvalModelTemplate
import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
def test_cpu_slurm_save_load(tmpdir):
    """Verify model save/load/checkpoint on CPU."""
    model = BoringModel()

@@ -43,7 +42,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        callbacks=[ModelCheckpoint(dirpath=tmpdir)],
        enable_pl_optimizer=enable_pl_optimizer,
    )
    trainer.fit(model)
    real_global_step = trainer.global_step

@@ -89,7 +87,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
        default_root_dir=tmpdir,
        max_epochs=1,
        logger=logger,
        enable_pl_optimizer=enable_pl_optimizer,
        callbacks=[_StartCallback(), ModelCheckpoint(dirpath=tmpdir)],
    )
    # by calling fit again, we trigger training, loading weights from the cluster

@@ -97,8 +94,7 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
    trainer.fit(model)


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir):
def test_early_stopping_cpu_model(tmpdir):
    class ModelTrainVal(BoringModel):
        def validation_epoch_end(self, outputs) -> None:
            val_loss = torch.stack([x["x"] for x in outputs]).mean()

@@ -111,7 +107,6 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir):
        gradient_clip_val=1.0,
        overfit_batches=0.20,
        track_grad_norm=2,
        enable_pl_optimizer=enable_pl_optimizer,
        progress_bar_refresh_rate=0,
        accumulate_grad_batches=2,
        limit_train_batches=0.1,

@@ -131,8 +126,7 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir):
@pytest.mark.skipif((platform.system() == "Darwin" and
                     LooseVersion(torch.__version__) < LooseVersion("1.3.0")),
                    reason="Distributed training is not supported on MacOS before Torch 1.3.0")
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir):
def test_multi_cpu_model_ddp(tmpdir):
    """Make sure DDP works."""
    tutils.set_random_master_port()

@@ -145,7 +139,6 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir):
        gpus=None,
        num_processes=2,
        accelerator='ddp_cpu',
        enable_pl_optimizer=enable_pl_optimizer,
    )

    model = BoringModel()

@@ -300,6 +293,25 @@ def test_cpu_model(tmpdir):
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.4
    )

    model = EvalModelTemplate()

    tpipes.run_model_test(trainer_options, model, on_gpu=False)


def test_all_features_cpu_model(tmpdir):
    """Test each of the trainer options."""
    trainer_options = dict(
        default_root_dir=tmpdir,
        gradient_clip_val=1.0,
        overfit_batches=0.20,
        track_grad_norm=2,
        progress_bar_refresh_rate=0,
        accumulate_grad_batches=2,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
    )

@@ -20,11 +20,9 @@ import sys

import numpy as np
import pytest
import torch
from sklearn.metrics import accuracy_score
import torch

import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator
from pytorch_lightning.metrics.classification.accuracy import Accuracy

@@ -32,6 +30,8 @@ from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE
from tests.base import EvalModelTemplate
from tests.base.boring_model import BoringModel
import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from tests.base.models import BasicGAN

if _HOROVOD_AVAILABLE:

@@ -69,8 +69,7 @@ def _run_horovod(trainer_options, on_gpu=False):


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_horovod_cpu(enable_pl_optimizer, tmpdir):
def test_horovod_cpu(tmpdir):
    """Test Horovod running multi-process on CPU."""
    trainer_options = dict(
        default_root_dir=str(tmpdir),

@@ -82,14 +81,12 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir):
        limit_val_batches=0.2,
        accelerator='horovod',
        deterministic=True,
        enable_pl_optimizer=enable_pl_optimizer,
    )
    _run_horovod(trainer_options)


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir):
def test_horovod_cpu_implicit(tmpdir):
    """Test Horovod without specifying a backend, inferring from env set by `horovodrun`."""
    trainer_options = dict(
        default_root_dir=str(tmpdir),

@@ -100,7 +97,6 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir):
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        deterministic=True,
        enable_pl_optimizer=enable_pl_optimizer,
    )
    _run_horovod(trainer_options)

@@ -206,8 +202,7 @@ def test_horovod_transfer_batch_to_gpu(tmpdir):


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir):
def test_horovod_multi_optimizer(tmpdir):
    model = BasicGAN(**EvalModelTemplate.get_default_hparams())

    # fit model

@@ -219,7 +214,6 @@ def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir):
        limit_val_batches=0.2,
        deterministic=True,
        accelerator='horovod',
        enable_pl_optimizer=enable_pl_optimizer,
    )
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

@@ -241,8 +235,7 @@ def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir):

@pytest.mark.skipif(not _HOROVOD_AVAILABLE, reason="Horovod is unavailable")
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_result_reduce_horovod(enable_pl_optimizer, tmpdir):
def test_result_reduce_horovod(tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn

@@ -282,7 +275,6 @@ def test_result_reduce_horovod(enable_pl_optimizer, tmpdir):
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        enable_pl_optimizer=enable_pl_optimizer,
    )

    trainer.fit(model)

@@ -11,22 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from copy import deepcopy
import glob
import logging as log
import os
import pickle
from copy import deepcopy

import cloudpickle
import pytest
import torch

from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.trainer.states import TrainerState
from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate
from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST


class ModelTrainerPropertyParity(Callback):

@@ -51,8 +51,7 @@ class ModelTrainerPropertyParity(Callback):
        self._check_properties(trainer, pl_module)


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
def test_model_properties_resume_from_checkpoint(tmpdir):
    """ Test that properties like `current_epoch` and `global_step`
    in model and trainer are always the same. """
    model = EvalModelTemplate()

@@ -61,7 +60,6 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
        default_root_dir=tmpdir,
        max_epochs=1,
        logger=False,
        enable_pl_optimizer=enable_pl_optimizer,
        callbacks=[checkpoint_callback, ModelTrainerPropertyParity()],  # this performs the assertions
    )
    trainer = Trainer(**trainer_args)

@@ -98,8 +96,7 @@ class CaptureCallbacksBeforeTraining(Callback):
        self.callbacks = deepcopy(trainer.callbacks)


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
def test_callbacks_state_resume_from_checkpoint(tmpdir):
    """ Test that resuming from a checkpoint restores callbacks that persist state. """
    model = EvalModelTemplate()
    callback_capture = CaptureCallbacksBeforeTraining()

@@ -110,7 +107,6 @@ def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
        default_root_dir=tmpdir,
        max_steps=1,
        logger=False,
        enable_pl_optimizer=enable_pl_optimizer,
        callbacks=[
            checkpoint,
            callback_capture,

@@ -137,11 +133,10 @@ def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
    assert before.best_model_score == after.best_model_score


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
def test_callbacks_references_resume_from_checkpoint(tmpdir):
    """ Test that resuming from a checkpoint sets references as expected. """
    model = EvalModelTemplate()
    args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False, "enable_pl_optimizer": enable_pl_optimizer}
    args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False}

    # initial training
    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)

@@ -457,7 +457,6 @@ def test_manual_optimization_and_return_tensor(tmpdir):
        amp_backend='native',
        accelerator="ddp_spawn",
        gpus=2,
        enable_pl_optimizer=True
    )
    trainer.fit(model)

@@ -576,7 +575,6 @@ def test_manual_optimization_and_accumulated_gradient(tmpdir):
        amp_backend='native',
        accumulate_grad_batches=4,
        gpus=1,
        enable_pl_optimizer=True,
    )
    trainer.fit(model)

@@ -651,7 +649,6 @@ def test_multiple_optimizers_step(tmpdir):
        precision=16,
        amp_backend='native',
        gpus=1,
        enable_pl_optimizer=True,
    )

    trainer.fit(model)

@@ -733,7 +730,6 @@ def test_step_with_optimizer_closure(tmpdir):
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
        enable_pl_optimizer=True,
    )

    trainer.fit(model)

@@ -798,7 +794,6 @@ def test_step_with_optimizer_closure_and_accumulated_grad(tmpdir):
        max_epochs=1,
        log_every_n_steps=1,
        accumulate_grad_batches=2,
        enable_pl_optimizer=True,
    )

    trainer.fit(model)

@@ -854,7 +849,6 @@ def test_step_with_optimizer_closure_and_extra_arguments(step_mock, tmpdir):
        max_epochs=1,
        log_every_n_steps=1,
        accumulate_grad_batches=2,
        enable_pl_optimizer=True,
    )

    trainer.fit(model)

@@ -932,7 +926,6 @@ def test_step_with_optimizer_closure_with_different_frequencies(mock_sgd_step, m
        max_epochs=1,
        log_every_n_steps=1,
        accumulate_grad_batches=2,
        enable_pl_optimizer=True,
    )

    trainer.fit(model)

@@ -1041,7 +1034,6 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_ste
        max_epochs=1,
        log_every_n_steps=1,
        accumulate_grad_batches=2,
        enable_pl_optimizer=True,
        gpus=2,
        accelerator="ddp",
    )

@@ -1052,35 +1044,3 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_ste

    expected_calls = [call(closure=ANY, optim='adam')] * 2
    mock_adam_step.assert_has_calls(expected_calls)


def test_step_with_misconfiguraiton_error_when_overriding_optimizer_zero_grad(tmpdir):
    """
    Tests that `optimizer_zero_grad` in manual_optimization triggers a MisconfigurationException
    """
    try:
        class TestModel(BoringModel):

            def optimizer_zero_grad(self, *_):
                pass

            @property
            def automatic_optimization(self) -> bool:
                return False

        model = TestModel()
        model.val_dataloader = None
        model.training_epoch_end = None

        limit_train_batches = 8
        Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=limit_train_batches,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            accumulate_grad_batches=2,
            enable_pl_optimizer=True,
        )
    except MisconfigurationException as ex:
        assert "`Trainer(enable_pl_optimizer=True, ...) is not supported" in str(ex)

@@ -181,9 +181,8 @@ def test_reducelronplateau_scheduling(tmpdir):
    ), 'lr scheduler was not correctly converted to dict'


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_optimizer_return_options(enable_pl_optimizer):
    trainer = Trainer(enable_pl_optimizer=enable_pl_optimizer)
def test_optimizer_return_options():
    trainer = Trainer()
    model = EvalModelTemplate()

    # single optimizer

@@ -11,22 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import pickle
import sys
from argparse import Namespace
from copy import deepcopy
from distutils.version import LooseVersion
import math
import os
from pathlib import Path
import pickle
import sys
from unittest.mock import ANY, call, patch

import cloudpickle
from omegaconf import OmegaConf
import pytest
import torch
from omegaconf import OmegaConf

import tests.base.develop_utils as tutils
from pytorch_lightning import Callback, LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv

@@ -38,6 +36,7 @@ from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base import BoringModel, EvalModelTemplate
import tests.base.develop_utils as tutils


@pytest.fixture

@@ -504,16 +503,13 @@ def test_model_checkpoint_only_weights(tmpdir):


def test_model_freeze_unfreeze():

    model = EvalModelTemplate()

    model.freeze()
    model.unfreeze()


@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt, enable_pl_optimizer):
def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Verify resuming from checkpoint runs the right number of epochs"""
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    monkeypatch.setenv("TORCH_HOME", tmpdir)

@@ -541,7 +537,6 @@ def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_serve
        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)],
        default_root_dir=tmpdir,
        val_check_interval=1.0,
        enable_pl_optimizer=enable_pl_optimizer,
        progress_bar_refresh_rate=0,
        logger=False,
        weights_summary=None,