lightning/pytorch_lightning/plugins/apex.py

145 lines
5.2 KiB
Python
Raw Normal View History

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple, Union
2020-10-19 20:20:17 +00:00
import torch
from torch.optim.optimizer import Optimizer
2020-10-19 20:20:17 +00:00
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin
from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType
from pytorch_lightning.utilities.distributed import rank_zero_warn
if _APEX_AVAILABLE:
from apex import amp
class ApexPlugin(PrecisionPlugin):
def __init__(self, trainer=None):
self.trainer = trainer
def connect(self, model, optimizers):
model, optimizers = self.configure_apex(amp, model, optimizers, self.trainer.amp_level)
self.trainer.reinit_scheduler_properties(optimizers, self.trainer.lr_schedulers)
return model, optimizers
def training_step(self, fx, args):
output = fx(args)
return output
def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
closure_loss = amp.scale_loss(closure_loss, optimizer)
# enter apex context
self.trainer.dev_debugger.track_event('AMP', str(AMPType.APEX))
context = closure_loss
closure_loss = closure_loss.__enter__()
# do backward pass
if self.trainer.train_loop.automatic_optimization:
model = self.trainer.get_model()
model.backward(closure_loss, optimizer, opt_idx)
else:
closure_loss.backward(*args, **kwargs)
# exit amp context
a, b, c = None, None, None
error = context.__exit__(a, b, c)
if error:
rank_zero_warn(a, b, c)
raise Exception('apex unscale error')
# once backward has been applied, release graph
closure_loss = closure_loss.detach()
return closure_loss
def configure_apex(
self,
amp: object,
2020-10-19 20:20:17 +00:00
model: LightningModule,
optimizers: List[Optimizer],
amp_level: str,
2020-10-19 20:20:17 +00:00
) -> Tuple[LightningModule, List[Optimizer]]:
r"""
Override to init AMP your own way.
Must return a model and list of optimizers.
Args:
amp: pointer to amp library object.
model: pointer to current :class:`LightningModule`.
optimizers: list of optimizers passed in :meth:`configure_optimizers`.
amp_level: AMP mode chosen ('O1', 'O2', etc...)
Return:
Apex wrapped model and optimizers
Examples:
.. code-block:: python
# Default implementation used by Trainer.
def configure_apex(self, amp, model, optimizers, amp_level):
model, optimizers = amp.initialize(
model, optimizers, opt_level=amp_level,
)
return model, optimizers
"""
model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
return model, optimizers
def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float):
"""
This code is a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights.
This is important when setting amp_level to O2, and the master weights are in fp16.
Args:
grad_clip_val: Maximum norm of gradients.
optimizer: Optimizer with gradients that will be clipped.
norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
"""
model = self.trainer.get_model()
parameters = model.parameters()
max_norm = float(grad_clip_val)
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = [p for p in parameters if p.grad is not None]
if len(parameters) == 0:
return torch.tensor(0.)
device = parameters[0].grad.device
total_norm = torch.norm(
torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon)
if clip_coef < 1:
for p in parameters:
p.grad.detach().mul_(clip_coef.to(p.grad.device))
@property
def norm_clipping_epsilon(self):
return 1e-5
optimizer clean up (#4658) * add LightningOptimizer * typo * add mock closure * typo * remove logic in optimizer_step * update * update * update * desactivate LightningOptimizer for hovorod * resolve flake * typo * check optimizer name * change name * added backward to LightningOptimizer * remove use_lightning_optimizer * move update * simplify init * resolve comments * resolve bug * update * update * resolve bugs * resolve flake8 * set state * work manual_optimizer_step * add doc * add enable_pl_optimizer * make optimizer_step * add make_optimizer_step * add examples * resolve test * add test_optimizer_return_options_enable_pl_optimizer * add enable_pl_optimizer=True * update * update tests * resolve bugs * update * set Trainer to False * update * resolve bugs * update * remove from doc * resolve bug * typo * update * set to True * simplification * typo * resolve horovod * unwrap horovod * remove Optimizer * resolve horovod * move logic to amp_backend * doesn't seem to be pickable * update * add again * resolve some bugs * cleanup * resolve bug with AMP * change __repr__ * round at -12 * udpate * update * update * remove from horovod * typo * add convert_to_lightning_optimizers in each accelerators * typo * forgot * forgot a convert_to_lightning_optimizers * update * update * update * increase coverage * update * resolve flake8 * update * remove useless code * resolve comments + add support for LightningOptimizer base class * resolve flake * check optimizer get wrapped back * resolve DDPSharded * reduce code * lightningoptimizer * Update pytorch_lightning/core/optimizer.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update pytorch_lightning/core/lightning.py * remove reference to step function * Apply suggestions from code review * update on comments * resolve * Update CHANGELOG.md * add back training_step in apex and native_amp * rename optimizer_step Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: William Falcon <waf2107@columbia.edu> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
2020-12-01 00:09:46 +00:00
def optimizer_step(self, trainer, optimizer, closure):
# apex amp does not yet support closures.
# TODO: pass the closure to the step ASAP
with trainer.profiler.profile("closure"):
closure()
if not self.trainer.train_loop.automatic_optimization:
trainer.call_hook("on_after_backward")
optimizer clean up (#4658) * add LightningOptimizer * typo * add mock closure * typo * remove logic in optimizer_step * update * update * update * desactivate LightningOptimizer for hovorod * resolve flake * typo * check optimizer name * change name * added backward to LightningOptimizer * remove use_lightning_optimizer * move update * simplify init * resolve comments * resolve bug * update * update * resolve bugs * resolve flake8 * set state * work manual_optimizer_step * add doc * add enable_pl_optimizer * make optimizer_step * add make_optimizer_step * add examples * resolve test * add test_optimizer_return_options_enable_pl_optimizer * add enable_pl_optimizer=True * update * update tests * resolve bugs * update * set Trainer to False * update * resolve bugs * update * remove from doc * resolve bug * typo * update * set to True * simplification * typo * resolve horovod * unwrap horovod * remove Optimizer * resolve horovod * move logic to amp_backend * doesn't seem to be pickable * update * add again * resolve some bugs * cleanup * resolve bug with AMP * change __repr__ * round at -12 * udpate * update * update * remove from horovod * typo * add convert_to_lightning_optimizers in each accelerators * typo * forgot * forgot a convert_to_lightning_optimizers * update * update * update * increase coverage * update * resolve flake8 * update * remove useless code * resolve comments + add support for LightningOptimizer base class * resolve flake * check optimizer get wrapped back * resolve DDPSharded * reduce code * lightningoptimizer * Update pytorch_lightning/core/optimizer.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update pytorch_lightning/core/lightning.py * remove reference to step function * Apply suggestions from code review * update on comments * resolve * Update CHANGELOG.md * add back training_step in apex and native_amp * rename optimizer_step Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: William Falcon <waf2107@columbia.edu> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
2020-12-01 00:09:46 +00:00
with trainer.profiler.profile("optimizer_step"):
optimizer.step()