# lightning/pytorch_lightning/accelerators/base_accelerator.py

import os
import math
from enum import Enum
from typing import Any

import torch

from pytorch_lightning.utilities import AMPType, rank_zero_warn
from pytorch_lightning.utilities.apply_func import move_data_to_device
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.parsing import AttributeDict
import torch.distributed as torch_distrib
from pytorch_lightning import _logger as log

# apex is an optional dependency: `amp` stays None when it cannot be imported.
try:
    from apex import amp
except ImportError:
    amp = None

# Small constants added to the total gradient norm in `Accelerator._clip_gradients`
# before dividing by it; the FP16 value is picked when the trainer precision is 16.
EPSILON = 1e-6
EPSILON_FP16 = 1e-5


class Accelerator(object):
    """Base accelerator: default (mostly pass-through) hooks around a trainer.

    Most hooks either do nothing or return their input unchanged. The
    non-trivial ones implement the backward pass, the optimizer step,
    gradient clipping, optimizer setup and the DDP process-group init.
    """

    def __init__(self, trainer=None, cluster_environment=None):
        """Store the trainer / cluster environment and bind the loop entry points.

        Args:
            trainer: the trainer this accelerator drives; may be ``None``.
            cluster_environment: object queried by :meth:`init_ddp_connection`
                for the master address, master port and world size.
        """
        self.trainer = trainer
        self.nickname = None
        self.cluster_environment = cluster_environment
        self.dist = AttributeDict(rank=0, device=None)

        # the loop attributes are only bound when a trainer is available
        if trainer is not None:
            self.train_loop = self.trainer.train
            self.validation_loop = self.trainer.run_evaluation
            self.test_loop = self.trainer.run_evaluation

    def setup(self, model):
        """Hook that receives the model; does nothing in the base class."""
        pass

    def teardown(self):
        """Teardown hook; does nothing in the base class."""
        pass

    def barrier(self, name: str = None):
        """Synchronization hook; does nothing in the base class."""
        pass

    def broadcast(self, obj, src=0):
        """Return ``obj`` unchanged (``src`` is ignored in the base class)."""
        return obj

    def train_or_test(self):
        """Run ``trainer.run_test()`` when testing, else ``trainer.train()``.

        Returns:
            Whatever the selected trainer method returns.
        """
        if self.trainer.testing:
            results = self.trainer.run_test()
        else:
            results = self.trainer.train()
        return results

    def batch_to_device(self, batch: Any, device: torch.device):
        """Move ``batch`` to ``device``.

        Uses the model's ``transfer_batch_to_device`` hook when the trainer
        has a model, otherwise falls back to ``move_data_to_device``.
        """
        model = self.trainer.get_model()
        if model is not None:
            return model.transfer_batch_to_device(batch, device)
        return move_data_to_device(batch, device)

    def training_step_end(self, output):
        """Return the training step output unchanged."""
        return output

    def test_step_end(self, output):
        """Return the test step output unchanged."""
        return output

    def validation_step_end(self, output):
        """Return the validation step output unchanged."""
        return output

    def process_dataloader(self, dataloader):
        """Return the dataloader unchanged."""
        return dataloader

    def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
        """Run the backward pass for ``closure_loss``.

        With 16-bit precision the call is delegated to the trainer's precision
        backend; otherwise ``closure_loss.backward`` is called directly and
        the loss is detached afterwards.

        Args:
            closure_loss: the loss tensor to back-propagate.
            optimizer: forwarded to the precision backend in the 16-bit case.
            opt_idx: accepted for interface compatibility; not used here.
            *args, **kwargs: forwarded to the underlying ``backward`` call.

        Returns:
            The value returned by the precision backend (16-bit), or the
            detached loss.
        """
        if self.trainer.precision == 16:
            closure_loss = self.trainer.precision_connector.backend.backward(closure_loss, optimizer, *args, **kwargs)
        else:
            # do backward pass
            closure_loss.backward(*args, **kwargs)

            # once backward has been applied, release graph
            closure_loss = closure_loss.detach()
        return closure_loss

    def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):
        """Invoke the model's ``optimizer_step`` hook.

        Raises:
            MisconfigurationException: if native AMP is combined with an
                ``torch.optim.LBFGS`` optimizer.
        """
        model_ref = self.trainer.get_model()
        is_lbfgs = isinstance(optimizer, torch.optim.LBFGS)
        native_amp = self.trainer.amp_backend == AMPType.NATIVE

        # native amp + lbfgs is a no go right now
        if native_amp and is_lbfgs:
            raise MisconfigurationException(
                'native PyTorch amp and lbfgs are not compatible.'
                ' To request, please file a Github issue in PyTorch and tag @mcarilli')

        # model hook
        model_ref.optimizer_step(
            self.trainer.current_epoch,
            batch_idx,
            optimizer,
            opt_idx,
            lambda_closure,
            using_native_amp=native_amp,
            using_lbfgs=is_lbfgs
        )

        # scale when native amp
        if native_amp:
            self.trainer.scaler.update()

    def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):
        """Invoke the model's ``optimizer_zero_grad`` hook for the current epoch."""
        model_ref = self.trainer.get_model()
        model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)

    def clip_gradients(self, optimizer):
        """Unscale gradients (native AMP only) and then clip them."""
        if self.trainer.amp_backend == AMPType.NATIVE:
            self.trainer.scaler.unscale_(optimizer)

        # apply clip gradients
        # TODO: separate TPU case from here
        self._clip_gradients(optimizer)

    def _clip_gradients(self, optimizer):
        """Clip gradients to ``trainer.gradient_clip_val`` using the 2-norm.

        Does nothing when ``gradient_clip_val`` is not positive, or when no
        parameter currently holds a gradient.
        """
        # this code is a modification of torch.nn.utils.clip_grad_norm_
        # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
        if self.trainer.gradient_clip_val <= 0:
            return

        if self.trainer.amp_backend == AMPType.APEX:
            parameters = amp.master_params(optimizer)
        else:
            parameters = self.trainer.get_model().parameters()

        max_norm = float(self.trainer.gradient_clip_val)
        norm_type = float(2.0)
        eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON

        self._clip_grad_norm(parameters, max_norm, norm_type, eps)

    @staticmethod
    def _clip_grad_norm(parameters, max_norm, norm_type, eps):
        """Scale gradients in place so their total norm is at most ``max_norm``.

        Args:
            parameters: a tensor or an iterable of tensors; entries whose
                ``grad`` is ``None`` are skipped.
            max_norm: upper bound for the total gradient norm.
            norm_type: order of the norm; ``math.inf`` selects the max-abs norm.
            eps: added to the total norm before dividing by it.
        """
        if isinstance(parameters, torch.Tensor):
            parameters = [parameters]
        parameters = [p for p in parameters if p.grad is not None]

        # nothing to clip; also avoids indexing into an empty list below
        if not parameters:
            return

        # resolved before branching so that both norm variants can use it
        device = parameters[0].device

        if norm_type == math.inf:
            total_norm = max(p.grad.data.abs().max().to(device) for p in parameters)
        else:
            # per-parameter norms are written into one tensor on `device`
            out = torch.empty(len(parameters), device=device)
            for i, p in enumerate(parameters):
                torch.norm(p.grad.data.to(device), norm_type, out=out[i])
            total_norm = torch.norm(out, norm_type)

        clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
        # capping the coefficient at 1 keeps the scaling branch-free on tensors
        clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
        for p in parameters:
            p.grad.data.mul_(clip_coef.to(p.grad.data.device))

    def on_train_epoch_end(self, outputs):
        """Hook receiving the epoch outputs; does nothing in the base class."""
        pass

    def on_train_end(self):
        """End-of-training hook; does nothing in the base class."""
        pass

    def early_stopping_should_stop(self, pl_module):
        """Return ``trainer.should_stop`` (``pl_module`` is not used here)."""
        return self.trainer.should_stop

    def setup_optimizers(self, model):
        """Initialise optimizers and schedulers on the trainer.

        Skipped entirely when ``trainer.testing`` is ``True``.
        """
        if self.trainer.testing is True:
            return

        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

    def init_ddp_connection(
        self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True
    ) -> None:
        """Export the rendezvous environment and initialise the process group.

        ``MASTER_ADDR``, ``MASTER_PORT`` and ``WORLD_SIZE`` are taken from the
        cluster environment, while the process group itself is created with
        the ``global_rank`` / ``world_size`` arguments. The process group is
        only created if torch.distributed is not already initialised.

        Args:
            global_rank: rank passed to ``init_process_group``.
            world_size: world size passed to ``init_process_group``.
            is_slurm_managing_tasks: not used by this implementation.
        """
        os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())
        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
        os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size())
        torch_backend = "nccl" if self.trainer.on_gpu else "gloo"

        if not torch_distrib.is_initialized():
            log.info(
                f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}"
            )
            torch_distrib.init_process_group(
                torch_backend, rank=global_rank, world_size=world_size
            )


# TODO: allow user to compare with string even internally we shall use these Enum to prevent typos...
class BackendType(Enum):
    """Enumeration of backend identifiers, each mapped to its string value."""
    DP = 'dp'
    DDP = 'ddp'
    DDP2 = 'ddp2'
    DDP_SPAWN = 'ddp_spawn'
    # decouple distrib and device
    DDP_CPU = 'ddp_cpu'
    HOROVOD = 'horovod'
    # this is rather device
    TPU = 'tpu'
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`import os`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00			`import math`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00			`from enum import Enum`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00			`from typing import Any`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00
			`import torch`

ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00			`from pytorch_lightning.utilities import AMPType, rank_zero_warn`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00			`from pytorch_lightning.utilities.apply_func import move_data_to_device`
ref: inner train loop (intermediate step) 3/n (#3363) 2020-09-05 21:01:46 +00:00			`from pytorch_lightning.utilities.exceptions import MisconfigurationException`
add dist lib to enable syncing anything across devices (#3762) * add dist lib to enable syncing anything across devices 2020-10-01 05:21:38 +00:00			`from pytorch_lightning.utilities.parsing import AttributeDict`
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`import torch.distributed as torch_distrib`
			`from pytorch_lightning import _logger as log`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00
			`try:`
			`from apex import amp`
			`except ImportError:`
			`amp = None`

			`EPSILON = 1e-6`
			`EPSILON_FP16 = 1e-5`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00

			`class Accelerator(object):`

enable passing in custom accelerators (#4050) * enable custom accelerators * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward 2020-10-10 13:21:08 +00:00			`def __init__(self, trainer=None, cluster_environment=None):`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00			`self.trainer = trainer`
enable passing in custom accelerators (#4050) * enable custom accelerators * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward 2020-10-10 13:21:08 +00:00			`self.nickname = None`
ref: adding compute environments (2/n) (#3842) * ref: adding compute environments (2/n) * ref: adding compute environments (2/n) * ref: adding compute environments (2/n) * ref: adding compute environments (2/n) 2020-10-04 12:48:46 +00:00			`self.cluster_environment = cluster_environment`
add dist lib to enable syncing anything across devices (#3762) * add dist lib to enable syncing anything across devices 2020-10-01 05:21:38 +00:00			`self.dist = AttributeDict(rank=0, device=None)`
enable passing in custom accelerators (#4050) * enable custom accelerators * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward 2020-10-10 13:21:08 +00:00
			`if trainer is not None:`
			`self.train_loop = self.trainer.train`
			`self.validation_loop = self.trainer.run_evaluation`
			`self.test_loop = self.trainer.run_evaluation`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00
ddp backend refactor (#3207) 2020-08-26 23:10:24 +00:00			`def setup(self, model):`
ddp backend refactor (#3204) 2020-08-26 22:43:28 +00:00			`pass`

acceleartor fit 1 (#3200) 2020-08-26 18:20:38 +00:00			`def teardown(self):`
			`pass`

ref: organize args 4/n (#3456) 2020-09-11 01:58:47 +00:00			`def barrier(self, name: str = None):`
			`pass`

add dist lib to enable syncing anything across devices (#3762) * add dist lib to enable syncing anything across devices 2020-10-01 05:21:38 +00:00			`def broadcast(self, obj, src=0):`
			`return obj`

ref: organize args 4/n (#3456) 2020-09-11 01:58:47 +00:00			`def train_or_test(self):`
			`if self.trainer.testing:`
			`results = self.trainer.run_test()`
			`else:`
			`results = self.trainer.train()`
			`return results`

Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00			`def batch_to_device(self, batch: Any, device: torch.device):`
			`model = self.trainer.get_model()`
			`if model is not None:`
			`return model.transfer_batch_to_device(batch, device)`
			`return move_data_to_device(batch, device)`
ref: moved ___step_end hooks (#3130) * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks 2020-08-24 21:50:47 +00:00
			`def training_step_end(self, output):`
			`return output`

			`def test_step_end(self, output):`
			`return output`

			`def validation_step_end(self, output):`
			`return output`
refactored dataloader process hook (#3139) 2020-08-25 01:53:56 +00:00
			`def process_dataloader(self, dataloader):`
			`return dataloader`
ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00
ref: decouple apex second attemp part 2/n (#4054) * ref: decouple apex second attemp part 2/n * ref: decouple apex second attemp part 2/n 2020-10-10 14:24:20 +00:00			`def backward(self, closure_loss, optimizer, opt_idx, args, *kwargs):`
ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00			`if self.trainer.precision == 16:`
ref: decouple apex second attemp part 3/n (#4055) 2020-10-10 15:05:57 +00:00			`closure_loss = self.trainer.precision_connector.backend.backward(closure_loss, optimizer, args, *kwargs)`
			`else:`
			`# do backward pass`
			`closure_loss.backward(args, *kwargs)`

			`# once backward has been applied, release graph`
			`closure_loss = closure_loss.detach()`
ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00			`return closure_loss`
ref: inner train loop (intermediate step) 3/n (#3363) 2020-09-05 21:01:46 +00:00
			`def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):`
			`model_ref = self.trainer.get_model()`
			`is_lbfgs = isinstance(optimizer, torch.optim.LBFGS)`
			`native_amp = self.trainer.amp_backend == AMPType.NATIVE`

			`# native amp + lbfgs is a no go right now`
			`if native_amp and is_lbfgs:`
			`raise MisconfigurationException(`
			`'native PyTorch amp and lbfgs are not compatible.'`
			`' To request, please file a Github issue in PyTorch and tag @mcarilli')`

			`# model hook`
			`model_ref.optimizer_step(`
			`self.trainer.current_epoch,`
			`batch_idx,`
			`optimizer,`
			`opt_idx,`
			`lambda_closure,`
			`using_native_amp=native_amp,`
			`using_lbfgs=is_lbfgs`
			`)`

			`# scale when native amp`
			`if native_amp:`
			`self.trainer.scaler.update()`

			`def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):`
			`model_ref = self.trainer.get_model()`
			`model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00
			`def clip_gradients(self, optimizer):`

			`if self.trainer.amp_backend == AMPType.NATIVE:`
			`self.trainer.scaler.unscale_(optimizer)`

			`# apply clip gradients`
			`# TODO: separate TPU case from here`
			`self._clip_gradients(optimizer)`

			`def _clip_gradients(self, optimizer):`
			`# this code is a modification of torch.nn.utils.clip_grad_norm_`
			`# with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md`
			`if self.trainer.gradient_clip_val <= 0:`
			`return`

			`model = self.trainer.get_model()`
			`if self.trainer.amp_backend == AMPType.APEX:`
			`parameters = amp.master_params(optimizer)`
			`else:`
			`parameters = model.parameters()`

			`max_norm = float(self.trainer.gradient_clip_val)`
			`norm_type = float(2.0)`

			`if isinstance(parameters, torch.Tensor):`
			`parameters = [parameters]`
			`parameters = list(filter(lambda p: p.grad is not None, parameters))`

			`if norm_type == math.inf:`
			`total_norm = max(p.grad.data.abs().max() for p in parameters)`
			`else:`
			`device = parameters[0].device`
			`out = torch.empty(len(parameters), device=device)`
			`for i, p in enumerate(parameters):`
			`torch.norm(p.grad.data.to(device), norm_type, out=out[i])`
			`total_norm = torch.norm(out, norm_type)`

			`eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON`
			`clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)`
			`clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))`
			`for p in parameters:`
			`p.grad.data.mul_(clip_coef.to(p.grad.data.device))`
ref: inner train loop (intermediate step) 12/n (#3372) * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n 2020-09-06 21:50:47 +00:00
added tests for the training epoch end (#3967) 2020-10-08 02:27:36 +00:00			`def on_train_epoch_end(self, outputs):`
ref: inner train loop (intermediate step) 12/n (#3372) * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n 2020-09-06 21:50:47 +00:00			`pass`
ref: move specific accelerator code x/n (#3457) * ref: organize args x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n 2020-09-11 14:56:21 +00:00
ref: part a of #3733 (#3766) * ref: part a of #3733 * ref: part a of #3733 2020-10-01 12:15:23 +00:00			`def on_train_end(self):`
			`pass`

ref: move specific accelerator code x/n (#3457) * ref: organize args x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n 2020-09-11 14:56:21 +00:00			`def early_stopping_should_stop(self, pl_module):`
			`return self.trainer.should_stop`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00
			`def setup_optimizers(self, model):`
			`if self.trainer.testing is True:`
			`return`

			`optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)`
			`self.trainer.optimizers = optimizers`
			`self.trainer.lr_schedulers = lr_schedulers`
			`self.trainer.optimizer_frequencies = optimizer_frequencies`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`def init_ddp_connection(`
			`self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True`
			`) -> None:`
ref: enable custom clusters (1/n) (#4048) * enable cluster plugins * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices 2020-10-10 12:09:29 +00:00			`os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())`
			`os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())`
			`os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size())`
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`torch_backend = "nccl" if self.trainer.on_gpu else "gloo"`

			`if not torch.distributed.is_initialized():`
			`log.info(`
			`f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}"`
			`)`
			`torch_distrib.init_process_group(`
			`torch_backend, rank=global_rank, world_size=world_size`
			`)`

define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00
revert backend types (#3788) * revert backend types * todo * todo 2020-10-02 10:18:44 +00:00			`# TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos...`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00			`class BackendType(Enum):`
			`DP = 'dp'`
			`DDP = 'ddp'`
			`DDP2 = 'ddp2'`
			`DDP_SPAWN = 'ddp_spawn'`
revert backend types (#3788) * revert backend types * todo * todo 2020-10-02 10:18:44 +00:00			`# decuple distrib and device`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00			`DDP_CPU = 'ddp_cpu'`
			`HOROVOD = 'horovod'`
revert backend types (#3788) * revert backend types * todo * todo 2020-10-02 10:18:44 +00:00			`# this is rather device`
			`TPU = 'tpu'`