# lightning/pytorch_lightning/accelerators/accelerator.py

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
from enum import Enum
from typing import Any, Optional

import torch

from pytorch_lightning.utilities import AMPType, rank_zero_warn
from pytorch_lightning.utilities.apply_func import move_data_to_device
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.parsing import AttributeDict
import torch.distributed as torch_distrib
from pytorch_lightning import _logger as log

# apex is an optional dependency: ``amp`` is set to None when it cannot be imported
try:
    from apex import amp
except ImportError:
    amp = None

# added to the total gradient norm in ``Accelerator._clip_gradients`` so the clip
# coefficient never divides by zero; EPSILON_FP16 is used when trainer.precision == 16
EPSILON = 1e-6
EPSILON_FP16 = 1e-5


class Accelerator(object):
    """Base accelerator holding a trainer reference and default hook implementations.

    ``setup``, ``teardown``, ``barrier``, ``on_train_epoch_end`` and ``on_train_end`` do
    nothing here, while ``broadcast``, the ``*_step_end`` hooks and ``process_dataloader``
    return their input unchanged. Presumably backend-specific subclasses override them.
    """

    def __init__(self, trainer=None, cluster_environment=None):
        """Store the trainer and cluster environment and cache the trainer's loop entry points.

        Args:
            trainer: the trainer driving this accelerator; may be ``None``.
            cluster_environment: object exposing ``master_address()``, ``master_port()`` and
                ``world_size()``, which are read by ``init_ddp_connection``.
        """
        self.trainer = trainer
        self.nickname = None
        self.cluster_environment = cluster_environment
        self.dist = AttributeDict(rank=0, device=None)

        # the loop shortcuts are only created when a trainer was supplied
        if trainer is not None:
            self.train_loop = self.trainer.train
            self.validation_loop = self.trainer.run_evaluation
            self.test_loop = self.trainer.run_evaluation

    def setup(self, model):
        """Hook receiving the model; does nothing in the base class."""
        pass

    def teardown(self):
        """Hook with no arguments; does nothing in the base class."""
        pass

    def barrier(self, name: Optional[str] = None):
        """Synchronisation hook; does nothing in the base class."""
        pass

    def broadcast(self, obj, src=0):
        """Return ``obj`` unchanged; ``src`` is ignored in the base class."""
        return obj

    def train_or_test(self):
        """Run ``trainer.run_test()`` when ``trainer.testing`` is truthy, else ``trainer.train()``.

        Returns:
            Whatever the selected trainer method returns.
        """
        if self.trainer.testing:
            results = self.trainer.run_test()
        else:
            results = self.trainer.train()
        return results

    def batch_to_device(self, batch: Any, device: torch.device):
        """Move ``batch`` to ``device``.

        Uses the model's ``transfer_batch_to_device`` hook when the trainer has a model,
        otherwise falls back to ``move_data_to_device``.
        """
        model = self.trainer.get_model()
        if model is not None:
            return model.transfer_batch_to_device(batch, device)
        return move_data_to_device(batch, device)

    def training_step_end(self, output):
        """Return the training step ``output`` unchanged."""
        return output

    def test_step_end(self, output):
        """Return the test step ``output`` unchanged."""
        return output

    def validation_step_end(self, output):
        """Return the validation step ``output`` unchanged."""
        return output

    def process_dataloader(self, dataloader):
        """Return ``dataloader`` unchanged."""
        return dataloader

    def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
        """Run the backward pass for ``closure_loss`` and return the resulting loss.

        With ``trainer.precision == 16`` the call is delegated to
        ``trainer.precision_connector.backend.backward`` and its return value is returned
        as is. Otherwise the model's ``backward`` hook is called under automatic
        optimization, or ``closure_loss.backward(*args, **kwargs)`` in manual mode, and
        the detached loss is returned.
        """
        if self.trainer.precision == 16:
            closure_loss = self.trainer.precision_connector.backend.backward(
                closure_loss, optimizer, opt_idx, *args, **kwargs
            )
        else:
            # do backward pass
            if self.trainer.train_loop.automatic_optimization:
                model = self.trainer.get_model()
                model.backward(closure_loss, optimizer, opt_idx)
            else:
                closure_loss.backward(*args, **kwargs)

            # once backward has been applied, release graph
            closure_loss = closure_loss.detach()
        return closure_loss

    def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):
        """Call the model's ``optimizer_step`` hook and update the native AMP scaler if used.

        Raises:
            MisconfigurationException: if native AMP is combined with an LBFGS optimizer.
        """
        model_ref = self.trainer.get_model()
        is_lbfgs = isinstance(optimizer, torch.optim.LBFGS)
        native_amp = self.trainer.amp_backend == AMPType.NATIVE

        # native amp + lbfgs is a no go right now
        if native_amp and is_lbfgs:
            raise MisconfigurationException(
                'native PyTorch amp and lbfgs are not compatible.'
                ' To request, please file a Github issue in PyTorch and tag @mcarilli')

        # model hook
        model_ref.optimizer_step(
            self.trainer.current_epoch,
            batch_idx,
            optimizer,
            opt_idx,
            lambda_closure,
            using_native_amp=native_amp,
            using_lbfgs=is_lbfgs
        )

        # scale when native amp
        if native_amp:
            self.trainer.scaler.update()

    def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):
        """Forward to the model's ``optimizer_zero_grad`` hook with the current epoch."""
        model_ref = self.trainer.get_model()
        model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)

    def clip_gradients(self, optimizer, clip_val=None):
        """Clip gradients, unscaling them first when native AMP is active.

        Args:
            optimizer: optimizer whose gradients are unscaled / clipped.
            clip_val: optional override for ``trainer.gradient_clip_val``.
        """
        if self.trainer.amp_backend == AMPType.NATIVE:
            self.trainer.scaler.unscale_(optimizer)

        # apply clip gradients
        # TODO: separate TPU case from here
        self._clip_gradients(optimizer, clip_val)

    def _clip_gradients(self, optimizer, clip_val=None, norm_type=2.0):
        """Scale gradients in place so that their total norm does not exceed the clip value.

        Args:
            optimizer: only used with the APEX backend, to obtain the master parameters.
            clip_val: maximum norm; falls back to ``trainer.gradient_clip_val`` when ``None``.
                Values ``<= 0`` disable clipping.
            norm_type: order of the norm (``math.inf`` for the max norm). Defaults to 2.0,
                the value that was previously hard-coded.
        """
        # an explicit clip_val takes precedence over the trainer-wide setting
        grad_clip_val = self.trainer.gradient_clip_val if clip_val is None else clip_val
        grad_clip_val = float(grad_clip_val)

        # this code is a modification of torch.nn.utils.clip_grad_norm_
        # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
        if grad_clip_val <= 0:
            return

        if self.trainer.amp_backend == AMPType.APEX:
            parameters = amp.master_params(optimizer)
        else:
            parameters = self.trainer.get_model().parameters()

        if isinstance(parameters, torch.Tensor):
            parameters = [parameters]
        parameters = [p for p in parameters if p.grad is not None]

        # nothing to clip when no parameter holds a gradient; without this guard
        # ``parameters[0]`` below raises IndexError
        if not parameters:
            return

        max_norm = grad_clip_val
        norm_type = float(norm_type)
        # resolved before branching because both branches and the coefficient below need it
        device = parameters[0].device

        if norm_type == math.inf:
            total_norm = max(p.grad.data.abs().max().to(device) for p in parameters)
        else:
            out = torch.empty(len(parameters), device=device)
            for i, p in enumerate(parameters):
                torch.norm(p.grad.data.to(device), norm_type, out=out[i])
            total_norm = torch.norm(out, norm_type)

        # eps keeps the division finite when the total norm is zero
        eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON
        clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
        # cap the coefficient at 1 so gradients are only ever scaled down
        clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
        for p in parameters:
            p.grad.data.mul_(clip_coef.to(p.grad.data.device))

    def on_train_epoch_end(self, outputs):
        """Hook receiving the epoch ``outputs``; does nothing in the base class."""
        pass

    def on_train_end(self):
        """Hook with no arguments; does nothing in the base class."""
        pass

    def early_stopping_should_stop(self, pl_module):
        """Return ``trainer.should_stop``; ``pl_module`` is not used here."""
        return self.trainer.should_stop

    def setup_optimizers(self, model):
        """Initialise optimizers via the trainer and store them on it; skipped when testing.

        Sets ``trainer.optimizers``, ``trainer.lr_schedulers`` and
        ``trainer.optimizer_frequencies`` from ``trainer.init_optimizers(model)``.
        """
        if self.trainer.testing is True:
            return

        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

    def init_ddp_connection(
        self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True
    ) -> None:
        """Export the rendezvous environment variables and initialise the process group.

        Args:
            global_rank: rank passed to ``init_process_group``.
            world_size: world size passed to ``init_process_group``.
            is_slurm_managing_tasks: not used by this implementation.
        """
        os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())
        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
        os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size())
        torch_backend = "nccl" if self.trainer.on_gpu else "gloo"

        # only initialise once per process
        if not torch.distributed.is_initialized():
            log.info(
                f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}"
            )
            torch_distrib.init_process_group(
                torch_backend, rank=global_rank, world_size=world_size
            )

    def __getstate__(self):
        """Return the picklable state.

        Only ``trainer``, ``nickname``, ``cluster_environment`` and ``dist`` are included;
        the loop shortcuts created in ``__init__`` are not part of the state.
        """
        return {
            'trainer': self.trainer,
            'nickname': self.nickname,
            'cluster_environment': self.cluster_environment,
            'dist': self.dist
        }

    def __setstate__(self, d):
        """Restore the attributes produced by ``__getstate__`` from the dict ``d``."""
        self.trainer = d['trainer']
        self.nickname = d['nickname']
        self.cluster_environment = d['cluster_environment']
        self.dist = d['dist']


# TODO: allow users to compare against plain strings, even though internally we should
#  use this Enum to prevent typos...
class BackendType(Enum):
    """Backend identifiers; each member's value is the lowercase string naming the backend."""
    DP = 'dp'
    DDP = 'ddp'
    DDP2 = 'ddp2'
    DDP_SPAWN = 'ddp_spawn'
    # decouple distrib and device
    DDP_CPU = 'ddp_cpu'
    HOROVOD = 'horovod'
    # this is rather a device
    TPU = 'tpu'
notices (#4118) 2020-10-13 11:18:07 +00:00			`# Copyright The PyTorch Lightning team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`import os`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00			`import math`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00			`from enum import Enum`
Use `Optional` for arguments set to `None` by default (#4164) * Use `Optional` for variables set to `None` by default * Use `Optional` instead of `Union[None, ...]` for consistency 2020-10-15 21:02:50 +00:00			`from typing import Any, Optional`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00
			`import torch`

ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00			`from pytorch_lightning.utilities import AMPType, rank_zero_warn`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00			`from pytorch_lightning.utilities.apply_func import move_data_to_device`
ref: inner train loop (intermediate step) 3/n (#3363) 2020-09-05 21:01:46 +00:00			`from pytorch_lightning.utilities.exceptions import MisconfigurationException`
add dist lib to enable syncing anything across devices (#3762) * add dist lib to enable syncing anything across devices 2020-10-01 05:21:38 +00:00			`from pytorch_lightning.utilities.parsing import AttributeDict`
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`import torch.distributed as torch_distrib`
			`from pytorch_lightning import _logger as log`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00
			`try:`
			`from apex import amp`
			`except ImportError:`
			`amp = None`

			`EPSILON = 1e-6`
			`EPSILON_FP16 = 1e-5`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00

			`class Accelerator(object):`

enable passing in custom accelerators (#4050) * enable custom accelerators * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward 2020-10-10 13:21:08 +00:00			`def __init__(self, trainer=None, cluster_environment=None):`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00			`self.trainer = trainer`
enable passing in custom accelerators (#4050) * enable custom accelerators * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward 2020-10-10 13:21:08 +00:00			`self.nickname = None`
ref: adding compute environments (2/n) (#3842) * ref: adding compute environments (2/n) * ref: adding compute environments (2/n) * ref: adding compute environments (2/n) * ref: adding compute environments (2/n) 2020-10-04 12:48:46 +00:00			`self.cluster_environment = cluster_environment`
add dist lib to enable syncing anything across devices (#3762) * add dist lib to enable syncing anything across devices 2020-10-01 05:21:38 +00:00			`self.dist = AttributeDict(rank=0, device=None)`
enable passing in custom accelerators (#4050) * enable custom accelerators * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward * ref: finish decoupling apex, LM and backward 2020-10-10 13:21:08 +00:00
			`if trainer is not None:`
			`self.train_loop = self.trainer.train`
			`self.validation_loop = self.trainer.run_evaluation`
			`self.test_loop = self.trainer.run_evaluation`
Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00
ddp backend refactor (#3207) 2020-08-26 23:10:24 +00:00			`def setup(self, model):`
ddp backend refactor (#3204) 2020-08-26 22:43:28 +00:00			`pass`

acceleartor fit 1 (#3200) 2020-08-26 18:20:38 +00:00			`def teardown(self):`
			`pass`

Use `Optional` for arguments set to `None` by default (#4164) * Use `Optional` for variables set to `None` by default * Use `Optional` instead of `Union[None, ...]` for consistency 2020-10-15 21:02:50 +00:00			`def barrier(self, name: Optional[str] = None):`
ref: organize args 4/n (#3456) 2020-09-11 01:58:47 +00:00			`pass`

add dist lib to enable syncing anything across devices (#3762) * add dist lib to enable syncing anything across devices 2020-10-01 05:21:38 +00:00			`def broadcast(self, obj, src=0):`
			`return obj`

ref: organize args 4/n (#3456) 2020-09-11 01:58:47 +00:00			`def train_or_test(self):`
			`if self.trainer.testing:`
			`results = self.trainer.run_test()`
			`else:`
			`results = self.trainer.train()`
			`return results`

Refactor 1: moved tpu xxx_step to backend (#3118) * moved tpu training_step * refactored eval step * refactored eval step * refactored eval step 2020-08-24 11:02:06 +00:00			`def batch_to_device(self, batch: Any, device: torch.device):`
			`model = self.trainer.get_model()`
			`if model is not None:`
			`return model.transfer_batch_to_device(batch, device)`
			`return move_data_to_device(batch, device)`
ref: moved ___step_end hooks (#3130) * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks * moved eval hooks 2020-08-24 21:50:47 +00:00
			`def training_step_end(self, output):`
			`return output`

			`def test_step_end(self, output):`
			`return output`

			`def validation_step_end(self, output):`
			`return output`
refactored dataloader process hook (#3139) 2020-08-25 01:53:56 +00:00
			`def process_dataloader(self, dataloader):`
			`return dataloader`
ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00
ref: decouple apex second attemp part 10/n (#4064) * ref: decouple apex second attemp part 9/n * ref: decouple apex second attemp part 9/n * ref: decouple apex second attemp part 9/n 2020-10-11 00:05:05 +00:00			`def backward(self, closure_loss, optimizer, opt_idx, args, *kwargs):`
ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00			`if self.trainer.precision == 16:`
ref: decouple apex second attemp part 10/n (#4064) * ref: decouple apex second attemp part 9/n * ref: decouple apex second attemp part 9/n * ref: decouple apex second attemp part 9/n 2020-10-11 00:05:05 +00:00			`closure_loss = self.trainer.precision_connector.backend.backward(`
			`closure_loss, optimizer, opt_idx, args, *kwargs`
			`)`
ref: decouple apex second attemp part 3/n (#4055) 2020-10-10 15:05:57 +00:00			`else:`
			`# do backward pass`
ref: decouple apex second attemp part n/n (#4065) * ref: decouple apex second attemp part n/n * ref: decouple apex second attemp part n/n 2020-10-11 02:04:50 +00:00			`if self.trainer.train_loop.automatic_optimization:`
			`model = self.trainer.get_model()`
			`model.backward(closure_loss, optimizer, opt_idx)`
			`else:`
			`closure_loss.backward(args, *kwargs)`
ref: decouple apex second attemp part 3/n (#4055) 2020-10-10 15:05:57 +00:00
			`# once backward has been applied, release graph`
			`closure_loss = closure_loss.detach()`
ref: inner train loop (intermediate step) 1/n (#3359) 2020-09-05 12:55:22 +00:00			`return closure_loss`
ref: inner train loop (intermediate step) 3/n (#3363) 2020-09-05 21:01:46 +00:00
			`def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):`
			`model_ref = self.trainer.get_model()`
			`is_lbfgs = isinstance(optimizer, torch.optim.LBFGS)`
			`native_amp = self.trainer.amp_backend == AMPType.NATIVE`

			`# native amp + lbfgs is a no go right now`
			`if native_amp and is_lbfgs:`
			`raise MisconfigurationException(`
			`'native PyTorch amp and lbfgs are not compatible.'`
			`' To request, please file a Github issue in PyTorch and tag @mcarilli')`

			`# model hook`
			`model_ref.optimizer_step(`
			`self.trainer.current_epoch,`
			`batch_idx,`
			`optimizer,`
			`opt_idx,`
			`lambda_closure,`
			`using_native_amp=native_amp,`
			`using_lbfgs=is_lbfgs`
			`)`

			`# scale when native amp`
			`if native_amp:`
			`self.trainer.scaler.update()`

			`def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):`
			`model_ref = self.trainer.get_model()`
			`model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00
clean docs, enable grad clip in manual mode (#4078) * docs * docs 2020-10-11 17:12:35 +00:00			`def clip_gradients(self, optimizer, clip_val=None):`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00
			`if self.trainer.amp_backend == AMPType.NATIVE:`
			`self.trainer.scaler.unscale_(optimizer)`

			`# apply clip gradients`
			`# TODO: separate TPU case from here`
clean docs, enable grad clip in manual mode (#4078) * docs * docs 2020-10-11 17:12:35 +00:00			`self._clip_gradients(optimizer, clip_val)`

			`def _clip_gradients(self, optimizer, clip_val=None):`
			`# use the trainer's clip val if none passed`
			`grad_clip_val = self.trainer.gradient_clip_val`
			`if clip_val is not None:`
			`grad_clip_val = clip_val`
			`grad_clip_val = float(grad_clip_val)`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00
			`# this code is a modification of torch.nn.utils.clip_grad_norm_`
			`# with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md`
clean docs, enable grad clip in manual mode (#4078) * docs * docs 2020-10-11 17:12:35 +00:00			`if grad_clip_val <= 0:`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00			`return`

			`model = self.trainer.get_model()`
			`if self.trainer.amp_backend == AMPType.APEX:`
			`parameters = amp.master_params(optimizer)`
			`else:`
			`parameters = model.parameters()`

clean docs, enable grad clip in manual mode (#4078) * docs * docs 2020-10-11 17:12:35 +00:00			`max_norm = grad_clip_val`
ref: inner train loop (intermediate step) 5/n (#3365) 2020-09-05 22:27:28 +00:00			`norm_type = float(2.0)`

			`if isinstance(parameters, torch.Tensor):`
			`parameters = [parameters]`
			`parameters = list(filter(lambda p: p.grad is not None, parameters))`

			`if norm_type == math.inf:`
			`total_norm = max(p.grad.data.abs().max() for p in parameters)`
			`else:`
			`device = parameters[0].device`
			`out = torch.empty(len(parameters), device=device)`
			`for i, p in enumerate(parameters):`
			`torch.norm(p.grad.data.to(device), norm_type, out=out[i])`
			`total_norm = torch.norm(out, norm_type)`

			`eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON`
			`clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)`
			`clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))`
			`for p in parameters:`
			`p.grad.data.mul_(clip_coef.to(p.grad.data.device))`
ref: inner train loop (intermediate step) 12/n (#3372) * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n 2020-09-06 21:50:47 +00:00
added tests for the training epoch end (#3967) 2020-10-08 02:27:36 +00:00			`def on_train_epoch_end(self, outputs):`
ref: inner train loop (intermediate step) 12/n (#3372) * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n * ref: inner train loop (intermediate step) 12/n 2020-09-06 21:50:47 +00:00			`pass`
ref: move specific accelerator code x/n (#3457) * ref: organize args x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n 2020-09-11 14:56:21 +00:00
ref: part a of #3733 (#3766) * ref: part a of #3733 * ref: part a of #3733 2020-10-01 12:15:23 +00:00			`def on_train_end(self):`
			`pass`

ref: move specific accelerator code x/n (#3457) * ref: organize args x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n * ref: move specific accelerator code x/n 2020-09-11 14:56:21 +00:00			`def early_stopping_should_stop(self, pl_module):`
			`return self.trainer.should_stop`
disable optimizers setup during testing (#3059) * disable configure_optimizers during testing * minor changes * hvd and ddp * fix precision during testing * fix ddp * fix amp * fix cpu * update dp * simplify optimizers * add test * codefactor * ref optimizer setup * chlog * suggestions * isort * rebased with master 2020-09-28 23:09:04 +00:00
			`def setup_optimizers(self, model):`
			`if self.trainer.testing is True:`
			`return`

			`optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)`
			`self.trainer.optimizers = optimizers`
			`self.trainer.lr_schedulers = lr_schedulers`
			`self.trainer.optimizer_frequencies = optimizer_frequencies`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`def init_ddp_connection(`
			`self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True`
			`) -> None:`
ref: enable custom clusters (1/n) (#4048) * enable cluster plugins * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices * enable cluster plugins + test backend choices 2020-10-10 12:09:29 +00:00			`os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())`
			`os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())`
			`os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size())`
ref: callback system and init ddp (1/n) (#3836) * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp * refactored callback system and init ddp 2020-10-04 03:39:17 +00:00			`torch_backend = "nccl" if self.trainer.on_gpu else "gloo"`

			`if not torch.distributed.is_initialized():`
			`log.info(`
			`f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}"`
			`)`
			`torch_distrib.init_process_group(`
			`torch_backend, rank=global_rank, world_size=world_size`
			`)`

Added getstate/setstate method for torch.save serialization (#4127) * Added getstate/setstate method for torch.save serialization, added additional Optional Typing to results object * Added tests to ensure torch.save does not fail * Added flags to ensure compatible ddp cpu environment * Removed torch version check due to minimum already being 1.3, reduced epochs for speed * Moved tests to separate file * Update to accelerator, move to ddp_spawn to prevent hanging ddp 2020-10-13 20:47:23 +00:00			`def __getstate__(self):`
			`return {`
			`'trainer': self.trainer,`
			`'nickname': self.nickname,`
			`'cluster_environment': self.cluster_environment,`
			`'dist': self.dist`
			`}`

			`def __setstate__(self, d):`
			`self.trainer = d['trainer']`
			`self.nickname = d['nickname']`
			`self.cluster_environment = d['cluster_environment']`
			`self.dist = d['dist']`

define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00
revert backend types (#3788) * revert backend types * todo * todo 2020-10-02 10:18:44 +00:00			`# TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos...`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00			`class BackendType(Enum):`
			`DP = 'dp'`
			`DDP = 'ddp'`
			`DDP2 = 'ddp2'`
			`DDP_SPAWN = 'ddp_spawn'`
revert backend types (#3788) * revert backend types * todo * todo 2020-10-02 10:18:44 +00:00			`# decuple distrib and device`
define distributed as a type (#3740) * define type * miss * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * miss * warn Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-09-30 12:33:01 +00:00			`DDP_CPU = 'ddp_cpu'`
			`HOROVOD = 'horovod'`
revert backend types (#3788) * revert backend types * todo * todo 2020-10-02 10:18:44 +00:00			`# this is rather device`
			`TPU = 'tpu'`