From 2c3d43dcb503cf2f8fc8a4b30d4ab6f986f348f1 Mon Sep 17 00:00:00 2001
From: chaton
Date: Thu, 10 Dec 2020 15:24:44 +0100
Subject: [PATCH] Initialize trainer with None in DDPAccelerator (#4915)

* Initialize trainer with None

* add typing to all accelerators

* resolve imports

* update

* add typing

* removed typo

* update

* Fix formatting and imports in accelerator

Co-authored-by: maxjeblick
Co-authored-by: Sean Naren
Co-authored-by: SeanNaren
Co-authored-by: Roger Shieh
---
 pytorch_lightning/accelerators/accelerator.py      |  8 ++++++--
 pytorch_lightning/accelerators/cpu_accelerator.py  |  3 ++-
 .../accelerators/ddp2_accelerator.py               |  9 +++++++--
 pytorch_lightning/accelerators/ddp_accelerator.py  | 15 ++++++++++++---
 .../accelerators/ddp_cpu_hpc_accelerator.py        | 12 ++++++++++--
 .../accelerators/ddp_cpu_spawn_accelerator.py      | 15 ++++++++++-----
 .../accelerators/ddp_hpc_accelerator.py            |  9 +++++++--
 .../accelerators/ddp_spawn_accelerator.py          | 15 ++++++++++-----
 pytorch_lightning/accelerators/dp_accelerator.py   |  6 ++++--
 pytorch_lightning/accelerators/gpu_accelerator.py  |  4 +++-
 .../accelerators/horovod_accelerator.py            |  4 +++-
 pytorch_lightning/accelerators/tpu_accelerator.py  |  3 ++-
 12 files changed, 76 insertions(+), 27 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 01b4d1abe5..77f30219ba 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -12,14 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import contextmanager
-from enum import Enum
 from typing import Any, Optional, Union
 
 import torch
 import torch.distributed as torch_distrib
 from torch.optim import Optimizer
 
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
 from pytorch_lightning.utilities.apply_func import move_data_to_device
 from pytorch_lightning.utilities.parsing import AttributeDict
@@ -33,7 +34,10 @@ else:
 
 class Accelerator(object):
 
-    def __init__(self, trainer=None, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer: Optional = None,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         self.trainer = trainer
         self.nickname = None
         self.cluster_environment = cluster_environment
diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py
index 2b290c5226..25302cabbc 100644
--- a/pytorch_lightning/accelerators/cpu_accelerator.py
+++ b/pytorch_lightning/accelerators/cpu_accelerator.py
@@ -16,13 +16,14 @@ from typing import Any, Optional, Union, Callable
 import torch
 
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.utilities import AMPType, rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class CPUAccelerator(Accelerator):
 
-    def __init__(self, trainer, cluster_environment=None):
+    def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None):
         """
         Runs training on CPU
 
diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py
index 2e3e39a5cd..2e864029f8 100644
--- a/pytorch_lightning/accelerators/ddp2_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp2_accelerator.py
@@ -20,12 +20,14 @@ from torch.nn.parallel import DistributedDataParallel
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.core.step_result import Result
 from pytorch_lightning.distributed.dist import LightningDistributed
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
 from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType
-from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available, all_gather_ddp_if_available
+from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available
 
 if HYDRA_AVAILABLE:
     from hydra.core.hydra_config import HydraConfig
@@ -34,7 +36,10 @@ if HYDRA_AVAILABLE:
 
 class DDP2Accelerator(Accelerator):
 
-    def __init__(self, trainer, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         """
         Runs training using DDP2 strategy on a cluster
 
diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py
index 9789247ac2..da9eb2d3ea 100644
--- a/pytorch_lightning/accelerators/ddp_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_accelerator.py
@@ -25,12 +25,18 @@ from torch.nn.parallel import DistributedDataParallel
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.distributed.dist import LightningDistributed
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
 from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType
-from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available
-from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import (
+    all_gather_ddp_if_available,
+    find_free_network_port,
+    rank_zero_only,
+    sync_ddp_if_available,
+)
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import seed_everything
 
@@ -41,7 +47,10 @@ if HYDRA_AVAILABLE:
 
 class DDPAccelerator(Accelerator):
 
-    def __init__(self, trainer=None, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer: Optional = None,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         """
         Runs training using DDP strategy on a single machine (manually, not via cluster start)
 
diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
index ed94a8fdae..a0545a4604 100644
--- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
@@ -11,17 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
+from typing import Optional
+
+from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator
+from pytorch_lightning.cluster_environments import ClusterEnvironment
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.utilities import HYDRA_AVAILABLE
 
 if HYDRA_AVAILABLE:
-    from hydra.utils import to_absolute_path, get_original_cwd
     from hydra.core.hydra_config import HydraConfig
+    from hydra.utils import get_original_cwd, to_absolute_path
 
 
 class DDPCPUHPCAccelerator(DDPHPCAccelerator):
 
-    def __init__(self, trainer, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         """
         Runs training using DDP (with CPUs) strategy on a cluster
 
diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py
index f109f555f5..91a6dee484 100644
--- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py
@@ -16,22 +16,23 @@ from typing import Any, List, Optional, Union
 
 import torch
 import torch.distributed as torch_distrib
-import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.distributed.dist import LightningDistributed
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
 from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType
 from pytorch_lightning.utilities.distributed import (
+    all_gather_ddp_if_available,
     find_free_network_port,
     rank_zero_only,
     rank_zero_warn,
     sync_ddp_if_available,
-    all_gather_ddp_if_available,
 )
 
 if HYDRA_AVAILABLE:
@@ -41,7 +42,11 @@ if HYDRA_AVAILABLE:
 
 class DDPCPUSpawnAccelerator(Accelerator):
 
-    def __init__(self, trainer, nprocs, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer,
+                 nprocs: int,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         """
         Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn
 
@@ -197,8 +202,8 @@ class DDPCPUSpawnAccelerator(Accelerator):
 
     def early_stopping_should_stop(self, pl_module):
         stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device)
-        dist.all_reduce(stop, op=dist.reduce_op.SUM)
-        dist.barrier()
+        torch_distrib.all_reduce(stop, op=torch_distrib.reduce_op.SUM)
+        torch_distrib.barrier()
         should_stop = stop == self.trainer.world_size
         return should_stop
 
diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
index 5f09189e8b..ec4c087998 100644
--- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
@@ -21,11 +21,13 @@ from torch.nn.parallel import DistributedDataParallel
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.distributed.dist import LightningDistributed
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
 from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType
-from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available, all_gather_ddp_if_available
+from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available
 
 if HYDRA_AVAILABLE:
     from hydra.core.hydra_config import HydraConfig
@@ -34,7 +36,10 @@ if HYDRA_AVAILABLE:
 
 class DDPHPCAccelerator(Accelerator):
 
-    def __init__(self, trainer, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         """
         Runs training using DDP on an HPC cluster
 
diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py
index d768c3b6fb..a49e17fc0b 100644
--- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -17,24 +17,25 @@ from typing import Any, List, Optional, Union
 
 import torch
 import torch.distributed as torch_distrib
-import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.distributed import LightningDistributed
+from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
 from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.distributed import (
+    all_gather_ddp_if_available,
     find_free_network_port,
     rank_zero_only,
     rank_zero_warn,
     sync_ddp_if_available,
-    all_gather_ddp_if_available,
 )
 from pytorch_lightning.utilities.seed import seed_everything
 
@@ -45,7 +46,11 @@ if HYDRA_AVAILABLE:
 
 class DDPSpawnAccelerator(Accelerator):
 
-    def __init__(self, trainer, nprocs, cluster_environment=None, ddp_plugin=None):
+    def __init__(self,
+                 trainer,
+                 nprocs: int,
+                 cluster_environment: Optional[ClusterEnvironment] = None,
+                 ddp_plugin: Optional[DDPPlugin] = None):
         """
         Runs training using DDP using mp.spawn via manual launch (not cluster launch)
 
@@ -226,8 +231,8 @@ class DDPSpawnAccelerator(Accelerator):
 
    def early_stopping_should_stop(self, pl_module):
         stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device)
-        dist.all_reduce(stop, op=dist.reduce_op.SUM)
-        dist.barrier()
+        torch_distrib.all_reduce(stop, op=torch_distrib.reduce_op.SUM)
+        torch_distrib.barrier()
         should_stop = stop == self.trainer.world_size
         return should_stop
 
diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py
index a3563e6a3a..834a920b50 100644
--- a/pytorch_lightning/accelerators/dp_accelerator.py
+++ b/pytorch_lightning/accelerators/dp_accelerator.py
@@ -11,12 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Union
+from typing import Optional, Union
 
 import torch
 from torch import optim
 
+from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.core.step_result import Result
 from pytorch_lightning.distributed import LightningDistributed
@@ -27,7 +29,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 class DataParallelAccelerator(Accelerator):
 
-    def __init__(self, trainer, cluster_environment=None):
+    def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None):
         """
         Runs training using DP via manual start (not HPC cluster)
 
diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py
index f4d31213c7..1310777e0d 100644
--- a/pytorch_lightning/accelerators/gpu_accelerator.py
+++ b/pytorch_lightning/accelerators/gpu_accelerator.py
@@ -15,7 +15,9 @@ from typing import Any, Callable, Optional, Union
 
 import torch
 
+from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.distributed.dist import LightningDistributed
 from pytorch_lightning.utilities import AMPType
 
@@ -23,7 +25,7 @@ from pytorch_lightning.utilities import AMPType
 class GPUAccelerator(Accelerator):
     amp_backend: AMPType
 
-    def __init__(self, trainer, cluster_environment=None):
+    def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None):
         """
         Runs training using a single GPU
 
diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py
index 6582e3b376..5895025673 100644
--- a/pytorch_lightning/accelerators/horovod_accelerator.py
+++ b/pytorch_lightning/accelerators/horovod_accelerator.py
@@ -17,7 +17,9 @@ from typing import Any, Optional, Union, Callable
 import torch
 from torch.optim.lr_scheduler import _LRScheduler
 
+from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.utilities import HOROVOD_AVAILABLE, AMPType
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
@@ -28,7 +30,7 @@ if HOROVOD_AVAILABLE:
 class HorovodAccelerator(Accelerator):
     amp_backend: AMPType
 
-    def __init__(self, trainer, cluster_environment=None):
+    def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None):
         """
         Runs training using horovod
 
diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py
index 74fd201df8..9d1eec5594 100644
--- a/pytorch_lightning/accelerators/tpu_accelerator.py
+++ b/pytorch_lightning/accelerators/tpu_accelerator.py
@@ -22,6 +22,7 @@ from torch.optim import Optimizer
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
+from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.core.optimizer import LightningOptimizer
 from pytorch_lightning.utilities import (
@@ -43,7 +44,7 @@ if TPU_AVAILABLE:
 
 class TPUAccelerator(Accelerator):
 
-    def __init__(self, trainer, cluster_environment=None):
+    def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None):
         """
         Runs training using TPUs (colab, single machine or pod)
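
Below is a minimal usage sketch of the constructor signature this patch enables. It is an
illustration only, not part of the commit: it assumes the 1.1-era Trainer accepts a pre-built
Accelerator through its `accelerator` argument and wires itself in as `accelerator.trainer`, and
TorchElasticEnvironment is used purely as an example ClusterEnvironment implementation.

    # Illustrative sketch -- not part of the patch above. Assumes the Trainer of this
    # era accepts a pre-built Accelerator via its `accelerator` argument and attaches
    # itself as `accelerator.trainer`; TorchElasticEnvironment is just one example
    # ClusterEnvironment implementation shipped with Lightning at the time.
    from pytorch_lightning import Trainer
    from pytorch_lightning.accelerators.ddp_accelerator import DDPAccelerator
    from pytorch_lightning.cluster_environments import TorchElasticEnvironment
    from pytorch_lightning.plugins.ddp_plugin import DDPPlugin

    # With this change the accelerator can be constructed before any Trainer exists:
    # `trainer` defaults to None and the remaining arguments are typed Optionals.
    accelerator = DDPAccelerator(
        trainer=None,
        cluster_environment=TorchElasticEnvironment(),
        ddp_plugin=DDPPlugin(),
    )

    # The Trainer is created afterwards; training then proceeds as usual with
    # trainer.fit(model) on a LightningModule defined elsewhere.
    trainer = Trainer(accelerator=accelerator, gpus=2)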