From d73c32ab514dc5395459eb68fe735708a80c945c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 7 Jul 2021 13:15:41 +0200
Subject: [PATCH] move `torch.cuda.set_device()` to enable collective calls
 earlier in setup (#8312)

---
 pytorch_lightning/accelerators/gpu.py                    | 9 ++++++---
 pytorch_lightning/plugins/training_type/ddp.py           | 2 --
 pytorch_lightning/plugins/training_type/deepspeed.py     | 2 --
 pytorch_lightning/plugins/training_type/fully_sharded.py | 1 -
 pytorch_lightning/plugins/training_type/single_device.py | 3 ---
 5 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 3348727a36..0592cffa1a 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -26,16 +26,19 @@ _log = logging.getLogger(__name__)
 class GPUAccelerator(Accelerator):
     """ Accelerator for GPU devices. """
 
+    def setup_environment(self) -> None:
+        super().setup_environment()
+        if "cuda" not in str(self.root_device):
+            raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
+        torch.cuda.set_device(self.root_device)
+
     def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None:
         """
         Raises:
             MisconfigurationException:
                 If the selected device is not GPU.
         """
-        if "cuda" not in str(self.root_device):
-            raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
         self.set_nvidia_flags(trainer.local_rank)
-        torch.cuda.set_device(self.root_device)
         return super().setup(trainer, model)
 
     def on_train_start(self) -> None:
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index a882390b78..8e4f4c0694 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -367,8 +367,6 @@ class DDPPlugin(ParallelPlugin):
             prepare_for_backward(self.model, closure_loss)
 
     def model_to_device(self):
-        if self.root_device.type == "cuda":
-            torch.cuda.set_device(self.root_device)
         self.model.to(self.root_device)
 
     def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor:
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 4d229e4bff..e704b662fd 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -339,8 +339,6 @@ class DeepSpeedPlugin(DDPPlugin):
         if not self._config_initialized:
             self._format_config()
             self._config_initialized = True
-        if self.on_gpu:
-            torch.cuda.set_device(self.root_device)
 
     def pre_dispatch(self):
         self.init_deepspeed()
diff --git a/pytorch_lightning/plugins/training_type/fully_sharded.py b/pytorch_lightning/plugins/training_type/fully_sharded.py
index 476df9be13..a02be35409 100644
--- a/pytorch_lightning/plugins/training_type/fully_sharded.py
+++ b/pytorch_lightning/plugins/training_type/fully_sharded.py
@@ -118,7 +118,6 @@ class DDPFullyShardedPlugin(DDPPlugin):
                 "You selected accelerator to be `ddp_fully_sharded`, but GPU is not available."
             )
         super().setup_distributed()
-        torch.cuda.set_device(self.root_device)
 
     @contextlib.contextmanager
     def model_sharded_context(self) -> Generator:
diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py
index d4a328902e..c1ef9028ce 100644
--- a/pytorch_lightning/plugins/training_type/single_device.py
+++ b/pytorch_lightning/plugins/training_type/single_device.py
@@ -61,9 +61,6 @@ class SingleDevicePlugin(TrainingTypePlugin):
         return self.device
 
     def model_to_device(self) -> None:
-        if self.on_gpu:
-            torch.cuda.set_device(self.root_device)
-
         self._model.to(self.root_device)
 
     def setup(self, model: torch.nn.Module) -> torch.nn.Module:
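
Note (not part of the patch): the hunks above only show where the `torch.cuda.set_device()` call moved. Below is a minimal standalone sketch of the behaviour the change enables, namely pinning each process's GPU before the first collective call. It does not use Lightning's internal API; it assumes a multi-GPU node, the NCCL backend, and a launch via torchrun (which sets LOCAL_RANK). The script name in the comment is hypothetical.

# Illustrative sketch: why the device must be selected before collectives.
# Run with e.g.:  torchrun --nproc_per_node=<num_gpus> sketch.py
import os

import torch
import torch.distributed as dist


def main() -> None:
    local_rank = int(os.environ["LOCAL_RANK"])

    # Mirrors what the patch does in `GPUAccelerator.setup_environment()`:
    # select this process's GPU *before* any collective runs. Without this,
    # every rank would default to GPU 0 and an early all_reduce/barrier
    # could hang or raise NCCL errors.
    torch.cuda.set_device(local_rank)

    dist.init_process_group(backend="nccl")

    # A collective issued "earlier in setup" now works, because each rank
    # already owns its own device.
    tensor = torch.ones(1, device=torch.device("cuda", local_rank))
    dist.all_reduce(tensor)
    print(f"rank {dist.get_rank()}: {tensor.item()}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()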