From f41d1e5e5ebb7040a39d137695e818cada9a9234 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 19 Jan 2022 13:27:12 -0800 Subject: [PATCH] Remove `Strategy.on_gpu` (#11537) --- CHANGELOG.md | 4 ++++ pytorch_lightning/strategies/ddp.py | 4 ++-- pytorch_lightning/strategies/ddp_spawn.py | 4 ++-- pytorch_lightning/strategies/dp.py | 2 +- pytorch_lightning/strategies/fully_sharded.py | 2 +- pytorch_lightning/strategies/horovod.py | 6 +++--- pytorch_lightning/strategies/ipu.py | 4 ---- pytorch_lightning/strategies/parallel.py | 6 +----- pytorch_lightning/strategies/single_device.py | 6 +----- pytorch_lightning/strategies/strategy.py | 5 ----- tests/strategies/test_ddp_spawn_strategy.py | 2 -- tests/strategies/test_ddp_strategy.py | 1 - tests/strategies/test_single_device_strategy.py | 4 +--- tests/strategies/test_tpu_spawn.py | 1 - tests/trainer/test_trainer.py | 7 ++----- 15 files changed, 18 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b203856de1..8f76eb75be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -409,6 +409,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed `Strategy.optimizer_zero_grad` ([#11246](https://github.com/PyTorchLightning/pytorch-lightning/pull/11246)) + +- Removed `Strategy.on_gpu` ([#11537](https://github.com/PyTorchLightning/pytorch-lightning/pull/11537)) + + ### Fixed - Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/PyTorchLightning/pytorch-lightning/pull/11099)) diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 976938a726..4aa67baaed 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -296,7 +296,7 @@ class DDPStrategy(ParallelStrategy): # In 1.8, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode # Since 1.9, DDP communication hooks can work on all backends. if _TORCH_GREATER_EQUAL_1_9 or ( - _TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device + _TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device ): register_ddp_comm_hook( model=self.model, @@ -514,7 +514,7 @@ class DDPStrategy(ParallelStrategy): if self.sync_batchnorm: self.model = _revert_sync_batchnorm(self.model) - if self.on_gpu: + if self.root_device.type == "cuda": # GPU teardown log.detail(f"{self.__class__.__name__}: moving model to CPU") self.lightning_module.cpu() diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 4f95c31545..097992dc19 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -200,7 +200,7 @@ class DDPSpawnStrategy(ParallelStrategy): def _register_ddp_hooks(self) -> None: # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode # https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084 - if _TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device: + if _TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device: register_ddp_comm_hook( model=self.model, ddp_comm_state=self._ddp_comm_state, @@ -378,7 +378,7 @@ class DDPSpawnStrategy(ParallelStrategy): if self.sync_batchnorm: self.model = _revert_sync_batchnorm(self.model) - if self.on_gpu: + if self.root_device.type == "cuda": # GPU teardown self.lightning_module.cpu() # clean up memory diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py index 300360d085..71d0090e2c 100644 --- a/pytorch_lightning/strategies/dp.py +++ b/pytorch_lightning/strategies/dp.py @@ -153,7 +153,7 @@ class DataParallelStrategy(ParallelStrategy): def teardown(self) -> None: super().teardown() - if self.on_gpu: + if self.root_device.type == "cuda": # GPU teardown self.lightning_module.cpu() # clean up memory diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index f99fa3462d..a3c00a03b8 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -126,7 +126,7 @@ class DDPFullyShardedStrategy(DDPStrategy): return self._process_group def setup_distributed(self) -> None: - if not self.on_gpu: + if not self.root_device.type == "cuda": raise MisconfigurationException( "You selected strategy to be `ddp_fully_sharded`, but GPU is not available." ) diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index 19fa1ca3d2..8b68fdd156 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -125,13 +125,13 @@ class HorovodStrategy(ParallelStrategy): return obj def model_to_device(self): - if self.on_gpu: + if self.root_device.type == "cuda": # this can potentially be removed after #8312. Not done due to lack of horovod testing torch.cuda.set_device(self.root_device) self.model.to(self.root_device) def join(self): - if self.on_gpu: + if self.root_device.type == "cuda": hvd.join(self.local_rank) else: hvd.join() @@ -201,7 +201,7 @@ class HorovodStrategy(ParallelStrategy): self._exit_stack = None # Make sure all workers have finished training before returning to the user self.join() - if self.on_gpu: + if self.root_device.type == "cuda": # GPU teardown self.lightning_module.cpu() # clean up memory diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py index 7044082a78..22b575a590 100644 --- a/pytorch_lightning/strategies/ipu.py +++ b/pytorch_lightning/strategies/ipu.py @@ -335,10 +335,6 @@ class IPUStrategy(ParallelStrategy): optimizer = self.optimizers[0] self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer) - @property - def on_gpu(self) -> bool: - return False - @property def root_device(self) -> torch.device: pass diff --git a/pytorch_lightning/strategies/parallel.py b/pytorch_lightning/strategies/parallel.py index ac50cc028e..9c7d94a52f 100644 --- a/pytorch_lightning/strategies/parallel.py +++ b/pytorch_lightning/strategies/parallel.py @@ -49,10 +49,6 @@ class ParallelStrategy(Strategy, ABC): def root_device(self) -> torch.device: """Return the root device.""" - @property - def on_gpu(self) -> bool: - return self.root_device.type == "cuda" and torch.cuda.is_available() - @property def on_tpu(self) -> bool: return self.root_device.type == "xla" and _XLA_AVAILABLE @@ -103,7 +99,7 @@ class ParallelStrategy(Strategy, ABC): def torch_distributed_backend(self): torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND") if torch_backend is None: - torch_backend = "nccl" if self.on_gpu else "gloo" + torch_backend = "nccl" if self.root_device.type == "cuda" else "gloo" return torch_backend @staticmethod diff --git a/pytorch_lightning/strategies/single_device.py b/pytorch_lightning/strategies/single_device.py index bccbfa13fa..18bf619420 100644 --- a/pytorch_lightning/strategies/single_device.py +++ b/pytorch_lightning/strategies/single_device.py @@ -45,10 +45,6 @@ class SingleDeviceStrategy(Strategy): def on_tpu(self) -> bool: return self.root_device.type == "xla" and _XLA_AVAILABLE - @property - def on_gpu(self) -> bool: - return self.root_device.type == "cuda" and torch.cuda.is_available() - def reduce(self, tensor: Any | torch.Tensor, *args: Any, **kwargs: Any) -> Any | torch.Tensor: """Reduces a tensor from several distributed processes to one aggregated tensor. As this plugin only operates with a single device, the reduction is simply the identity. @@ -90,7 +86,7 @@ class SingleDeviceStrategy(Strategy): def teardown(self) -> None: super().teardown() - if self.on_gpu: + if self.root_device.type == "cuda": # GPU teardown self.lightning_module.cpu() # clean up memory diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index 5019890ad4..f966b926b8 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -228,11 +228,6 @@ class Strategy(ABC): return model._apply_batch_transfer_handler(batch, device=device, dataloader_idx=dataloader_idx) return move_data_to_device(batch, device) - @property - @abstractmethod - def on_gpu(self) -> bool: - """Returns whether the current process is done on GPU.""" - @property @abstractmethod def on_tpu(self) -> bool: diff --git a/tests/strategies/test_ddp_spawn_strategy.py b/tests/strategies/test_ddp_spawn_strategy.py index 25bb66661c..0fbe0d62cf 100644 --- a/tests/strategies/test_ddp_spawn_strategy.py +++ b/tests/strategies/test_ddp_spawn_strategy.py @@ -57,7 +57,6 @@ def test_ddp_cpu(): # assert training type plugin attributes for device setting assert isinstance(trainer.strategy, DDPSpawnStrategy) - assert not trainer.strategy.on_gpu assert not trainer.strategy.on_tpu assert trainer.strategy.root_device == torch.device("cpu") @@ -73,7 +72,6 @@ def test_ddp_spawn_extra_parameters(tmpdir): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, strategy="ddp_spawn") assert isinstance(trainer.strategy, DDPSpawnStrategy) - assert trainer.strategy.on_gpu assert trainer.strategy.root_device == torch.device("cuda:0") val: float = 1.0 diff --git a/tests/strategies/test_ddp_strategy.py b/tests/strategies/test_ddp_strategy.py index fa47380c7d..e56ad131bd 100644 --- a/tests/strategies/test_ddp_strategy.py +++ b/tests/strategies/test_ddp_strategy.py @@ -39,7 +39,6 @@ def test_ddp_with_2_gpus(): trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True) # assert training type plugin attributes for device setting assert isinstance(trainer.strategy, DDPStrategy) - assert trainer.strategy.on_gpu assert not trainer.strategy.on_tpu local_rank = trainer.strategy.local_rank assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}") diff --git a/tests/strategies/test_single_device_strategy.py b/tests/strategies/test_single_device_strategy.py index 56b776d402..0007d72129 100644 --- a/tests/strategies/test_single_device_strategy.py +++ b/tests/strategies/test_single_device_strategy.py @@ -23,10 +23,9 @@ from tests.helpers.runif import RunIf def test_single_cpu(): - """Tests if on_gpu and on_tpu is set correctly for single CPU strategy.""" + """Tests if on_tpu is set correctly for single CPU strategy.""" trainer = Trainer() assert isinstance(trainer.strategy, SingleDeviceStrategy) - assert not trainer.strategy.on_gpu assert not trainer.strategy.on_tpu assert trainer.strategy.root_device == torch.device("cpu") @@ -44,7 +43,6 @@ def test_single_gpu(): trainer = Trainer(gpus=1, fast_dev_run=True) # assert training strategy attributes for device setting assert isinstance(trainer.strategy, SingleDeviceStrategy) - assert trainer.strategy.on_gpu assert not trainer.strategy.on_tpu assert trainer.strategy.root_device == torch.device("cuda:0") diff --git a/tests/strategies/test_tpu_spawn.py b/tests/strategies/test_tpu_spawn.py index b62e175c23..26b59bad54 100644 --- a/tests/strategies/test_tpu_spawn.py +++ b/tests/strategies/test_tpu_spawn.py @@ -96,7 +96,6 @@ def test_model_tpu_one_core(): trainer = Trainer(tpu_cores=1, fast_dev_run=True, strategy=TPUSpawnStrategy(debug=True)) # assert training strategy attributes for device setting assert isinstance(trainer.strategy, TPUSpawnStrategy) - assert not trainer.strategy.on_gpu assert trainer.strategy.on_tpu assert trainer.strategy.root_device == torch.device("xla", index=1) model = BoringModelTPU() diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index a50570d0ae..97793cbe39 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1544,13 +1544,10 @@ def test_index_batch_sampler_wrapper_with_iterable_dataset(dataset_cls, tmpdir): @pytest.mark.skipif(_IS_WINDOWS and not _TORCH_GREATER_EQUAL_1_8, reason="torch.distributed support required") -@patch("torch.cuda.device_count", return_value=2) -@patch("torch.cuda.is_available", return_value=True) -@pytest.mark.parametrize("accelerator", ("cpu", "gpu")) -def test_spawn_predict_return_predictions(_, __, accelerator): +def test_spawn_predict_return_predictions(tmpdir): """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins.""" model = BoringModel() - trainer = Trainer(accelerator=accelerator, strategy="ddp_spawn", devices=2, fast_dev_run=True) + trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True) assert isinstance(trainer.strategy, DDPSpawnStrategy) with pytest.raises(ProcessRaisedException, match="`return_predictions` should be set to `False`"): trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True)