Remove `Strategy.on_gpu` (#11537)

ananthsub 2022-01-19 13:27:12 -08:00 committed by GitHub
parent a57cf2a0d3
commit f41d1e5e5e
15 changed files with 18 additions and 40 deletions

@@ -409,6 +409,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed `Strategy.optimizer_zero_grad` ([#11246](https://github.com/PyTorchLightning/pytorch-lightning/pull/11246))
- Removed `Strategy.on_gpu` ([#11537](https://github.com/PyTorchLightning/pytorch-lightning/pull/11537))
### Fixed
- Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/PyTorchLightning/pytorch-lightning/pull/11099))
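For callers that previously relied on `Strategy.on_gpu`, the replacement pattern used throughout this commit is to inspect the strategy's root device directly. A minimal sketch, assuming a `Trainer` whose strategy exposes `root_device`:

import torch
from pytorch_lightning import Trainer

trainer = Trainer()  # any accelerator/strategy

# Previously: trainer.strategy.on_gpu
# Now: check the root device type directly.
on_gpu = trainer.strategy.root_device.type == "cuda"

# The removed property also guarded on torch.cuda.is_available();
# keep that check if the CUDA device may not actually be usable.
on_gpu = on_gpu and torch.cuda.is_available()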

@@ -296,7 +296,7 @@ class DDPStrategy(ParallelStrategy):
# In 1.8, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
# Since 1.9, DDP communication hooks can work on all backends.
if _TORCH_GREATER_EQUAL_1_9 or (
_TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device
_TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device
):
register_ddp_comm_hook(
model=self.model,
@@ -514,7 +514,7 @@ class DDPStrategy(ParallelStrategy):
if self.sync_batchnorm:
self.model = _revert_sync_batchnorm(self.model)
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
log.detail(f"{self.__class__.__name__}: moving model to CPU")
self.lightning_module.cpu()
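As an aside on the first hunk above, the gate for registering DDP communication hooks reads more clearly as a single predicate. A sketch only, reusing the version flags and attributes shown in the diff, not the library's actual helper:

from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9

def _can_register_ddp_comm_hooks(strategy) -> bool:
    # Since PyTorch 1.9, DDP communication hooks work on all backends.
    if _TORCH_GREATER_EQUAL_1_9:
        return True
    # On PyTorch 1.8 they require the NCCL backend (a CUDA root device)
    # and single-process single-device mode.
    return (
        _TORCH_GREATER_EQUAL_1_8
        and strategy.root_device.type == "cuda"
        and strategy._is_single_process_single_device
    )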

@@ -200,7 +200,7 @@ class DDPSpawnStrategy(ParallelStrategy):
def _register_ddp_hooks(self) -> None:
# currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
# https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084
if _TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device:
if _TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device:
register_ddp_comm_hook(
model=self.model,
ddp_comm_state=self._ddp_comm_state,
@@ -378,7 +378,7 @@ class DDPSpawnStrategy(ParallelStrategy):
if self.sync_batchnorm:
self.model = _revert_sync_batchnorm(self.model)
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -153,7 +153,7 @@ class DataParallelStrategy(ParallelStrategy):
def teardown(self) -> None:
super().teardown()
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -126,7 +126,7 @@ class DDPFullyShardedStrategy(DDPStrategy):
return self._process_group
def setup_distributed(self) -> None:
if not self.on_gpu:
if not self.root_device.type == "cuda":
raise MisconfigurationException(
"You selected strategy to be `ddp_fully_sharded`, but GPU is not available."
)

@@ -125,13 +125,13 @@ class HorovodStrategy(ParallelStrategy):
return obj
def model_to_device(self):
if self.on_gpu:
if self.root_device.type == "cuda":
# this can potentially be removed after #8312. Not done due to lack of horovod testing
torch.cuda.set_device(self.root_device)
self.model.to(self.root_device)
def join(self):
if self.on_gpu:
if self.root_device.type == "cuda":
hvd.join(self.local_rank)
else:
hvd.join()
@@ -201,7 +201,7 @@ class HorovodStrategy(ParallelStrategy):
self._exit_stack = None
# Make sure all workers have finished training before returning to the user
self.join()
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -335,10 +335,6 @@ class IPUStrategy(ParallelStrategy):
optimizer = self.optimizers[0]
self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer)
@property
def on_gpu(self) -> bool:
return False
@property
def root_device(self) -> torch.device:
pass

@@ -49,10 +49,6 @@ class ParallelStrategy(Strategy, ABC):
def root_device(self) -> torch.device:
"""Return the root device."""
@property
def on_gpu(self) -> bool:
return self.root_device.type == "cuda" and torch.cuda.is_available()
@property
def on_tpu(self) -> bool:
return self.root_device.type == "xla" and _XLA_AVAILABLE
@@ -103,7 +99,7 @@ class ParallelStrategy(Strategy, ABC):
def torch_distributed_backend(self):
torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
if torch_backend is None:
torch_backend = "nccl" if self.on_gpu else "gloo"
torch_backend = "nccl" if self.root_device.type == "cuda" else "gloo"
return torch_backend
@staticmethod
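The backend selection above amounts to a small helper; a hypothetical sketch (the function name here is illustrative), under the assumption that the `PL_TORCH_DISTRIBUTED_BACKEND` environment variable keeps its override semantics:

import os

def _select_distributed_backend(root_device_type: str) -> str:
    # An explicit environment override always wins.
    override = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
    if override is not None:
        return override
    # NCCL for CUDA root devices, Gloo otherwise (e.g. CPU).
    return "nccl" if root_device_type == "cuda" else "gloo"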

@@ -45,10 +45,6 @@ class SingleDeviceStrategy(Strategy):
def on_tpu(self) -> bool:
return self.root_device.type == "xla" and _XLA_AVAILABLE
@property
def on_gpu(self) -> bool:
return self.root_device.type == "cuda" and torch.cuda.is_available()
def reduce(self, tensor: Any | torch.Tensor, *args: Any, **kwargs: Any) -> Any | torch.Tensor:
"""Reduces a tensor from several distributed processes to one aggregated tensor. As this plugin only
operates with a single device, the reduction is simply the identity.
@@ -90,7 +86,7 @@ class SingleDeviceStrategy(Strategy):
def teardown(self) -> None:
super().teardown()
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -228,11 +228,6 @@ class Strategy(ABC):
return model._apply_batch_transfer_handler(batch, device=device, dataloader_idx=dataloader_idx)
return move_data_to_device(batch, device)
@property
@abstractmethod
def on_gpu(self) -> bool:
"""Returns whether the current process is done on GPU."""
@property
@abstractmethod
def on_tpu(self) -> bool:

@@ -57,7 +57,6 @@ def test_ddp_cpu():
# assert training type plugin attributes for device setting
assert isinstance(trainer.strategy, DDPSpawnStrategy)
assert not trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("cpu")
@@ -73,7 +72,6 @@ def test_ddp_spawn_extra_parameters(tmpdir):
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, strategy="ddp_spawn")
assert isinstance(trainer.strategy, DDPSpawnStrategy)
assert trainer.strategy.on_gpu
assert trainer.strategy.root_device == torch.device("cuda:0")
val: float = 1.0

@@ -39,7 +39,6 @@ def test_ddp_with_2_gpus():
trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True)
# assert training type plugin attributes for device setting
assert isinstance(trainer.strategy, DDPStrategy)
assert trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
local_rank = trainer.strategy.local_rank
assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}")

@@ -23,10 +23,9 @@ from tests.helpers.runif import RunIf
def test_single_cpu():
"""Tests if on_gpu and on_tpu is set correctly for single CPU strategy."""
"""Tests if on_tpu is set correctly for single CPU strategy."""
trainer = Trainer()
assert isinstance(trainer.strategy, SingleDeviceStrategy)
assert not trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("cpu")
@@ -44,7 +43,6 @@ def test_single_gpu():
trainer = Trainer(gpus=1, fast_dev_run=True)
# assert training strategy attributes for device setting
assert isinstance(trainer.strategy, SingleDeviceStrategy)
assert trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("cuda:0")

@@ -96,7 +96,6 @@ def test_model_tpu_one_core():
trainer = Trainer(tpu_cores=1, fast_dev_run=True, strategy=TPUSpawnStrategy(debug=True))
# assert training strategy attributes for device setting
assert isinstance(trainer.strategy, TPUSpawnStrategy)
assert not trainer.strategy.on_gpu
assert trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("xla", index=1)
model = BoringModelTPU()

@@ -1544,13 +1544,10 @@ def test_index_batch_sampler_wrapper_with_iterable_dataset(dataset_cls, tmpdir):
@pytest.mark.skipif(_IS_WINDOWS and not _TORCH_GREATER_EQUAL_1_8, reason="torch.distributed support required")
@patch("torch.cuda.device_count", return_value=2)
@patch("torch.cuda.is_available", return_value=True)
@pytest.mark.parametrize("accelerator", ("cpu", "gpu"))
def test_spawn_predict_return_predictions(_, __, accelerator):
def test_spawn_predict_return_predictions(tmpdir):
"""Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins."""
model = BoringModel()
trainer = Trainer(accelerator=accelerator, strategy="ddp_spawn", devices=2, fast_dev_run=True)
trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True)
assert isinstance(trainer.strategy, DDPSpawnStrategy)
with pytest.raises(ProcessRaisedException, match="`return_predictions` should be set to `False`"):
trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True)