Remove `Strategy.on_gpu` (#11537)

ananthsub 2022-01-19 13:27:12 -08:00 committed by GitHub
parent a57cf2a0d3
commit f41d1e5e5e
15 changed files with 18 additions and 40 deletions

@@ -409,6 +409,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed `Strategy.optimizer_zero_grad` ([#11246](https://github.com/PyTorchLightning/pytorch-lightning/pull/11246))
- Removed `Strategy.on_gpu` ([#11537](https://github.com/PyTorchLightning/pytorch-lightning/pull/11537))
### Fixed
- Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/PyTorchLightning/pytorch-lightning/pull/11099))
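For callers that previously relied on `Strategy.on_gpu`, the replacement pattern used throughout this commit is to inspect the strategy's root device directly. A minimal sketch, assuming a `Trainer` whose strategy exposes `root_device`:

import torch
from pytorch_lightning import Trainer

trainer = Trainer()  # any accelerator/strategy

# Previously: trainer.strategy.on_gpu
# Now: check the root device type directly.
on_gpu = trainer.strategy.root_device.type == "cuda"

# The removed property also guarded on torch.cuda.is_available();
# keep that check if the CUDA device may not actually be usable.
on_gpu = on_gpu and torch.cuda.is_available()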

@@ -296,7 +296,7 @@ class DDPStrategy(ParallelStrategy):
# In 1.8, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
# Since 1.9, DDP communication hooks can work on all backends.
if _TORCH_GREATER_EQUAL_1_9 or (
_TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device
_TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device
):
register_ddp_comm_hook(
model=self.model,
@@ -514,7 +514,7 @@ class DDPStrategy(ParallelStrategy):
if self.sync_batchnorm:
self.model = _revert_sync_batchnorm(self.model)
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
log.detail(f"{self.__class__.__name__}: moving model to CPU")
self.lightning_module.cpu()
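As an aside on the first hunk above, the gate for registering DDP communication hooks reads more clearly as a single predicate. A sketch only, reusing the version flags and attributes shown in the diff, not the library's actual helper:

from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9

def _can_register_ddp_comm_hooks(strategy) -> bool:
    # Since PyTorch 1.9, DDP communication hooks work on all backends.
    if _TORCH_GREATER_EQUAL_1_9:
        return True
    # On PyTorch 1.8 they require the NCCL backend (a CUDA root device)
    # and single-process single-device mode.
    return (
        _TORCH_GREATER_EQUAL_1_8
        and strategy.root_device.type == "cuda"
        and strategy._is_single_process_single_device
    )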

@@ -200,7 +200,7 @@ class DDPSpawnStrategy(ParallelStrategy):
def _register_ddp_hooks(self) -> None:
# currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
# https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084
if _TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device:
if _TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device:
register_ddp_comm_hook(
model=self.model,
ddp_comm_state=self._ddp_comm_state,
@@ -378,7 +378,7 @@ class DDPSpawnStrategy(ParallelStrategy):
if self.sync_batchnorm:
self.model = _revert_sync_batchnorm(self.model)
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -153,7 +153,7 @@ class DataParallelStrategy(ParallelStrategy):
def teardown(self) -> None:
super().teardown()
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -126,7 +126,7 @@ class DDPFullyShardedStrategy(DDPStrategy):
return self._process_group
def setup_distributed(self) -> None:
if not self.on_gpu:
if not self.root_device.type == "cuda":
raise MisconfigurationException(
"You selected strategy to be `ddp_fully_sharded`, but GPU is not available."
)

@@ -125,13 +125,13 @@ class HorovodStrategy(ParallelStrategy):
return obj
def model_to_device(self):
if self.on_gpu:
if self.root_device.type == "cuda":
# this can potentially be removed after #8312. Not done due to lack of horovod testing
torch.cuda.set_device(self.root_device)
self.model.to(self.root_device)
def join(self):
if self.on_gpu:
if self.root_device.type == "cuda":
hvd.join(self.local_rank)
else:
hvd.join()
@@ -201,7 +201,7 @@ class HorovodStrategy(ParallelStrategy):
self._exit_stack = None
# Make sure all workers have finished training before returning to the user
self.join()
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -335,10 +335,6 @@ class IPUStrategy(ParallelStrategy):
optimizer = self.optimizers[0]
self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer)
@property
def on_gpu(self) -> bool:
return False
@property
def root_device(self) -> torch.device:
pass

@@ -49,10 +49,6 @@ class ParallelStrategy(Strategy, ABC):
def root_device(self) -> torch.device:
"""Return the root device."""
@property
def on_gpu(self) -> bool:
return self.root_device.type == "cuda" and torch.cuda.is_available()
@property
def on_tpu(self) -> bool:
return self.root_device.type == "xla" and _XLA_AVAILABLE
@@ -103,7 +99,7 @@ class ParallelStrategy(Strategy, ABC):
def torch_distributed_backend(self):
torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
if torch_backend is None:
torch_backend = "nccl" if self.on_gpu else "gloo"
torch_backend = "nccl" if self.root_device.type == "cuda" else "gloo"
return torch_backend
@staticmethod
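The backend selection above amounts to a small helper; a hypothetical sketch (the function name here is illustrative), under the assumption that the `PL_TORCH_DISTRIBUTED_BACKEND` environment variable keeps its override semantics:

import os

def _select_distributed_backend(root_device_type: str) -> str:
    # An explicit environment override always wins.
    override = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
    if override is not None:
        return override
    # NCCL for CUDA root devices, Gloo otherwise (e.g. CPU).
    return "nccl" if root_device_type == "cuda" else "gloo"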

@@ -45,10 +45,6 @@ class SingleDeviceStrategy(Strategy):
def on_tpu(self) -> bool:
return self.root_device.type == "xla" and _XLA_AVAILABLE
@property
def on_gpu(self) -> bool:
return self.root_device.type == "cuda" and torch.cuda.is_available()
def reduce(self, tensor: Any | torch.Tensor, *args: Any, **kwargs: Any) -> Any | torch.Tensor:
"""Reduces a tensor from several distributed processes to one aggregated tensor. As this plugin only
operates with a single device, the reduction is simply the identity.
@@ -90,7 +86,7 @@ class SingleDeviceStrategy(Strategy):
def teardown(self) -> None:
super().teardown()
if self.on_gpu:
if self.root_device.type == "cuda":
# GPU teardown
self.lightning_module.cpu()
# clean up memory

@@ -228,11 +228,6 @@ class Strategy(ABC):
return model._apply_batch_transfer_handler(batch, device=device, dataloader_idx=dataloader_idx)
return move_data_to_device(batch, device)
@property
@abstractmethod
def on_gpu(self) -> bool:
"""Returns whether the current process is done on GPU."""
@property
@abstractmethod
def on_tpu(self) -> bool:

@@ -57,7 +57,6 @@ def test_ddp_cpu():
# assert training type plugin attributes for device setting
assert isinstance(trainer.strategy, DDPSpawnStrategy)
assert not trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("cpu")
@@ -73,7 +72,6 @@ def test_ddp_spawn_extra_parameters(tmpdir):
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, strategy="ddp_spawn")
assert isinstance(trainer.strategy, DDPSpawnStrategy)
assert trainer.strategy.on_gpu
assert trainer.strategy.root_device == torch.device("cuda:0")
val: float = 1.0

@@ -39,7 +39,6 @@ def test_ddp_with_2_gpus():
trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True)
# assert training type plugin attributes for device setting
assert isinstance(trainer.strategy, DDPStrategy)
assert trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
local_rank = trainer.strategy.local_rank
assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}")

@@ -23,10 +23,9 @@ from tests.helpers.runif import RunIf
def test_single_cpu():
"""Tests if on_gpu and on_tpu is set correctly for single CPU strategy."""
"""Tests if on_tpu is set correctly for single CPU strategy."""
trainer = Trainer()
assert isinstance(trainer.strategy, SingleDeviceStrategy)
assert not trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("cpu")
@@ -44,7 +43,6 @@ def test_single_gpu():
trainer = Trainer(gpus=1, fast_dev_run=True)
# assert training strategy attributes for device setting
assert isinstance(trainer.strategy, SingleDeviceStrategy)
assert trainer.strategy.on_gpu
assert not trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("cuda:0")

@@ -96,7 +96,6 @@ def test_model_tpu_one_core():
trainer = Trainer(tpu_cores=1, fast_dev_run=True, strategy=TPUSpawnStrategy(debug=True))
# assert training strategy attributes for device setting
assert isinstance(trainer.strategy, TPUSpawnStrategy)
assert not trainer.strategy.on_gpu
assert trainer.strategy.on_tpu
assert trainer.strategy.root_device == torch.device("xla", index=1)
model = BoringModelTPU()

@@ -1544,13 +1544,10 @@ def test_index_batch_sampler_wrapper_with_iterable_dataset(dataset_cls, tmpdir):
@pytest.mark.skipif(_IS_WINDOWS and not _TORCH_GREATER_EQUAL_1_8, reason="torch.distributed support required")
@patch("torch.cuda.device_count", return_value=2)
@patch("torch.cuda.is_available", return_value=True)
@pytest.mark.parametrize("accelerator", ("cpu", "gpu"))
def test_spawn_predict_return_predictions(_, __, accelerator):
def test_spawn_predict_return_predictions(tmpdir):
"""Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins."""
model = BoringModel()
trainer = Trainer(accelerator=accelerator, strategy="ddp_spawn", devices=2, fast_dev_run=True)
trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True)
assert isinstance(trainer.strategy, DDPSpawnStrategy)
with pytest.raises(ProcessRaisedException, match="`return_predictions` should be set to `False`"):
trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True)