Remove `Strategy.on_gpu` (#11537)
parent a57cf2a0d3 · commit f41d1e5e5e
@@ -409,6 +409,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed `Strategy.optimizer_zero_grad` ([#11246](https://github.com/PyTorchLightning/pytorch-lightning/pull/11246))
 
+- Removed `Strategy.on_gpu` ([#11537](https://github.com/PyTorchLightning/pytorch-lightning/pull/11537))
+
 ### Fixed
 
 - Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/PyTorchLightning/pytorch-lightning/pull/11099))
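For downstream code that previously branched on the removed `Strategy.on_gpu` property, the equivalent check after this commit goes through the strategy's `root_device`, exactly as the call sites below are rewritten. A minimal migration sketch; the `describe_device` helper and the Trainer arguments are illustrative, not part of this commit:

```python
import torch
from pytorch_lightning import Trainer

def describe_device(strategy) -> str:
    # before this commit: `if strategy.on_gpu: ...`
    if strategy.root_device.type == "cuda":
        return f"running on GPU {strategy.root_device.index}"
    return f"running on {strategy.root_device.type}"

trainer = Trainer(accelerator="cpu", devices=1)
print(describe_device(trainer.strategy))  # "running on cpu"
```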
@@ -296,7 +296,7 @@ class DDPStrategy(ParallelStrategy):
         # In 1.8, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
         # Since 1.9, DDP communication hooks can work on all backends.
         if _TORCH_GREATER_EQUAL_1_9 or (
-            _TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device
+            _TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device
         ):
             register_ddp_comm_hook(
                 model=self.model,
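The version gate above can be read on its own: communication hooks are registered unconditionally on torch >= 1.9, and on torch 1.8 only when the root device is CUDA and the strategy runs in single-process-single-device mode. A standalone sketch of that predicate, with illustrative names and plain booleans standing in for the module-level version flags:

```python
def should_register_ddp_comm_hook(
    torch_ge_1_9: bool, torch_ge_1_8: bool, device_type: str, spsd: bool
) -> bool:
    # mirrors the condition in the hunk above after the on_gpu -> root_device change
    return torch_ge_1_9 or (torch_ge_1_8 and device_type == "cuda" and spsd)

assert should_register_ddp_comm_hook(True, True, "cpu", False)       # torch >= 1.9: always register
assert not should_register_ddp_comm_hook(False, True, "cpu", True)   # torch 1.8 on CPU: skip
assert should_register_ddp_comm_hook(False, True, "cuda", True)      # torch 1.8, CUDA, SPSD: register
```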
@@ -514,7 +514,7 @@ class DDPStrategy(ParallelStrategy):
         if self.sync_batchnorm:
             self.model = _revert_sync_batchnorm(self.model)
 
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             # GPU teardown
             log.detail(f"{self.__class__.__name__}: moving model to CPU")
             self.lightning_module.cpu()
@@ -200,7 +200,7 @@ class DDPSpawnStrategy(ParallelStrategy):
     def _register_ddp_hooks(self) -> None:
         # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
         # https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084
-        if _TORCH_GREATER_EQUAL_1_8 and self.on_gpu and self._is_single_process_single_device:
+        if _TORCH_GREATER_EQUAL_1_8 and self.root_device.type == "cuda" and self._is_single_process_single_device:
             register_ddp_comm_hook(
                 model=self.model,
                 ddp_comm_state=self._ddp_comm_state,
@@ -378,7 +378,7 @@ class DDPSpawnStrategy(ParallelStrategy):
         if self.sync_batchnorm:
             self.model = _revert_sync_batchnorm(self.model)
 
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             # GPU teardown
             self.lightning_module.cpu()
             # clean up memory
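The same teardown pattern recurs across the strategies touched in this commit: when the root device is CUDA, the LightningModule is moved back to the CPU so device memory can be reclaimed. A minimal sketch of that pattern; the explicit `torch.cuda.empty_cache()` call is an assumption about what follows the truncated `# clean up memory` comment, not something visible in this diff:

```python
import torch

def teardown_module(module: torch.nn.Module, root_device: torch.device) -> None:
    # Sketch of the per-strategy teardown shown above.
    if root_device.type == "cuda":
        # GPU teardown: drop parameters/buffers held on the device
        module.cpu()
        # clean up memory (assumed to correspond to an empty_cache call in the full file)
        torch.cuda.empty_cache()
```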
@@ -153,7 +153,7 @@ class DataParallelStrategy(ParallelStrategy):
 
     def teardown(self) -> None:
         super().teardown()
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             # GPU teardown
             self.lightning_module.cpu()
             # clean up memory
@@ -126,7 +126,7 @@ class DDPFullyShardedStrategy(DDPStrategy):
         return self._process_group
 
     def setup_distributed(self) -> None:
-        if not self.on_gpu:
+        if not self.root_device.type == "cuda":
             raise MisconfigurationException(
                 "You selected strategy to be `ddp_fully_sharded`, but GPU is not available."
             )
@@ -125,13 +125,13 @@ class HorovodStrategy(ParallelStrategy):
         return obj
 
     def model_to_device(self):
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             # this can potentially be removed after #8312. Not done due to lack of horovod testing
             torch.cuda.set_device(self.root_device)
         self.model.to(self.root_device)
 
     def join(self):
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             hvd.join(self.local_rank)
         else:
             hvd.join()
@@ -201,7 +201,7 @@ class HorovodStrategy(ParallelStrategy):
         self._exit_stack = None
         # Make sure all workers have finished training before returning to the user
         self.join()
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             # GPU teardown
             self.lightning_module.cpu()
             # clean up memory
@@ -335,10 +335,6 @@ class IPUStrategy(ParallelStrategy):
         optimizer = self.optimizers[0]
         self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer)
 
-    @property
-    def on_gpu(self) -> bool:
-        return False
-
     @property
     def root_device(self) -> torch.device:
         pass
@@ -49,10 +49,6 @@ class ParallelStrategy(Strategy, ABC):
     def root_device(self) -> torch.device:
         """Return the root device."""
 
-    @property
-    def on_gpu(self) -> bool:
-        return self.root_device.type == "cuda" and torch.cuda.is_available()
-
     @property
     def on_tpu(self) -> bool:
         return self.root_device.type == "xla" and _XLA_AVAILABLE
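Note a subtle semantic shift visible in this hunk: the removed property required both a CUDA root device and `torch.cuda.is_available()`, while the inline replacements at the call sites check only the device type. A side-by-side sketch with illustrative helper names:

```python
import torch

def old_on_gpu(root_device: torch.device) -> bool:
    # what the removed ParallelStrategy.on_gpu returned
    return root_device.type == "cuda" and torch.cuda.is_available()

def new_check(root_device: torch.device) -> bool:
    # the inline check used at the rewritten call sites
    return root_device.type == "cuda"

# The two only disagree for a CUDA root device on a machine where CUDA is unavailable,
# a configuration the strategies are not expected to reach in practice.
device = torch.device("cuda", 0)
print(old_on_gpu(device), new_check(device))
```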
@@ -103,7 +99,7 @@ class ParallelStrategy(Strategy, ABC):
     def torch_distributed_backend(self):
         torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
         if torch_backend is None:
-            torch_backend = "nccl" if self.on_gpu else "gloo"
+            torch_backend = "nccl" if self.root_device.type == "cuda" else "gloo"
         return torch_backend
 
     @staticmethod
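The backend selection above resolves in two steps: an explicit `PL_TORCH_DISTRIBUTED_BACKEND` environment variable always wins, otherwise the default follows the root device. A self-contained sketch of that logic; the free function is illustrative, in the codebase it lives on `ParallelStrategy.torch_distributed_backend`:

```python
import os
import torch

def resolve_backend(root_device: torch.device) -> str:
    backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
    if backend is None:
        backend = "nccl" if root_device.type == "cuda" else "gloo"
    return backend

print(resolve_backend(torch.device("cpu")))      # "gloo" when the env variable is unset
print(resolve_backend(torch.device("cuda", 0)))  # "nccl" when the env variable is unset
```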
@@ -45,10 +45,6 @@ class SingleDeviceStrategy(Strategy):
     def on_tpu(self) -> bool:
         return self.root_device.type == "xla" and _XLA_AVAILABLE
 
-    @property
-    def on_gpu(self) -> bool:
-        return self.root_device.type == "cuda" and torch.cuda.is_available()
-
     def reduce(self, tensor: Any | torch.Tensor, *args: Any, **kwargs: Any) -> Any | torch.Tensor:
         """Reduces a tensor from several distributed processes to one aggregated tensor. As this plugin only
         operates with a single device, the reduction is simply the identity.
@@ -90,7 +86,7 @@ class SingleDeviceStrategy(Strategy):
 
     def teardown(self) -> None:
         super().teardown()
-        if self.on_gpu:
+        if self.root_device.type == "cuda":
             # GPU teardown
             self.lightning_module.cpu()
             # clean up memory
@@ -228,11 +228,6 @@ class Strategy(ABC):
             return model._apply_batch_transfer_handler(batch, device=device, dataloader_idx=dataloader_idx)
         return move_data_to_device(batch, device)
 
-    @property
-    @abstractmethod
-    def on_gpu(self) -> bool:
-        """Returns whether the current process is done on GPU."""
-
     @property
     @abstractmethod
     def on_tpu(self) -> bool:
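Since `on_gpu` was abstract on the base `Strategy`, custom strategies were expected to expose it; after this commit the override is simply dropped and device identity is expressed only through `root_device`. A hypothetical subclass sketch, assuming the `pytorch_lightning.strategies` import path of this release; the class and its behavior are illustrative, not part of the commit:

```python
import torch
from pytorch_lightning.strategies import SingleDeviceStrategy

class MyLoggingStrategy(SingleDeviceStrategy):
    # before this commit, device checks went through the strategy's `on_gpu` property;
    # after it, they go through `root_device` only.
    def teardown(self) -> None:
        if self.root_device.type == "cuda":
            print(f"releasing {self.root_device}")
        super().teardown()
```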
@@ -57,7 +57,6 @@ def test_ddp_cpu():
     # assert training type plugin attributes for device setting
 
     assert isinstance(trainer.strategy, DDPSpawnStrategy)
-    assert not trainer.strategy.on_gpu
     assert not trainer.strategy.on_tpu
     assert trainer.strategy.root_device == torch.device("cpu")
 
@@ -73,7 +72,6 @@ def test_ddp_spawn_extra_parameters(tmpdir):
     trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, strategy="ddp_spawn")
 
     assert isinstance(trainer.strategy, DDPSpawnStrategy)
-    assert trainer.strategy.on_gpu
     assert trainer.strategy.root_device == torch.device("cuda:0")
 
     val: float = 1.0
@@ -39,7 +39,6 @@ def test_ddp_with_2_gpus():
     trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True)
     # assert training type plugin attributes for device setting
     assert isinstance(trainer.strategy, DDPStrategy)
-    assert trainer.strategy.on_gpu
     assert not trainer.strategy.on_tpu
     local_rank = trainer.strategy.local_rank
     assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}")
@@ -23,10 +23,9 @@ from tests.helpers.runif import RunIf
 
 
 def test_single_cpu():
-    """Tests if on_gpu and on_tpu is set correctly for single CPU strategy."""
+    """Tests if on_tpu is set correctly for single CPU strategy."""
     trainer = Trainer()
     assert isinstance(trainer.strategy, SingleDeviceStrategy)
-    assert not trainer.strategy.on_gpu
     assert not trainer.strategy.on_tpu
     assert trainer.strategy.root_device == torch.device("cpu")
 
@@ -44,7 +43,6 @@ def test_single_gpu():
     trainer = Trainer(gpus=1, fast_dev_run=True)
     # assert training strategy attributes for device setting
     assert isinstance(trainer.strategy, SingleDeviceStrategy)
-    assert trainer.strategy.on_gpu
     assert not trainer.strategy.on_tpu
     assert trainer.strategy.root_device == torch.device("cuda:0")
 
@@ -96,7 +96,6 @@ def test_model_tpu_one_core():
     trainer = Trainer(tpu_cores=1, fast_dev_run=True, strategy=TPUSpawnStrategy(debug=True))
     # assert training strategy attributes for device setting
     assert isinstance(trainer.strategy, TPUSpawnStrategy)
-    assert not trainer.strategy.on_gpu
     assert trainer.strategy.on_tpu
     assert trainer.strategy.root_device == torch.device("xla", index=1)
     model = BoringModelTPU()
@@ -1544,13 +1544,10 @@ def test_index_batch_sampler_wrapper_with_iterable_dataset(dataset_cls, tmpdir):
 
 
 @pytest.mark.skipif(_IS_WINDOWS and not _TORCH_GREATER_EQUAL_1_8, reason="torch.distributed support required")
-@patch("torch.cuda.device_count", return_value=2)
-@patch("torch.cuda.is_available", return_value=True)
-@pytest.mark.parametrize("accelerator", ("cpu", "gpu"))
-def test_spawn_predict_return_predictions(_, __, accelerator):
+def test_spawn_predict_return_predictions(tmpdir):
     """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins."""
     model = BoringModel()
-    trainer = Trainer(accelerator=accelerator, strategy="ddp_spawn", devices=2, fast_dev_run=True)
+    trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True)
     assert isinstance(trainer.strategy, DDPSpawnStrategy)
     with pytest.raises(ProcessRaisedException, match="`return_predictions` should be set to `False`"):
         trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True)