reduce only loss with dp (#11594)
Co-authored-by: Aki Nitta <nitta@akihironitta.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>

This commit is contained in:
parent f509e40ae3
commit 7ec1e66e17

@@ -259,6 +259,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Avoid enforcing `shuffle=False` for eval dataloaders ([#11575](https://github.com/PyTorchLightning/pytorch-lightning/pull/11575))
+- When using DP (data-parallel), Lightning will no longer automatically reduce all tensors returned in `training_step`; it will only reduce the loss unless `training_step_end` is overridden ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594))
+- When using DP (data-parallel), the `training_epoch_end` hook will no longer receive reduced outputs from `training_step` and will instead get the full tensor of results from all GPUs ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594))

 ### Deprecated

 - Deprecated `Trainer.{validated,tested,predicted}_ckpt_path` and replaced with read-only property `Trainer.ckpt_path` set when checkpoints loaded via `Trainer.{fit,validate,test,predict}` ([#11696](https://github.com/PyTorchLightning/pytorch-lightning/pull/11696))
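Concretely, the two new entries describe the following behavior under `strategy="dp"`. The sketch below is illustrative only and is not part of this commit: the module name, its random dataset, and the assertion shapes assume a machine with two CUDA devices.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class DPBehaviorModel(pl.LightningModule):
    """Toy module showing what training_epoch_end now receives under DP."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def train_dataloader(self):
        return DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=4)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        loss = self.layer(x).sum()
        # one value per GPU replica: 0.0 on cuda:0, 1.0 on cuda:1
        gpu_id = torch.tensor(float(x.device.index), device=x.device)
        return {"loss": loss, "gpu_id": gpu_id}

    def training_epoch_end(self, outputs):
        # the loss is still averaged across replicas by the DP strategy ...
        assert outputs[0]["loss"].shape == torch.Size([])
        # ... but every other tensor now arrives un-reduced, one entry per GPU
        assert outputs[0]["gpu_id"].shape == torch.Size([2])

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


# assumes a machine with at least 2 CUDA devices
trainer = pl.Trainer(gpus=2, strategy="dp", fast_dev_run=True)
trainer.fit(DPBehaviorModel())
```

Only the `loss` entry is averaged across the replicas, so automatic optimization keeps working unchanged; all other returned tensors are passed through to the epoch-end hooks as-is.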

@@ -137,18 +137,15 @@ class DataParallelStrategy(ParallelStrategy):
         return self.model(*args, **kwargs)

     def training_step_end(self, output):
-        if not is_overridden("training_step_end", self.lightning_module):
-            return self.reduce(output)
-        return output
+        if is_overridden("training_step_end", self.lightning_module):
+            return output

-    def validation_step_end(self, output):
-        if not is_overridden("validation_step_end", self.lightning_module):
-            return self.reduce(output)
-        return output
+        if isinstance(output, dict) and "loss" in output:
+            output["loss"] = self.reduce(output["loss"])
+
+        elif isinstance(output, torch.Tensor):
+            output = self.reduce(output)

-    def test_step_end(self, output):
-        if not is_overridden("test_step_end", self.lightning_module):
-            return self.reduce(output)
-        return output
+        return output

     def teardown(self) -> None:
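Because the rewritten hook returns early when `training_step_end` is overridden on the LightningModule, a model that still wants the old behavior of averaging every returned tensor can do that reduction itself. A minimal sketch, not part of this commit; only the relevant hook is shown, and the `ManualReductionModel` name is made up for illustration.

```python
import torch

import pytorch_lightning as pl


class ManualReductionModel(pl.LightningModule):
    """Sketch: opt back in to reducing every tensor across DP replicas."""

    def training_step_end(self, step_output):
        # Because this hook is overridden, DataParallelStrategy.training_step_end
        # (above) returns the gathered output untouched, so we reduce it here.
        # Every tensor is averaged across replicas; ints are cast to float so
        # that mean() is defined for them.
        if isinstance(step_output, torch.Tensor):
            return step_output.float().mean()
        if isinstance(step_output, dict):
            return {
                key: value.float().mean() if isinstance(value, torch.Tensor) else value
                for key, value in step_output.items()
            }
        return step_output
```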

@@ -134,8 +134,24 @@ class ReductionTestModel(BoringModel):
     def training_epoch_end(self, outputs):
         assert outputs[0]["loss"].shape == torch.Size([])
-        assert outputs[0]["reduce_int"].item() == 0  # mean([0, 1]) = 0
-        assert outputs[0]["reduce_float"].item() == 0.5  # mean([0., 1.]) = 0.5
+        self._assert_extra_outputs(outputs)
+
+    def validation_epoch_end(self, outputs):
+        assert outputs[0]["x"].shape == torch.Size([2])
+        self._assert_extra_outputs(outputs)
+
+    def test_epoch_end(self, outputs):
+        assert outputs[0]["y"].shape == torch.Size([2])
+        self._assert_extra_outputs(outputs)
+
+    def _assert_extra_outputs(self, outputs):
+        out = outputs[0]["reduce_int"]
+        assert torch.eq(out, torch.tensor([0, 1], device="cuda:0")).all()
+        assert out.dtype is torch.int
+
+        out = outputs[0]["reduce_float"]
+        assert torch.eq(out, torch.tensor([0.0, 1.0], device="cuda:0")).all()
+        assert out.dtype is torch.float


 def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, monkeypatch):
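The `reduce_int`, `reduce_float`, `x`, and `y` keys asserted above are produced by the model's step methods, which sit outside this hunk. As a rough illustration of where the `[0, 1]` values come from (an assumed reconstruction, not the exact helper in the test file), each step can tag its output with the index of the GPU it ran on:

```python
import torch


def add_reduce_outputs(output: dict, device: torch.device) -> dict:
    # Hypothetical helper: attach one int and one float tensor carrying the GPU
    # index, so on two GPUs the gathered, un-reduced values become tensor([0, 1]).
    output["reduce_int"] = torch.tensor(device.index, dtype=torch.int, device=device)
    output["reduce_float"] = torch.tensor(device.index, dtype=torch.float, device=device)
    return output
```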
@@ -188,11 +204,9 @@ def test_dp_training_step_dict(tmpdir):
     trainer = pl.Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_train_batches=1,
-        limit_val_batches=1,
-        limit_test_batches=1,
+        fast_dev_run=True,
         gpus=2,
         strategy="dp",
     )
     trainer.fit(model)
     trainer.test(model)
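A note on the trimmed `Trainer` arguments: `fast_dev_run=True` already runs a single batch each of training, validation, and test, so the explicit `max_epochs` and `limit_*_batches` flags become redundant once it is set.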