diff --git a/CHANGELOG.md b/CHANGELOG.md index 4440036e9f..c562604e73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -261,7 +261,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed support for `CombinedLoader` while checking for warning raised with eval dataloaders ([#10994](https://github.com/PyTorchLightning/pytorch-lightning/pull/10994)) -- +- Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 35bb6f0847..cc9cd4937c 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -527,8 +527,8 @@ class DeepSpeedPlugin(DDPPlugin): deepspeed.checkpointing.configure( mpu_=None, partition_activations=checkpoint_config.get("partition_activations"), - contiguous_checkpointing=checkpoint_config.get("contiguous_checkpointing"), - checkpoint_in_cpu=checkpoint_config.get("checkpoint_in_cpu"), + contiguous_checkpointing=checkpoint_config.get("contiguous_memory_optimization"), + checkpoint_in_cpu=checkpoint_config.get("cpu_checkpointing"), profile=checkpoint_config.get("profile"), ) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 7cca6f6724..4b56e9d389 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -361,6 +361,36 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config["synchronize_checkpoint_boundary"] + +@RunIf(min_gpus=1, deepspeed=True, standalone=True) +def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): + """Ensure that if we modify the activation checkpointing parameters, they are passed to + 
deepspeed.checkpointing.configure correctly.""" + ds = DeepSpeedPlugin( + partition_activations=True, + cpu_checkpointing=True, + contiguous_memory_optimization=True, + synchronize_checkpoint_boundary=True, + ) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + enable_progress_bar=False, + fast_dev_run=1, + strategy=ds, + precision=16, + gpus=1, + ) + with mock.patch( + "deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure + ) as deepspeed_checkpointing_configure: + trainer.fit(model) + + deepspeed_checkpointing_configure.assert_called_with( + mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None + ) + + @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off offload_optimizer, that this is set to False within the config."""