[DeepSpeed] fix flag forwarding in DeepSpeedPlugin (#10899)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Sean Naren <sean@grid.ai>
Co-authored-by: ananthsub <ananth.subramaniam@gmail.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
This commit is contained in:
jona-0 2021-12-14 15:56:08 +00:00 committed by GitHub
parent d7b6e87aeb
commit 7aee00c679
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 33 additions and 3 deletions

View File

@ -261,7 +261,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed support for `CombinedLoader` while checking for warning raised with eval dataloaders ([#10994](https://github.com/PyTorchLightning/pytorch-lightning/pull/10994))
- - Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874))
- -

View File

@ -527,8 +527,8 @@ class DeepSpeedPlugin(DDPPlugin):
deepspeed.checkpointing.configure(
    mpu_=None,
    partition_activations=checkpoint_config.get("partition_activations"),
    contiguous_checkpointing=checkpoint_config.get("contiguous_memory_optimization"),
    checkpoint_in_cpu=checkpoint_config.get("cpu_checkpointing"),
    profile=checkpoint_config.get("profile"),
)

View File

@ -361,6 +361,36 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir):
assert checkpoint_config["synchronize_checkpoint_boundary"]
@RunIf(min_gpus=1, deepspeed=True, standalone=True)
def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir):
    """Ensure if we modify the activation checkpointing parameters, we pass these to
    deepspeed.checkpointing.configure correctly."""
    # Plugin configured with non-default activation-checkpointing flags; the point of
    # the test is that these reach deepspeed.checkpointing.configure under the names
    # deepspeed expects (contiguous_checkpointing / checkpoint_in_cpu).
    plugin = DeepSpeedPlugin(
        partition_activations=True,
        cpu_checkpointing=True,
        contiguous_memory_optimization=True,
        synchronize_checkpoint_boundary=True,
    )
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        fast_dev_run=1,
        strategy=plugin,
        precision=16,
        gpus=1,
    )
    model = BoringModel()

    # Spy on the real configure call (wraps keeps deepspeed's actual behavior intact)
    # so we can assert on the exact keyword arguments Lightning forwards.
    patcher = mock.patch("deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure)
    with patcher as configure_spy:
        trainer.fit(model)
    configure_spy.assert_called_with(
        mpu_=None,
        partition_activations=True,
        contiguous_checkpointing=True,
        checkpoint_in_cpu=True,
        profile=None,
    )
@RunIf(min_gpus=1, deepspeed=True)
def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
    """Ensure if we use a config and turn off offload_optimizer, that this is set to False within the config."""