Fix deepspeed keeping old sub-folders in same ckpt path (#12194)

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Rohit Gupta 2022-03-23 19:06:13 +05:30 committed by GitHub
parent c822a6ac2d
commit 0a53e15759
3 changed files with 27 additions and 1 deletion

CHANGELOG.md

@@ -835,6 +835,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/PyTorchLightning/pytorch-lightning/pull/12131))
+- Fixed deepspeed keeping old sub-folders in same ckpt path ([#12194](https://github.com/PyTorchLightning/pytorch-lightning/pull/12194))
 - Fixed returning logged metrics instead of callback metrics during evaluation ([#12224](https://github.com/PyTorchLightning/pytorch-lightning/pull/12224))

pytorch_lightning/strategies/deepspeed.py

@@ -760,6 +760,7 @@ class DeepSpeedStrategy(DDPStrategy):
                 "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
                 f" is not supported for `{self.__class__.__name__}` as `CheckpointIO` is not used."
             )
+
         if self.zero_stage_3 and self._multi_device and self.is_global_zero:
             warning_cache.warn(
                 "When saving the DeepSpeed Stage 3 checkpoint, "
@@ -772,7 +773,7 @@ class DeepSpeedStrategy(DDPStrategy):
         # dump states as a checkpoint dictionary object
         _exclude_keys = ["state_dict", "optimizer_states"]
         checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
-        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint)
+        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
 
     def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
         if self.load_full_weights and self.zero_stage_3:
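
For context: DeepSpeed's `engine.save_checkpoint(save_dir, tag=...)` writes the sharded states into a sub-folder of `save_dir` named after `tag`, and with `tag=None` the name defaults to the current step (`global_step{N}`). Saving repeatedly to the same `filepath` therefore left one `global_step*` sub-folder behind per save; pinning `tag="checkpoint"` makes every save land in, and overwrite, the same sub-folder. A minimal plain-Python sketch of that accumulation, using a hypothetical `FakeEngine` that only mimics the tag defaulting (it is not the real DeepSpeed engine):

```python
import os
import tempfile


class FakeEngine:
    """Toy stand-in that mimics DeepSpeed's tag defaulting; illustration only."""

    def __init__(self):
        self.global_steps = 0

    def save_checkpoint(self, save_dir, tag=None):
        # The shard sub-folder is named after `tag`, defaulting to the step count.
        tag = tag if tag is not None else f"global_step{self.global_steps}"
        os.makedirs(os.path.join(save_dir, tag), exist_ok=True)


engine = FakeEngine()

# Old behaviour: each save at a new step leaves a fresh sub-folder behind.
old_dir = tempfile.mkdtemp()
for step in (1, 2, 3):
    engine.global_steps = step
    engine.save_checkpoint(old_dir)
print(sorted(os.listdir(old_dir)))  # ['global_step1', 'global_step2', 'global_step3']

# Fixed behaviour: a pinned tag reuses the same sub-folder across saves.
new_dir = tempfile.mkdtemp()
for step in (1, 2, 3):
    engine.global_steps = step
    engine.save_checkpoint(new_dir, tag="checkpoint")
print(sorted(os.listdir(new_dir)))  # ['checkpoint']
```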

tests/strategies/test_deepspeed_strategy.py

@@ -1085,3 +1085,25 @@ def test_deepspeed_with_meta_device(tmpdir):
     )
     trainer.fit(model)
     assert model.layer.weight.device.type == "cpu"
+
+
+@RunIf(min_gpus=2, deepspeed=True, standalone=True)
+def test_deepspeed_multi_save_same_filepath(tmpdir):
+    """Test that verifies that deepspeed saves only latest checkpoint in the specified path and deletes the old
+    sharded checkpoints."""
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        strategy="deepspeed",
+        accelerator="gpu",
+        devices=2,
+        callbacks=[ModelCheckpoint(save_top_k=1, save_last=True)],
+        limit_train_batches=1,
+        limit_val_batches=0,
+        num_sanity_val_steps=0,
+        max_epochs=2,
+    )
+    trainer.fit(model)
+    ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, "last.ckpt")
+    expected = ["latest", "zero_to_fp32.py", "checkpoint"]
+    assert set(expected) == set(os.listdir(ckpt_path))
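
After the fix, the `last.ckpt` path saved under DeepSpeed is a directory whose only entries are the single `checkpoint` shard sub-folder, the `latest` tag file, and DeepSpeed's `zero_to_fp32.py` conversion script, exactly the `expected` set the test asserts. To turn such a directory into a single file loadable with plain `torch.load`, Lightning wraps DeepSpeed's conversion helper; a sketch, assuming a finished run whose checkpoint directory is at `ckpt_path` as in the test above:

```python
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

# Consolidate the sharded states under `ckpt_path/checkpoint` into one fp32 file.
convert_zero_checkpoint_to_fp32_state_dict(ckpt_path, "consolidated.ckpt")
```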