diff --git a/CHANGELOG.md b/CHANGELOG.md
index bd92a19c11..927d71bf9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -835,6 +835,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/PyTorchLightning/pytorch-lightning/pull/12131))
 
+- Fixed deepspeed keeping old sub-folders in same ckpt path ([#12194](https://github.com/PyTorchLightning/pytorch-lightning/pull/12194))
+
+
 - Fixed returning logged metrics instead of callback metrics during evaluation ([#12224](https://github.com/PyTorchLightning/pytorch-lightning/pull/12224))
 
 
diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py
index 133dc1ec75..bdec69c43b 100644
--- a/pytorch_lightning/strategies/deepspeed.py
+++ b/pytorch_lightning/strategies/deepspeed.py
@@ -760,6 +760,7 @@ class DeepSpeedStrategy(DDPStrategy):
                 "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
                 f" is not supported for `{self.__class__.__name__}` as `CheckpointIO` is not used."
             )
+
         if self.zero_stage_3 and self._multi_device and self.is_global_zero:
             warning_cache.warn(
                 "When saving the DeepSpeed Stage 3 checkpoint, "
@@ -772,7 +773,7 @@ class DeepSpeedStrategy(DDPStrategy):
         # dump states as a checkpoint dictionary object
         _exclude_keys = ["state_dict", "optimizer_states"]
         checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
-        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint)
+        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
 
     def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
         if self.load_full_weights and self.zero_stage_3:
diff --git a/tests/strategies/test_deepspeed_strategy.py b/tests/strategies/test_deepspeed_strategy.py
index 272da2d3b9..6891c0b397 100644
--- a/tests/strategies/test_deepspeed_strategy.py
+++ b/tests/strategies/test_deepspeed_strategy.py
@@ -1085,3 +1085,25 @@ def test_deepspeed_with_meta_device(tmpdir):
     )
     trainer.fit(model)
     assert model.layer.weight.device.type == "cpu"
+
+
+@RunIf(min_gpus=2, deepspeed=True, standalone=True)
+def test_deepspeed_multi_save_same_filepath(tmpdir):
+    """Test that verifies that deepspeed saves only latest checkpoint in the specified path and deletes the old
+    sharded checkpoints."""
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        strategy="deepspeed",
+        accelerator="gpu",
+        devices=2,
+        callbacks=[ModelCheckpoint(save_top_k=1, save_last=True)],
+        limit_train_batches=1,
+        limit_val_batches=0,
+        num_sanity_val_steps=0,
+        max_epochs=2,
+    )
+    trainer.fit(model)
+    ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, "last.ckpt")
+    expected = ["latest", "zero_to_fp32.py", "checkpoint"]
+    assert set(expected) == set(os.listdir(ckpt_path))