Fix deepspeed keeping old sub-folders in same ckpt path (#12194)

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Rohit Gupta 2022-03-23 19:06:13 +05:30 committed by GitHub
parent c822a6ac2d
commit 0a53e15759
3 changed files with 27 additions and 1 deletion

CHANGELOG.md

@@ -835,6 +835,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/PyTorchLightning/pytorch-lightning/pull/12131))
+- Fixed deepspeed keeping old sub-folders in same ckpt path ([#12194](https://github.com/PyTorchLightning/pytorch-lightning/pull/12194))
 - Fixed returning logged metrics instead of callback metrics during evaluation ([#12224](https://github.com/PyTorchLightning/pytorch-lightning/pull/12224))

pytorch_lightning/strategies/deepspeed.py

@@ -760,6 +760,7 @@ class DeepSpeedStrategy(DDPStrategy):
                 "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
                 f" is not supported for `{self.__class__.__name__}` as `CheckpointIO` is not used."
             )
+
         if self.zero_stage_3 and self._multi_device and self.is_global_zero:
             warning_cache.warn(
                 "When saving the DeepSpeed Stage 3 checkpoint, "
@@ -772,7 +773,7 @@ class DeepSpeedStrategy(DDPStrategy):
         # dump states as a checkpoint dictionary object
         _exclude_keys = ["state_dict", "optimizer_states"]
         checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
-        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint)
+        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
 
     def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
         if self.load_full_weights and self.zero_stage_3:
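
For context: DeepSpeed's `engine.save_checkpoint(save_dir, tag=...)` writes the sharded states into a sub-folder of `save_dir` named after `tag`, and with `tag=None` the name defaults to the current step (`global_step{N}`). Saving repeatedly to the same `filepath` therefore left one `global_step*` sub-folder behind per save; pinning `tag="checkpoint"` makes every save land in, and overwrite, the same sub-folder. A minimal plain-Python sketch of that accumulation, using a hypothetical `FakeEngine` that only mimics the tag defaulting (it is not the real DeepSpeed engine):

```python
import os
import tempfile


class FakeEngine:
    """Toy stand-in that mimics DeepSpeed's tag defaulting; illustration only."""

    def __init__(self):
        self.global_steps = 0

    def save_checkpoint(self, save_dir, tag=None):
        # The shard sub-folder is named after `tag`, defaulting to the step count.
        tag = tag if tag is not None else f"global_step{self.global_steps}"
        os.makedirs(os.path.join(save_dir, tag), exist_ok=True)


engine = FakeEngine()

# Old behaviour: each save at a new step leaves a fresh sub-folder behind.
old_dir = tempfile.mkdtemp()
for step in (1, 2, 3):
    engine.global_steps = step
    engine.save_checkpoint(old_dir)
print(sorted(os.listdir(old_dir)))  # ['global_step1', 'global_step2', 'global_step3']

# Fixed behaviour: a pinned tag reuses the same sub-folder across saves.
new_dir = tempfile.mkdtemp()
for step in (1, 2, 3):
    engine.global_steps = step
    engine.save_checkpoint(new_dir, tag="checkpoint")
print(sorted(os.listdir(new_dir)))  # ['checkpoint']
```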

tests/strategies/test_deepspeed_strategy.py

@@ -1085,3 +1085,25 @@ def test_deepspeed_with_meta_device(tmpdir):
     )
     trainer.fit(model)
     assert model.layer.weight.device.type == "cpu"
+
+
+@RunIf(min_gpus=2, deepspeed=True, standalone=True)
+def test_deepspeed_multi_save_same_filepath(tmpdir):
+    """Test that verifies that deepspeed saves only latest checkpoint in the specified path and deletes the old
+    sharded checkpoints."""
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        strategy="deepspeed",
+        accelerator="gpu",
+        devices=2,
+        callbacks=[ModelCheckpoint(save_top_k=1, save_last=True)],
+        limit_train_batches=1,
+        limit_val_batches=0,
+        num_sanity_val_steps=0,
+        max_epochs=2,
+    )
+    trainer.fit(model)
+    ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, "last.ckpt")
+    expected = ["latest", "zero_to_fp32.py", "checkpoint"]
+    assert set(expected) == set(os.listdir(ckpt_path))
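
After the fix, the `last.ckpt` path saved under DeepSpeed is a directory whose only entries are the single `checkpoint` shard sub-folder, the `latest` tag file, and DeepSpeed's `zero_to_fp32.py` conversion script, exactly the `expected` set the test asserts. To turn such a directory into a single file loadable with plain `torch.load`, Lightning wraps DeepSpeed's conversion helper; a sketch, assuming a finished run whose checkpoint directory is at `ckpt_path` as in the test above:

```python
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

# Consolidate the sharded states under `ckpt_path/checkpoint` into one fp32 file.
convert_zero_checkpoint_to_fp32_state_dict(ckpt_path, "consolidated.ckpt")
```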