Fix deepspeed keeping old sub-folders in same ckpt path (#12194)
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
commit 0a53e15759 (parent c822a6ac2d)
CHANGELOG.md:

```diff
@@ -835,6 +835,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/PyTorchLightning/pytorch-lightning/pull/12131))
 
+- Fixed deepspeed keeping old sub-folders in same ckpt path ([#12194](https://github.com/PyTorchLightning/pytorch-lightning/pull/12194))
+
 - Fixed returning logged metrics instead of callback metrics during evaluation ([#12224](https://github.com/PyTorchLightning/pytorch-lightning/pull/12224))
```
pytorch_lightning/strategies/deepspeed.py:

```diff
@@ -760,6 +760,7 @@ class DeepSpeedStrategy(DDPStrategy):
                 "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
                 f" is not supported for `{self.__class__.__name__}` as `CheckpointIO` is not used."
             )
 
         if self.zero_stage_3 and self._multi_device and self.is_global_zero:
             warning_cache.warn(
                 "When saving the DeepSpeed Stage 3 checkpoint, "
@@ -772,7 +773,7 @@ class DeepSpeedStrategy(DDPStrategy):
         # dump states as a checkpoint dictionary object
         _exclude_keys = ["state_dict", "optimizer_states"]
         checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
-        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint)
+        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
 
     def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
         if self.load_full_weights and self.zero_stage_3:
```
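Why the one-argument change fixes it: DeepSpeed's `engine.save_checkpoint(save_dir, tag=..., ...)` writes each save into a `save_dir/<tag>/` sub-folder, and when `tag` is left unset it defaults to a step-dependent name such as `global_step<N>`. Saving repeatedly to the same checkpoint path therefore accumulated one stale sub-folder per save; pinning `tag="checkpoint"` makes every save reuse and overwrite a single sub-folder. Below is a minimal stand-in sketch of that directory behaviour, not DeepSpeed's actual implementation; `fake_save_checkpoint` is a hypothetical helper:

```python
import os
import shutil
import tempfile
from typing import Optional


def fake_save_checkpoint(save_dir: str, step: int, tag: Optional[str] = None) -> None:
    """Mimic DeepSpeed's layout: shard files go into one sub-folder per tag."""
    tag = tag if tag is not None else f"global_step{step}"  # step-dependent default
    shard_dir = os.path.join(save_dir, tag)
    os.makedirs(shard_dir, exist_ok=True)
    with open(os.path.join(shard_dir, "mp_rank_00_model_states.pt"), "w") as f:
        f.write(f"states at step {step}")


ckpt_path = tempfile.mkdtemp()

# Old behaviour: the default tag changes on every save, so sub-folders pile up.
for step in (1, 2, 3):
    fake_save_checkpoint(ckpt_path, step)
assert sorted(os.listdir(ckpt_path)) == ["global_step1", "global_step2", "global_step3"]

shutil.rmtree(ckpt_path)
ckpt_path = tempfile.mkdtemp()

# Fixed behaviour (#12194): a constant tag reuses one sub-folder per save.
for step in (1, 2, 3):
    fake_save_checkpoint(ckpt_path, step, tag="checkpoint")
assert os.listdir(ckpt_path) == ["checkpoint"]
```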
Tests:

```diff
@@ -1085,3 +1085,25 @@ def test_deepspeed_with_meta_device(tmpdir):
     )
     trainer.fit(model)
     assert model.layer.weight.device.type == "cpu"
+
+
+@RunIf(min_gpus=2, deepspeed=True, standalone=True)
+def test_deepspeed_multi_save_same_filepath(tmpdir):
+    """Test that verifies that deepspeed saves only latest checkpoint in the specified path and deletes the old
+    sharded checkpoints."""
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        strategy="deepspeed",
+        accelerator="gpu",
+        devices=2,
+        callbacks=[ModelCheckpoint(save_top_k=1, save_last=True)],
+        limit_train_batches=1,
+        limit_val_batches=0,
+        num_sanity_val_steps=0,
+        max_epochs=2,
+    )
+    trainer.fit(model)
+    ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, "last.ckpt")
+    expected = ["latest", "zero_to_fp32.py", "checkpoint"]
+    assert set(expected) == set(os.listdir(ckpt_path))
```
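The asserted directory contents match what DeepSpeed leaves behind after a save with a fixed tag: `latest` is a small text file naming the most recent tag, `zero_to_fp32.py` is the consolidation script DeepSpeed copies next to the shards, and `checkpoint/` is the single reused shard sub-folder. As a hedged sketch (assuming DeepSpeed is installed and `last.ckpt` was written as in the test above; the path is a placeholder), such a sharded directory can be collapsed into one fp32 state dict with DeepSpeed's bundled utility:

```python
# Sketch: consolidate the sharded "last.ckpt" directory into a plain state dict.
# The "latest" file inside the directory tells the utility which tag to load.
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

state_dict = get_fp32_state_dict_from_zero_checkpoint("path/to/last.ckpt")
print(sorted(state_dict))  # parameter names, now un-sharded fp32 tensors
```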