2021-08-04 09:39:02 +00:00
|
|
|
# Copyright The PyTorch Lightning team.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
import os
|
|
|
|
|
|
|
|
import torch
|
|
|
|
|
|
|
|
from pytorch_lightning import Trainer
|
2021-12-21 15:18:01 +00:00
|
|
|
from pytorch_lightning.plugins import DeepSpeedStrategy
|
2021-08-04 09:39:02 +00:00
|
|
|
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict
|
|
|
|
from tests.helpers.boring_model import BoringModel
|
|
|
|
from tests.helpers.runif import RunIf
|
|
|
|
|
|
|
|
|
2021-11-26 17:13:14 +00:00
|
|
|
@RunIf(min_gpus=2, deepspeed=True, standalone=True)
def test_deepspeed_collate_checkpoint(tmpdir):
    """Check that the sharded checkpoints saved with DeepSpeed Stage 3 can be collated into one file."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=2,
        fast_dev_run=True,
        precision=16,
    )
    trainer.fit(model)

    # Route the target path through the strategy's broadcast before saving
    # (presumably so that every process writes to the same location).
    sharded_path = trainer.strategy.broadcast(os.path.join(tmpdir, "model.pt"))
    trainer.save_checkpoint(sharded_path)
    # Synchronise on the strategy's barrier before the checkpoint is read back.
    trainer.strategy.barrier()

    # Only the global-zero process converts and verifies the checkpoint.
    if not trainer.is_global_zero:
        return

    # ensure the conversion call works and yields the model's weights
    collated_path = os.path.join(tmpdir, "single_model.pt")
    convert_zero_checkpoint_to_fp32_state_dict(sharded_path, collated_path)
    _assert_checkpoint_equal(model, collated_path)
|
|
|
|
|
|
|
|
|
|
|
|
def _assert_checkpoint_equal(model, output_path):
    """Assert that the checkpoint file at ``output_path`` stores the same weights as ``model``.

    Args:
        model: module providing the reference ``state_dict()``; its ``dtype`` attribute is read to
            decide whether the saved tensors must be cast to half precision before comparing.
        output_path: path of the single-file checkpoint. It is loaded with ``torch.load`` and must
            contain a ``"state_dict"`` entry.

    Raises:
        AssertionError: if the file does not exist, the number of entries differs, or any pair of
            tensors is not exactly equal.
    """
    assert os.path.exists(output_path), f"checkpoint file not found: {output_path}"
    # Load onto the CPU so the comparison below never mixes devices.
    single_output = torch.load(output_path, map_location="cpu")
    state_dict = model.state_dict()
    saved_state_dict = single_output["state_dict"]
    # ``zip`` stops at the shorter input, so without this check a checkpoint with missing
    # (or extra) entries would pass unnoticed.
    assert len(state_dict) == len(saved_state_dict), (
        f"expected {len(state_dict)} entries in the saved state dict, found {len(saved_state_dict)}"
    )
    # Entries are paired by position, not by key.
    for orig_param, saved_model_param in zip(state_dict.values(), saved_state_dict.values()):
        if model.dtype == torch.half:
            # The model holds half-precision weights: cast the saved weights down to half
            # so both sides are compared in the same dtype.
            saved_model_param = saved_model_param.half()
        assert torch.equal(orig_param.cpu(), saved_model_param)
|