From b77aa718ded3cc27b87efd1f9a488cbfc88dde05 Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Mon, 1 Nov 2021 22:12:14 +0530
Subject: [PATCH] Changed the model size calculation using `ByteCounter` (#10123)

---
 CHANGELOG.md                          |  1 +
 pytorch_lightning/core/lightning.py   |  5 +++++
 pytorch_lightning/utilities/memory.py | 27 +++++++++++++++++++--------
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e0c28cb752..7bda865002 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -172,6 +172,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed default value of the `max_steps` Trainer argument from `None` to -1 ([#9460](https://github.com/PyTorchLightning/pytorch-lightning/pull/9460))
 - LightningModule now raises an error when calling `log(on_step=False, on_epoch=False)` ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227))
 - Quantization aware training observers are now disabled by default during validating/testing/predicting stages ([#8540](https://github.com/PyTorchLightning/pytorch-lightning/pull/8540))
+- Changed the model size calculation using `ByteCounter` ([#10123](https://github.com/PyTorchLightning/pytorch-lightning/pull/10123))
 - Enabled `on_load_checkpoint` for `LightningDataModule` for all `trainer_fn` ([#10238](https://github.com/PyTorchLightning/pytorch-lightning/pull/10238))
 
 
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index f1a4213397..c59193859b 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -1991,6 +1991,11 @@ class LightningModule(
 
     @property
     def model_size(self) -> float:
+        """Returns the model size in megabytes (MB).
+
+        Note:
+            This property will not return the correct value for DeepSpeed (stage 3) and fully-sharded training.
+        """
         rank_zero_deprecation(
             "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7."
             " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.",
diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py
index 717e8b5e44..e810c70b34 100644
--- a/pytorch_lightning/utilities/memory.py
+++ b/pytorch_lightning/utilities/memory.py
@@ -16,7 +16,6 @@ import gc
 import os
 import shutil
 import subprocess
-import uuid
 from typing import Any, Dict
 
 import torch
@@ -25,6 +24,20 @@ from torch.nn import Module
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 
 
+class ByteCounter:
+    """Accumulates and stores the total number of bytes written to it."""
+
+    def __init__(self) -> None:
+        self.nbytes: int = 0
+
+    def write(self, data: bytes) -> None:
+        """Adds the length of the data to the running byte count."""
+        self.nbytes += len(data)
+
+    def flush(self) -> None:
+        pass
+
+
 def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any:
     """Detach all tensors in `in_dict`.
 
@@ -163,17 +176,15 @@ def get_gpu_memory_map() -> Dict[str, float]:
 
 
 def get_model_size_mb(model: Module) -> float:
-    """Calculates the size of a Module in megabytes by saving the model to a temporary file and reading its size.
+    """Calculates the size of a Module in megabytes.
 
     The computation includes everything in the :meth:`~torch.nn.Module.state_dict`,
-    i.e., by default the parameteters and buffers.
+    i.e., by default the parameters and buffers.
 
     Returns:
         Number of megabytes in the parameters of the input module.
""" - # TODO: Implement a method without needing to download the model - tmp_name = f"{uuid.uuid4().hex}.pt" - torch.save(model.state_dict(), tmp_name) - size_mb = os.path.getsize(tmp_name) / 1e6 - os.remove(tmp_name) + model_size = ByteCounter() + torch.save(model.state_dict(), model_size) + size_mb = model_size.nbytes / 1e6 return size_mb