lightning/pytorch_lightning/trainer/progress.py

272 lines
8.8 KiB
Python
Raw Normal View History

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import asdict, dataclass, field
from typing import Type
@dataclass
class BaseProgress:
"""Mixin that implements state-loading utilities for dataclasses."""
2021-09-03 00:15:09 +00:00
def state_dict(self) -> dict:
return asdict(self)
def load_state_dict(self, state_dict: dict) -> None:
self.__dict__.update(state_dict)
@classmethod
def from_state_dict(cls, state_dict: dict) -> "BaseProgress":
obj = cls()
obj.load_state_dict(state_dict)
return obj
@dataclass
class ReadyCompletedTracker(BaseProgress):
"""Track an event's progress.
Args:
ready: Intended to track the number of events ready to start.
completed: Intended to be incremented after the event completes (e.g. after ``on_*_end`` runs).
2021-09-03 00:15:09 +00:00
These attributes should be increased in order, that is, :attr:`ready` first and :attr:`completed` last.
"""
ready: int = 0
completed: int = 0
def reset(self) -> None:
"""Reset the state."""
self.ready = 0
self.completed = 0
def reset_on_restart(self) -> None:
"""Reset the progress on restart.
If there is a failure before all attributes are increased, restore the attributes to the last fully completed
value.
2021-09-03 00:15:09 +00:00
"""
self.ready = self.completed
@dataclass
class StartedTracker(ReadyCompletedTracker):
"""Track an event's progress.
Args:
ready: Intended to track the number of events ready to start.
started: Intended to be incremented after the event is started (e.g. after ``on_*_start`` runs).
completed: Intended to be incremented after the event completes (e.g. after ``on_*_end`` runs).
These attributes should be increased in order, that is, :attr:`ready` first and :attr:`completed` last.
"""
started: int = 0
def reset(self) -> None:
super().reset()
self.started = 0
def reset_on_restart(self) -> None:
super().reset_on_restart()
self.started = self.completed
@dataclass
class ProcessedTracker(StartedTracker):
"""Track an event's progress.
Args:
ready: Intended to track the number of events ready to start.
started: Intended to be incremented after the event is started (e.g. after ``on_*_start`` runs).
processed: Intended to be incremented after the event is processed.
completed: Intended to be incremented after the event completes (e.g. after ``on_*_end`` runs).
These attributes should be increased in order, that is, :attr:`ready` first and :attr:`completed` last.
"""
processed: int = 0
def reset(self) -> None:
super().reset()
self.processed = 0
def reset_on_restart(self) -> None:
super().reset_on_restart()
self.processed = self.completed
@dataclass
class Progress(BaseProgress):
"""Track aggregated and current progress.
Args:
2021-09-03 00:15:09 +00:00
total: Intended to track the total progress of an event.
current: Intended to track the current progress of an event.
"""
total: ReadyCompletedTracker = field(default_factory=ProcessedTracker)
current: ReadyCompletedTracker = field(default_factory=ProcessedTracker)
def __post_init__(self) -> None:
if type(self.total) is not type(self.current): # noqa: E721
raise ValueError("The `total` and `current` instances should be of the same class")
def increment_ready(self) -> None:
self.total.ready += 1
self.current.ready += 1
def increment_started(self) -> None:
if not isinstance(self.total, StartedTracker):
raise TypeError(f"`{self.total.__class__.__name__}` doesn't have a `started` attribute")
self.total.started += 1
self.current.started += 1
def increment_processed(self) -> None:
if not isinstance(self.total, ProcessedTracker):
raise TypeError(f"`{self.total.__class__.__name__}` doesn't have a `processed` attribute")
self.total.processed += 1
self.current.processed += 1
def increment_completed(self) -> None:
self.total.completed += 1
self.current.completed += 1
@classmethod
def from_defaults(cls, tracker_cls: Type[ReadyCompletedTracker], **kwargs: int) -> "Progress":
2021-09-03 00:15:09 +00:00
"""Utility function to easily create an instance from keyword arguments to both ``Tracker``s."""
return cls(total=tracker_cls(**kwargs), current=tracker_cls(**kwargs))
def reset_on_epoch(self) -> None:
self.current.reset()
def reset_on_run(self) -> None:
self.current.reset()
def reset_on_restart(self) -> None:
self.current.reset_on_restart()
def load_state_dict(self, state_dict: dict) -> None:
self.total.load_state_dict(state_dict["total"])
self.current.load_state_dict(state_dict["current"])
Add progress tracking on Loops - 2/n (#8362) * resolve issues * update * update * update * add more exceptions * resolve bug * update * update * update changelog * resolve bug * resolve comments * update * update * update changelog * update * update * remove space * update * add progress tracking to loops * validate json * update * convert to dict for better readability * validate reload * update * update * update on comments * remove deadcode * clean changelog * clean changelog * update * update on comments * CHANGELOG * CHANGELOG * Update pytorch_lightning/loops/base.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * whitespace suggestions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make fault_tolerant_enabled protected * whitespace fixes around Args * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * typo it's -> its * fix copy-paste typo in progress docstring * Delete classes * Minor change * docs * protected get_loops_state * merge restore_loops with restore_progress * Fix tests after removals * explicit save with trainer.save_checkpoint() * handle optimization restart based on optimizer_idx * update increments * update val batch progress and remove iteration count * update progress tracking for dataloader loops * remove self.dataloader_idx from eval_epoch_loop * add batch progress to predict loop * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * incorporate progress tracking for current_epoch * Fix test * Actually remove it * Remove unused TrainingEpochProgress * Fix optimization progress - missing scheduler * Restarting changes * Scheduler progress * Unused property, reset on epoch * Resolve FIXME * Remove FIXME * fix test_progress (wip) * fix batch_progress.current.reset * Hold off on split progress. Out of scope of this PR * Unnecessary if * fix structure in test_progress * structure * clean up unused variables in test_progress * refactor naming and organization in test_progress * Unnecessary variable * Remove unnecessary diff * Improve comment * Undo typing change to avoid polluting everything with mypy fixes * Fix and improve test_loops.py * Fix and organize `test_loop_state_dict` * Remove unnecessary checks in test * Update test after disallowing updates on None attributes * Typing * Minor test cleanup * Fix and move loop test * Move test from progress to loops * Reset the scheduler progress * SchedulerProgress fix * Consistent whitespace * Fix final test * Minor test changes * One test to rule them all * Formatting * Rename and clean variables * Shorter names * Shorter scheduler name * Fix optimizer step calculation for stop_batch=2 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove empty connects * Update CHANGELOG * Holy shit finally got the formula right * Fix final thing!!! * Do not check state dicts * parametrize multiple_dataloader progress test * Update CHANGELOG.md Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Justus Schock <justus.schock@posteo.de>
2021-07-19 08:31:45 +00:00
@dataclass
class DataLoaderProgress(Progress):
"""Tracks dataloader progress.
These counters are local to a trainer rank. By default, they are not globally synced across all ranks.
Args:
2021-09-03 00:15:09 +00:00
total: Tracks the total dataloader progress.
current: Tracks the current dataloader progress.
"""
total: ReadyCompletedTracker = field(default_factory=ReadyCompletedTracker)
current: ReadyCompletedTracker = field(default_factory=ReadyCompletedTracker)
Add progress tracking on Loops - 2/n (#8362) * resolve issues * update * update * update * add more exceptions * resolve bug * update * update * update changelog * resolve bug * resolve comments * update * update * update changelog * update * update * remove space * update * add progress tracking to loops * validate json * update * convert to dict for better readability * validate reload * update * update * update on comments * remove deadcode * clean changelog * clean changelog * update * update on comments * CHANGELOG * CHANGELOG * Update pytorch_lightning/loops/base.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * whitespace suggestions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make fault_tolerant_enabled protected * whitespace fixes around Args * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * typo it's -> its * fix copy-paste typo in progress docstring * Delete classes * Minor change * docs * protected get_loops_state * merge restore_loops with restore_progress * Fix tests after removals * explicit save with trainer.save_checkpoint() * handle optimization restart based on optimizer_idx * update increments * update val batch progress and remove iteration count * update progress tracking for dataloader loops * remove self.dataloader_idx from eval_epoch_loop * add batch progress to predict loop * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * incorporate progress tracking for current_epoch * Fix test * Actually remove it * Remove unused TrainingEpochProgress * Fix optimization progress - missing scheduler * Restarting changes * Scheduler progress * Unused property, reset on epoch * Resolve FIXME * Remove FIXME * fix test_progress (wip) * fix batch_progress.current.reset * Hold off on split progress. Out of scope of this PR * Unnecessary if * fix structure in test_progress * structure * clean up unused variables in test_progress * refactor naming and organization in test_progress * Unnecessary variable * Remove unnecessary diff * Improve comment * Undo typing change to avoid polluting everything with mypy fixes * Fix and improve test_loops.py * Fix and organize `test_loop_state_dict` * Remove unnecessary checks in test * Update test after disallowing updates on None attributes * Typing * Minor test cleanup * Fix and move loop test * Move test from progress to loops * Reset the scheduler progress * SchedulerProgress fix * Consistent whitespace * Fix final test * Minor test changes * One test to rule them all * Formatting * Rename and clean variables * Shorter names * Shorter scheduler name * Fix optimizer step calculation for stop_batch=2 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove empty connects * Update CHANGELOG * Holy shit finally got the formula right * Fix final thing!!! * Do not check state dicts * parametrize multiple_dataloader progress test * Update CHANGELOG.md Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Justus Schock <justus.schock@posteo.de>
2021-07-19 08:31:45 +00:00
@dataclass
class BatchProgress(Progress):
"""Tracks batch progress.
These counters are local to a trainer rank. By default, they are not globally synced across all ranks.
Args:
total: Tracks the total dataloader progress.
current: Tracks the current dataloader progress.
is_last_batch: Whether the batch is the last one. This is useful for iterable datasets.
"""
is_last_batch: bool = False
def reset_on_run(self) -> None:
super().reset_on_run()
self.is_last_batch = False
def load_state_dict(self, state_dict: dict) -> None:
super().load_state_dict(state_dict)
self.is_last_batch = state_dict["is_last_batch"]
@dataclass
Add progress tracking on Loops - 2/n (#8362) * resolve issues * update * update * update * add more exceptions * resolve bug * update * update * update changelog * resolve bug * resolve comments * update * update * update changelog * update * update * remove space * update * add progress tracking to loops * validate json * update * convert to dict for better readability * validate reload * update * update * update on comments * remove deadcode * clean changelog * clean changelog * update * update on comments * CHANGELOG * CHANGELOG * Update pytorch_lightning/loops/base.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * whitespace suggestions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make fault_tolerant_enabled protected * whitespace fixes around Args * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * typo it's -> its * fix copy-paste typo in progress docstring * Delete classes * Minor change * docs * protected get_loops_state * merge restore_loops with restore_progress * Fix tests after removals * explicit save with trainer.save_checkpoint() * handle optimization restart based on optimizer_idx * update increments * update val batch progress and remove iteration count * update progress tracking for dataloader loops * remove self.dataloader_idx from eval_epoch_loop * add batch progress to predict loop * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * incorporate progress tracking for current_epoch * Fix test * Actually remove it * Remove unused TrainingEpochProgress * Fix optimization progress - missing scheduler * Restarting changes * Scheduler progress * Unused property, reset on epoch * Resolve FIXME * Remove FIXME * fix test_progress (wip) * fix batch_progress.current.reset * Hold off on split progress. Out of scope of this PR * Unnecessary if * fix structure in test_progress * structure * clean up unused variables in test_progress * refactor naming and organization in test_progress * Unnecessary variable * Remove unnecessary diff * Improve comment * Undo typing change to avoid polluting everything with mypy fixes * Fix and improve test_loops.py * Fix and organize `test_loop_state_dict` * Remove unnecessary checks in test * Update test after disallowing updates on None attributes * Typing * Minor test cleanup * Fix and move loop test * Move test from progress to loops * Reset the scheduler progress * SchedulerProgress fix * Consistent whitespace * Fix final test * Minor test changes * One test to rule them all * Formatting * Rename and clean variables * Shorter names * Shorter scheduler name * Fix optimizer step calculation for stop_batch=2 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove empty connects * Update CHANGELOG * Holy shit finally got the formula right * Fix final thing!!! * Do not check state dicts * parametrize multiple_dataloader progress test * Update CHANGELOG.md Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Justus Schock <justus.schock@posteo.de>
2021-07-19 08:31:45 +00:00
class SchedulerProgress(Progress):
"""Tracks scheduler progress.
These counters are local to a trainer rank. By default, they are not globally synced across all ranks.
Args:
2021-09-03 00:15:09 +00:00
total: Tracks the total scheduler progress.
current: Tracks the current scheduler progress.
"""
total: ReadyCompletedTracker = field(default_factory=ReadyCompletedTracker)
current: ReadyCompletedTracker = field(default_factory=ReadyCompletedTracker)
@dataclass
class OptimizerProgress(BaseProgress):
"""Track optimizer progress.
Args:
step: Tracks ``optimizer.step`` calls.
zero_grad: Tracks ``optimizer.zero_grad`` calls.
"""
step: Progress = field(default_factory=lambda: Progress.from_defaults(ReadyCompletedTracker))
zero_grad: Progress = field(default_factory=lambda: Progress.from_defaults(StartedTracker))
def reset_on_run(self) -> None:
self.step.reset_on_run()
self.zero_grad.reset_on_run()
def reset_on_restart(self) -> None:
self.step.reset_on_restart()
self.zero_grad.reset_on_restart()
def load_state_dict(self, state_dict: dict) -> None:
self.step.load_state_dict(state_dict["step"])
self.zero_grad.load_state_dict(state_dict["zero_grad"])
@dataclass
class OptimizationProgress(BaseProgress):
"""Track optimization progress.
Args:
optimizer: Tracks optimizer progress.
optimizer_position: The index of the current optimizer amongst the currently active optimizers.
Used to know which optimizer we were using when restarting.
Since not all optimizers may be active at a given time, this index is different from the ``optimizer_idx``
seen in the optimization loops.
"""
# TODO: support for multiple optimizers
optimizer: OptimizerProgress = field(default_factory=OptimizerProgress)
optimizer_position: int = 0
@property
def optimizer_steps(self) -> int:
return self.optimizer.step.total.completed
def reset_on_run(self) -> None:
self.optimizer.reset_on_run()
def reset_on_restart(self) -> None:
self.optimizer.reset_on_restart()
def load_state_dict(self, state_dict: dict) -> None:
self.optimizer.load_state_dict(state_dict["optimizer"])
self.optimizer_position = state_dict["optimizer_position"]