lightning/pytorch_lightning/tuner/tuning.py

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pytorch_lightning.tuner.batch_size_scaling import scale_batch_size
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
from pytorch_lightning.tuner.lr_finder import _run_lr_finder_internally, lr_find
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.core.datamodule import LightningDataModule
from typing import Optional, List, Union
from torch.utils.data import DataLoader


class Tuner:
    """Entry point for the trainer's pre-fit tuning routines.

    The class is a thin facade: it keeps a reference to a trainer and forwards
    to the batch-size scaler, the learning-rate finder and the GPU picker that
    this module imports. :meth:`tune` decides which routines to run from the
    ``auto_scale_batch_size`` and ``auto_lr_find`` attributes of the trainer.
    """

    def __init__(self, trainer):
        """Remember the trainer that all tuning routines will operate on."""
        self.trainer = trainer

    def on_trainer_init(self, auto_lr_find, auto_scale_batch_size):
        """Store both tuning flags on the trainer, where :meth:`tune` reads them."""
        self.trainer.auto_lr_find = auto_lr_find
        self.trainer.auto_scale_batch_size = auto_scale_batch_size

    def tune(
            self,
            model: LightningModule,
            train_dataloader: Optional[DataLoader] = None,
            val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
            datamodule: Optional[LightningDataModule] = None,
    ):
        """
        Run whichever tuning routines are enabled on the trainer.

        The data arguments default to ``None`` so that the signature lines up
        with :meth:`lr_find`; callers that pass all four arguments positionally
        are unaffected.

        Args:
            model: Model to tune.

            train_dataloader: handed to the train loop's ``setup_fit`` and, if
                batch size scaling runs, on to :meth:`scale_batch_size`.

            val_dataloaders: forwarded the same way as ``train_dataloader``.

            datamodule: forwarded the same way as ``train_dataloader``.
        """
        # setup data, etc...
        # NOTE(review): assumes ``setup_fit`` accepts ``None`` for the data
        # arguments that were left at their defaults - confirm against the train loop.
        self.trainer.train_loop.setup_fit(model, train_dataloader, val_dataloaders, datamodule)

        # hook
        self.trainer.data_connector.prepare_data(model)

        # Run auto batch size scaling
        if self.trainer.auto_scale_batch_size:
            # a plain ``True`` selects the 'power' search mode; the resolved
            # mode is written back to the trainer attribute
            if isinstance(self.trainer.auto_scale_batch_size, bool):
                self.trainer.auto_scale_batch_size = 'power'
            self.scale_batch_size(
                model,
                mode=self.trainer.auto_scale_batch_size,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloaders,
                datamodule=datamodule,
            )
            model.logger = self.trainer.logger  # reset logger binding

        # Run learning rate finder:
        if self.trainer.auto_lr_find:
            self.internal_find_lr(self.trainer, model)
            model.logger = self.trainer.logger  # reset logger binding

    def scale_batch_size(self,
                         model,
                         mode: str = 'power',
                         steps_per_trial: int = 3,
                         init_val: int = 2,
                         max_trials: int = 25,
                         batch_arg_name: str = 'batch_size',
                         **fit_kwargs):
        r"""
        Will iteratively try to find the largest batch size for a given model
        that does not give an out of memory (OOM) error.

        Args:
            model: Model to fit.

            mode: string setting the search mode. Either `power` or `binsearch`.
                If mode is `power` we keep multiplying the batch size by 2, until
                we get an OOM error. If mode is 'binsearch', we will initially
                also keep multiplying by 2 and after encountering an OOM error
                do a binary search between the last successful batch size and the
                batch size that failed.

            steps_per_trial: number of steps to run with a given batch size.
                Ideally 1 should be enough to test if an OOM error occurs,
                however in practice a few are needed

            init_val: initial batch size to start the search with

            max_trials: max number of increase in batch size done before
               algorithm is terminated

            batch_arg_name: name of the attribute that stores the batch size.
                It is expected that the user has provided a model or datamodule that has a hyperparameter
                with that name. We will look for this attribute name in the following places

                - `model`
                - `model.hparams`
                - `model.datamodule`
                - `trainer.datamodule` (the datamodule passed to the tune method)

            **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader
                or datamodule.

        Returns:
            Whatever the module-level ``scale_batch_size`` function returns.
        """
        # the trainer goes first, the remaining arguments follow in signature order
        return scale_batch_size(
            self.trainer, model, mode, steps_per_trial, init_val, max_trials, batch_arg_name, **fit_kwargs
        )

    def lr_find(
            self,
            model: LightningModule,
            train_dataloader: Optional[DataLoader] = None,
            val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
            min_lr: float = 1e-8,
            max_lr: float = 1,
            num_training: int = 100,
            mode: str = 'exponential',
            early_stop_threshold: float = 4.0,
            datamodule: Optional[LightningDataModule] = None
    ):
        """
        Run the learning rate finder on ``model`` with this tuner's trainer.

        Every argument is passed on unchanged and positionally, in signature
        order, to ``lr_find`` from ``pytorch_lightning.tuner.lr_finder``, with
        the trainer prepended. See that function for the meaning of ``min_lr``,
        ``max_lr``, ``num_training``, ``mode`` and ``early_stop_threshold``.

        Returns:
            Whatever the underlying ``lr_find`` function returns.
        """
        return lr_find(
            self.trainer,
            model,
            train_dataloader,
            val_dataloaders,
            min_lr,
            max_lr,
            num_training,
            mode,
            early_stop_threshold,
            datamodule,
        )

    def internal_find_lr(self, trainer, model: LightningModule):
        """
        Run the learning rate finder the way :meth:`tune` triggers it.

        Uses the ``trainer`` argument rather than ``self.trainer`` and returns
        the result of ``_run_lr_finder_internally(trainer, model)``.
        """
        return _run_lr_finder_internally(trainer, model)

    def pick_multiple_gpus(self, num_gpus: int):
        """Return the result of the module-level ``pick_multiple_gpus(num_gpus)``."""
        return pick_multiple_gpus(num_gpus)