# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from unittest import mock

import pytest
import torch
from torch import nn
from torch.optim.swa_utils import SWALR
from torch.utils.data import DataLoader

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
from pytorch_lightning.strategies import DDPSpawnStrategy, Strategy
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests_pytorch.helpers.datasets import RandomIterableDataset
from tests_pytorch.helpers.runif import RunIf


class SwaTestModel(BoringModel):
    def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False):
        super().__init__()
        layers = [nn.Linear(32, 32)]
        if batchnorm:
            layers.append(nn.BatchNorm1d(32))
        layers += [nn.ReLU(), nn.Linear(32, 2)]
        self.layer = nn.Sequential(*layers)
        self.interval = interval
        self.iterable_dataset = iterable_dataset

    def training_step(self, batch, batch_idx):
        output = self.forward(batch)
        loss = self.loss(batch, output)
        return {"loss": loss}

    def train_dataloader(self):
        dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset
        dset = dset_cls(32, 64)
        return DataLoader(dset, batch_size=2)

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": torch.optim.lr_scheduler.StepLR(optimizer, step_size=1),
                "interval": self.interval,
            },
        }


class SwaTestCallback(StochasticWeightAveraging):
    update_parameters_calls: int = 0
    transfer_weights_calls: int = 0

    def update_parameters(self, *args, **kwargs):
        self.update_parameters_calls += 1
        return StochasticWeightAveraging.update_parameters(*args, **kwargs)

    def transfer_weights(self, *args, **kwargs):
        self.transfer_weights_calls += 1
        return StochasticWeightAveraging.transfer_weights(*args, **kwargs)

    def on_train_epoch_start(self, trainer, *args):
        super().on_train_epoch_start(trainer, *args)
        assert trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end)
        if self.swa_start <= trainer.current_epoch:
            assert isinstance(trainer.lr_scheduler_configs[0].scheduler, SWALR)
            assert trainer.lr_scheduler_configs[0].interval == "epoch"
            assert trainer.lr_scheduler_configs[0].frequency == 1

    def on_train_epoch_end(self, trainer, *args):
        super().on_train_epoch_end(trainer, *args)
        if self.swa_start <= trainer.current_epoch <= self.swa_end:
            swa_epoch = trainer.current_epoch - self.swa_start
            assert self.n_averaged == swa_epoch + 1
            # the scheduler is stepped once on initialization and then at the end of each epoch
            assert self._swa_scheduler._step_count == swa_epoch + 2
        elif trainer.current_epoch > self.swa_end:
            assert self.n_averaged == self._max_epochs - self.swa_start

    def on_train_end(self, trainer, pl_module):
        super().on_train_end(trainer, pl_module)

        # make sure these are correctly set again
        assert not trainer.fit_loop._skip_backward
        assert trainer.accumulate_grad_batches == 2
        assert trainer.num_training_batches == 5

        if not isinstance(trainer.strategy, DDPSpawnStrategy):
            # check the backward call count: the batchnorm-update epoch should not run backward
            assert trainer.strategy.backward.call_count == trainer.max_epochs * trainer.limit_train_batches

        # check call counts
        assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1)
        assert self.transfer_weights_calls == 1
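# For orientation, a minimal sketch of the plain-PyTorch SWA recipe that the
# callback under test automates (kept as a comment so it is not executed with
# the test suite; the model, data, and hyperparameters below are arbitrary
# placeholders, not values used by the tests):
#
#     import torch
#     from torch import nn
#     from torch.optim.swa_utils import SWALR, AveragedModel, update_bn
#     from torch.utils.data import DataLoader, TensorDataset
#
#     model = nn.Sequential(nn.Linear(32, 32), nn.BatchNorm1d(32), nn.ReLU(), nn.Linear(32, 2))
#     loader = DataLoader(TensorDataset(torch.randn(64, 32), torch.randn(64, 2)), batch_size=2)
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
#     swa_model = AveragedModel(model)  # keeps a running average of the weights
#     swa_scheduler = SWALR(optimizer, swa_lr=0.1)  # anneals each group towards its SWA LR
#     swa_start, max_epochs = 2, 5
#     for epoch in range(max_epochs):
#         for x, y in loader:
#             optimizer.zero_grad()
#             nn.functional.mse_loss(model(x), y).backward()
#             optimizer.step()
#         if epoch >= swa_start:
#             swa_model.update_parameters(model)  # accumulate into the average
#             swa_scheduler.step()
#     update_bn(loader, swa_model)  # recompute BatchNorm statistics for the averaged weights

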
def train_with_swa(
    tmpdir,
    batchnorm=True,
    strategy=None,
    accelerator="cpu",
    devices=1,
    interval="epoch",
    iterable_dataset=False,
):
    model = SwaTestModel(batchnorm=batchnorm, interval=interval, iterable_dataset=iterable_dataset)
    swa_start = 2
    max_epochs = 5
    swa_callback = SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1)
    assert swa_callback.update_parameters_calls == 0
    assert swa_callback.transfer_weights_calls == 0

    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        enable_model_summary=False,
        max_epochs=max_epochs,
        limit_train_batches=5,
        limit_val_batches=0,
        callbacks=[swa_callback],
        accumulate_grad_batches=2,
        strategy=strategy,
        accelerator=accelerator,
        devices=devices,
    )

    with mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward):
        trainer.fit(model)

    # check that the fitted model is the one we passed in
    assert trainer.lightning_module == model


@RunIf(min_cuda_gpus=2, standalone=True)
def test_swa_callback_ddp(tmpdir):
    train_with_swa(tmpdir, strategy="ddp", accelerator="gpu", devices=2)


@RunIf(min_cuda_gpus=2)
def test_swa_callback_ddp_spawn(tmpdir):
    train_with_swa(tmpdir, strategy="ddp_spawn", accelerator="gpu", devices=2)


@RunIf(skip_windows=True)
def test_swa_callback_ddp_cpu(tmpdir):
    train_with_swa(tmpdir, strategy="ddp_spawn", accelerator="cpu", devices=2)


@pytest.mark.parametrize(
    "accelerator", [pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)), pytest.param("mps", marks=RunIf(mps=True))]
)
def test_swa_callback_1_gpu(tmpdir, accelerator):
    train_with_swa(tmpdir, accelerator=accelerator, devices=1)


@pytest.mark.parametrize("batchnorm", (True, False))
@pytest.mark.parametrize("iterable_dataset", (True, False))
def test_swa_callback(tmpdir, batchnorm: bool, iterable_dataset: bool):
    train_with_swa(tmpdir, batchnorm=batchnorm, iterable_dataset=iterable_dataset)


@pytest.mark.parametrize("interval", ("epoch", "step"))
def test_swa_callback_scheduler_step(tmpdir, interval: str):
    train_with_swa(tmpdir, interval=interval)


def test_swa_warns(tmpdir, caplog):
    model = SwaTestModel(interval="step")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, callbacks=StochasticWeightAveraging(swa_lrs=1e-2))
    with caplog.at_level(level=logging.INFO), pytest.warns(UserWarning, match="SWA is currently only supported"):
        trainer.fit(model)
    assert "Swapping scheduler `StepLR` for `SWALR`" in caplog.text


def test_swa_raises():
    with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"):
        StochasticWeightAveraging(swa_epoch_start=0, swa_lrs=0.1)
    with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"):
        StochasticWeightAveraging(swa_epoch_start=1.5, swa_lrs=0.1)
    with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"):
        StochasticWeightAveraging(swa_epoch_start=-1, swa_lrs=0.1)
    with pytest.raises(MisconfigurationException, match="positive float, or a list of positive floats"):
        StochasticWeightAveraging(swa_epoch_start=5, swa_lrs=[0.2, 1])
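# A note on the accepted values (matching the error messages asserted above):
# ``swa_epoch_start`` takes either a positive int (the epoch SWA starts from)
# or a float in (0, 1) interpreted as a fraction of ``max_epochs``, e.g.
#
#     StochasticWeightAveraging(swa_epoch_start=0.8, swa_lrs=0.05)  # start SWA at 80% of training

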
def test_swa_deepcopy(tmpdir):
    """Test that the SWA callback doesn't deepcopy the dataloaders and datamodule, which could lead to OOM."""

    class TestSWA(StochasticWeightAveraging):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.setup_called = False

        def setup(self, trainer, pl_module, stage) -> None:
            super().setup(trainer, pl_module, stage)
            assert self._average_model.train_dataloader is not pl_module.train_dataloader
            assert self._average_model.train_dataloader.__self__ == self._average_model
            assert self._average_model.trainer is None
            self.setup_called = True

    model = BoringModel()
    swa = TestSWA(swa_lrs=1e-2)
    trainer = Trainer(default_root_dir=tmpdir, callbacks=swa, fast_dev_run=True)
    trainer.fit(model, train_dataloaders=DataLoader(RandomDataset(32, 2)))
    assert swa.setup_called


def test_swa_multiple_lrs(tmpdir):
    swa_lrs = [0.123, 0.321]

    class TestModel(BoringModel):
        def __init__(self):
            # skip BoringModel.__init__ so this model can define its own layers
            super(BoringModel, self).__init__()
            self.layer1 = torch.nn.Linear(32, 32)
            self.layer2 = torch.nn.Linear(32, 2)

        def forward(self, x):
            x = self.layer1(x)
            x = self.layer2(x)
            return x

        def configure_optimizers(self):
            params = [{"params": self.layer1.parameters(), "lr": 0.1}, {"params": self.layer2.parameters(), "lr": 0.2}]
            return torch.optim.Adam(params)

        def on_train_epoch_start(self):
            # `trainer` is captured from the enclosing test scope
            optimizer = trainer.optimizers[0]
            assert [pg["lr"] for pg in optimizer.param_groups] == [0.1, 0.2]
            assert [pg["initial_lr"] for pg in optimizer.param_groups] == swa_lrs
            assert [pg["swa_lr"] for pg in optimizer.param_groups] == swa_lrs
            self.on_train_epoch_start_called = True

    model = TestModel()
    swa_callback = StochasticWeightAveraging(swa_lrs=swa_lrs)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=swa_callback,
        fast_dev_run=1,
    )
    trainer.fit(model)
    assert model.on_train_epoch_start_called
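

# An aside on the per-group ``swa_lrs`` exercised above: torch's ``SWALR`` itself
# accepts one ``swa_lr`` per parameter group, which is what a list of ``swa_lrs``
# maps onto. Illustrative sketch (not executed here; layer names are placeholders):
#
#     optimizer = torch.optim.Adam(
#         [{"params": layer1.parameters(), "lr": 0.1}, {"params": layer2.parameters(), "lr": 0.2}]
#     )
#     swa_scheduler = SWALR(optimizer, swa_lr=[0.123, 0.321])  # one target LR per group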