# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from unittest import mock

import pytest
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
from tests_pytorch.helpers.runif import RunIf

if _TORCH_GREATER_EQUAL_1_12:
    torch_test_assert_close = torch.testing.assert_close
else:
    torch_test_assert_close = torch.testing.assert_allclose


class MyNativeAMP(NativeMixedPrecisionPlugin):
    pass


class MyApexPlugin(ApexMixedPrecisionPlugin):
    pass


@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0,1",
        "SLURM_NTASKS": "2",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_NODEID": "0",
        "LOCAL_RANK": "0",
        "SLURM_PROCID": "0",
        "SLURM_LOCALID": "0",
    },
)
@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
@pytest.mark.parametrize("strategy,devices", [("ddp", 2), ("ddp_spawn", 2)])
@pytest.mark.parametrize(
    "amp,custom_plugin,plugin_cls",
    [
        ("native", False, NativeMixedPrecisionPlugin),
        ("native", True, MyNativeAMP),
        pytest.param("apex", False, ApexMixedPrecisionPlugin, marks=RunIf(amp_apex=True)),
        pytest.param("apex", True, MyApexPlugin, marks=RunIf(amp_apex=True)),
    ],
)
def test_amp_apex_ddp(mocked_device_count, mocked_is_available, strategy, devices, amp, custom_plugin, plugin_cls):
    plugin = None
    if custom_plugin:
        plugin = plugin_cls(16, "cpu") if amp == "native" else plugin_cls()
    trainer = Trainer(
        fast_dev_run=True,
        precision=16,
        amp_backend=amp,
        accelerator="gpu",
        devices=devices,
        strategy=strategy,
        plugins=plugin,
    )
    assert isinstance(trainer.precision_plugin, plugin_cls)


class TestClippingOptimizer(torch.optim.SGD):
    def step(self, *args, pl_module=None):
        pl_module.check_grads_clipped()
        return super().step(*args)


class TestPrecisionModel(BoringModel):
    # sister test: tests/trainer/optimization/test_manual_optimization.py::test_multiple_optimizers_step
    def on_after_backward(self) -> None:
        # check grads are scaled
        scale = self.trainer.precision_plugin.scaler.get_scale()
        assert scale != 1.0  # the return value if not enabled
        grads = [p.grad for p in self.parameters()]
        inv_scale = 1 / scale
        self.original_grads = [p * inv_scale for p in grads]

    def check_grads_unscaled(self, optimizer=None):
        if optimizer is not None:
            scaler = self.trainer.precision_plugin.scaler
            state = scaler._per_optimizer_states[id(optimizer)]
            assert state["stage"].name == "UNSCALED"

        grads = [p.grad for p in self.parameters()]
        assert len(grads) == len(self.original_grads)
        for actual, expected in zip(grads, self.original_grads):
            torch_test_assert_close(actual, expected, equal_nan=True)

    def check_grads_clipped(self):
        parameters = list(self.parameters())
        assert len(parameters) == len(self.clipped_parameters)
        for actual, expected in zip(parameters, self.clipped_parameters):
            torch_test_assert_close(actual.grad, expected.grad, equal_nan=True)

    def on_before_optimizer_step(self, optimizer, *_):
        self.check_grads_unscaled(optimizer)
        # manually clip
        self.clipped_parameters = []
        for p in self.parameters():
            copy = p.detach().clone()
            copy.grad = p.grad.clone()
            self.clipped_parameters.append(copy)
        clip_val = self.trainer.gradient_clip_val
        torch.nn.utils.clip_grad_value_(self.clipped_parameters, clip_val)

    def log_grad_norm(self, grad_norm_dict):
        self.check_grads_unscaled()
        assert len(grad_norm_dict)

    def configure_gradient_clipping(self, *args, **kwargs):
        # let lightning clip
        super().configure_gradient_clipping(*args, **kwargs)
        # check clipping worked as expected
        self.check_grads_clipped()

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, closure, **_):
        # pass self as a kwarg
        optimizer.step(closure, pl_module=self)

    def configure_optimizers(self):
        return TestClippingOptimizer(self.layer.parameters(), lr=0.1)


@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize("accum", [1, 2])
def test_amp_gradient_unscale(tmpdir, accum: int):
    model = TestPrecisionModel()

    trainer = Trainer(
        max_epochs=2,
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=0,
        amp_backend="native",
        strategy="ddp_spawn",
        accelerator="gpu",
        devices=2,
        precision=16,
        track_grad_norm=2,
        # use a tiny value to make sure it works
        gradient_clip_val=1e-3,
        gradient_clip_algorithm="value",
        log_every_n_steps=1,
        accumulate_grad_batches=accum,
        enable_progress_bar=False,
    )
    trainer.fit(model)


@RunIf(min_cuda_gpus=1)
def test_amp_skip_optimizer(tmpdir):
    """Test that optimizers can be skipped when using amp."""

    class CustomBoringModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Linear(32, 32)
            self.layer2 = torch.nn.Linear(32, 2)

        def forward(self, x: torch.Tensor):
            x = self.layer1(x)
            x = self.layer2(x)
            return x

        def training_step(self, batch, batch_idx, optimizer_idx):
            if optimizer_idx == 1:
                return None
            output = self(batch)
            return self.loss(batch, output)

        def configure_optimizers(self):
            return [
                torch.optim.SGD(self.layer1.parameters(), lr=0.1),
                torch.optim.SGD(self.layer2.parameters(), lr=0.1),
            ]

    trainer = Trainer(
        default_root_dir=tmpdir, accelerator="gpu", devices=1, fast_dev_run=1, amp_backend="native", precision=16
    )
    model = CustomBoringModel()
    trainer.fit(model)


@RunIf(min_cuda_gpus=2, amp_apex=True, standalone=True)
@pytest.mark.parametrize("amp_level", ["O2"])
def test_amp_apex_ddp_fit(amp_level, tmpdir):
    class CustomBoringModel(BoringModel):
        def training_step(self, batch, batch_idx):
            assert self.layer.weight.dtype == torch.float16
            assert self.trainer.precision_plugin._connected
            return super().training_step(batch, batch_idx)

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        precision=16,
        amp_backend="apex",
        accelerator="gpu",
        devices=2,
        strategy="ddp",
        plugins=ApexMixedPrecisionPlugin(amp_level=amp_level),
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)
    model = CustomBoringModel()
    trainer.fit(model)
    trainer.test(model)


@RunIf(min_cuda_gpus=2, amp_apex=True)
@pytest.mark.parametrize("amp_level", ["O2"])
def test_amp_apex_ddp_spawn_fit(amp_level, tmpdir):
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        precision=16,
        amp_backend="apex",
        accelerator="gpu",
        devices=2,
        strategy="ddp_spawn",
        plugins=ApexMixedPrecisionPlugin(amp_level=amp_level),
    )
    assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)
    model = BoringModel()
    trainer.fit(model)


@RunIf(min_torch="1.10")
def test_cpu_amp_precision_context_manager(tmpdir):
    """Test to ensure that the context manager correctly is set to CPU + bfloat16."""
    plugin = NativeMixedPrecisionPlugin("bf16", "cpu")
    assert plugin.device == "cpu"
    assert plugin.scaler is None
    context_manager = plugin.autocast_context_manager()
    assert isinstance(context_manager, torch.autocast)
    # check with str due to a bug upstream: https://github.com/pytorch/pytorch/issues/65786
    assert str(context_manager.fast_dtype) == str(torch.bfloat16)


def test_precision_selection_raises(monkeypatch):
    with pytest.raises(
        MisconfigurationException, match=r"precision=16, amp_type='apex'\)` but apex AMP not supported on CPU"
    ):
        Trainer(amp_backend="apex", precision=16)

    import pytorch_lightning.plugins.precision.native_amp as amp

    monkeypatch.setattr(amp, "_TORCH_GREATER_EQUAL_1_10", False)
    with pytest.warns(
        UserWarning, match=r"precision=16\)` but native AMP is not supported on CPU. Using `precision='bf16"
    ), pytest.raises(MisconfigurationException, match="must install torch greater or equal to 1.10"):
        Trainer(precision=16)

    with pytest.raises(MisconfigurationException, match="must install torch greater or equal to 1.10"):
        Trainer(precision="bf16")

    with pytest.raises(MisconfigurationException, match=r"amp_type='apex', precision='bf16'\)` but it's not supported"):
        Trainer(amp_backend="apex", precision="bf16")

    with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1), pytest.raises(
        MisconfigurationException, match="Sharded plugins are not supported with apex"
    ):
        with mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True):
            Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1, strategy="ddp_fully_sharded")

    import pytorch_lightning.plugins.precision.apex_amp as apex

    monkeypatch.setattr(apex, "_APEX_AVAILABLE", False)
    with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1), mock.patch(
        "lightning_lite.utilities.device_parser.is_cuda_available", return_value=True
    ), pytest.raises(MisconfigurationException, match="asked for Apex AMP but `apex` is not installed"):
        Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1)