# lightning/tests/plugins/test_amp_plugin.py

import os
from unittest import mock

import pytest
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.plugins.native_amp import NativeAMPPlugin
from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE
from tests.base.boring_model import BoringModel


@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
@mock.patch.dict(os.environ, {
    "CUDA_VISIBLE_DEVICES": "0,1",
    "SLURM_NTASKS": "2",
    "SLURM_JOB_NAME": "SOME_NAME",
    "SLURM_NODEID": "0",
    "LOCAL_RANK": "0",
    "SLURM_LOCALID": "0"
})
@mock.patch('torch.cuda.device_count', return_value=2)
@pytest.mark.parametrize(
    ['ddp_backend', 'gpus', 'num_processes'],
    [
        ('ddp_cpu', None, None),
        ('ddp', 2, 0),
        ('ddp2', 2, 0),
        ('ddp_spawn', 2, 0),
    ],
)
def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    """Check that ``precision=16`` with ``amp_backend='native'`` selects ``NativeAMPPlugin``.

    The check runs once per parametrized ``(ddp_backend, gpus, num_processes)``
    combination, with a faked SLURM/CUDA environment and
    ``torch.cuda.device_count`` patched to return 2.

    NOTE(review): ``mock.patch`` is used above without an explicit replacement
    object, so it passes the created mock to this function as an extra
    positional argument. That mock presumably lands in the first parameter
    (``tmpdir``), meaning it is likely NOT pytest's ``tmpdir`` fixture here --
    confirm before using it as a path.
    """

    class AssertBackendThenExit(Callback):
        """Inspect the chosen precision backend at fit start, then abort."""

        def on_fit_start(self, trainer, pl_module):
            backend = trainer.precision_connector.backend
            assert isinstance(backend, NativeAMPPlugin)
            # Abort via SystemExit once the backend has been checked; the
            # test expects this exception from ``fit``.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        precision=16,
        amp_backend='native',
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        callbacks=[AssertBackendThenExit()],
    )
    amp_trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        amp_trainer.fit(boring_model)


@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
@mock.patch.dict(os.environ, {
    "CUDA_VISIBLE_DEVICES": "0,1",
    "SLURM_NTASKS": "2",
    "SLURM_JOB_NAME": "SOME_NAME",
    "SLURM_NODEID": "0",
    "LOCAL_RANK": "0",
    "SLURM_LOCALID": "0"
})
@mock.patch('torch.cuda.device_count', return_value=2)
@pytest.mark.parametrize(
    ['ddp_backend', 'gpus', 'num_processes'],
    [
        ('ddp_cpu', None, None),
        ('ddp', 2, 0),
        ('ddp2', 2, 0),
        ('ddp_spawn', 2, 0),
    ],
)
def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    """Check that a user-supplied ``NativeAMPPlugin`` subclass becomes the precision backend.

    A ``MyNativeAMP`` instance is passed through ``plugins=[...]``; at fit
    start the trainer's ``precision_connector.backend`` must be an instance of
    that subclass. The check runs once per parametrized
    ``(ddp_backend, gpus, num_processes)`` combination.

    NOTE(review): ``mock.patch`` is used above without an explicit replacement
    object, so it passes the created mock to this function as an extra
    positional argument. That mock presumably lands in the first parameter
    (``tmpdir``), meaning it is likely NOT pytest's ``tmpdir`` fixture here --
    confirm before using it as a path.
    """

    class MyNativeAMP(NativeAMPPlugin):
        """Trivial subclass, distinguishable from the stock plugin by type."""

    class AssertBackendThenExit(Callback):
        """Inspect the chosen precision backend at fit start, then abort."""

        def on_fit_start(self, trainer, pl_module):
            backend = trainer.precision_connector.backend
            assert isinstance(backend, MyNativeAMP)
            # Abort via SystemExit once the backend has been checked; the
            # test expects this exception from ``fit``.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        precision=16,
        amp_backend='native',
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        plugins=[MyNativeAMP()],
        callbacks=[AssertBackendThenExit()],
    )
    amp_trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        amp_trainer.fit(boring_model)


class GradientUnscaleBoringModel(BoringModel):
    """``BoringModel`` that bounds the gradient norm observed after backward.

    NOTE(review): judging by the class name, the bound is meant to catch
    gradients that are still multiplied by the AMP loss scale -- confirm.
    """

    def on_after_backward(self):
        """Clip gradients to max-norm 2 and assert the returned norm is below 15."""
        grad_norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
        # A non-finite norm is skipped rather than treated as a failure.
        if torch.isinf(grad_norm) or torch.isnan(grad_norm):
            return
        assert grad_norm.item() < 15.


@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_amp_gradient_unscale(tmpdir):
    """Fit ``GradientUnscaleBoringModel`` with native AMP on 2 GPUs via ``ddp_spawn``.

    The test has no assertions of its own: it passes when ``fit`` completes,
    i.e. when the gradient-norm assertion inside the model's
    ``on_after_backward`` hook never fails.

    Args:
        tmpdir: pytest's per-test temporary directory, used as the trainer's
            root directory.
    """
    model = GradientUnscaleBoringModel()

    trainer = Trainer(
        max_epochs=2,
        # Keep run artifacts in the per-test temp dir instead of polluting
        # the current working directory.
        default_root_dir=str(tmpdir),
        limit_train_batches=2,
        limit_test_batches=2,
        limit_val_batches=2,
        amp_backend='native',
        distributed_backend='ddp_spawn',
        gpus=2,
        precision=16,
        track_grad_norm=2,
        log_every_n_steps=1
    )
    trainer.fit(model)


class UnscaleAccumulateGradBatchesBoringModel(BoringModel):
    """``BoringModel`` that bounds the gradient norm observed after backward.

    Used by the gradient-accumulation variant of the AMP unscale test.

    NOTE(review): judging by the class name, the bound is meant to catch
    gradients that are still multiplied by the AMP loss scale -- confirm.
    """

    def on_after_backward(self):
        """Clip gradients to max-norm 2 and assert the returned norm is below 15."""
        grad_norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
        # A non-finite norm is skipped rather than treated as a failure.
        if torch.isinf(grad_norm) or torch.isnan(grad_norm):
            return
        assert grad_norm.item() < 15.


@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir):
    """Fit ``UnscaleAccumulateGradBatchesBoringModel`` with native AMP and gradient accumulation.

    Uses the same trainer configuration as ``test_amp_gradient_unscale`` plus
    ``accumulate_grad_batches=2``. The test has no assertions of its own: it
    passes when ``fit`` completes, i.e. when the gradient-norm assertion
    inside the model's ``on_after_backward`` hook never fails.

    Args:
        tmpdir: pytest's per-test temporary directory, used as the trainer's
            root directory.
    """
    model = UnscaleAccumulateGradBatchesBoringModel()

    trainer = Trainer(
        max_epochs=2,
        # Keep run artifacts in the per-test temp dir instead of polluting
        # the current working directory.
        default_root_dir=str(tmpdir),
        limit_train_batches=2,
        limit_test_batches=2,
        limit_val_batches=2,
        amp_backend='native',
        distributed_backend='ddp_spawn',
        gpus=2,
        precision=16,
        track_grad_norm=2,
        log_every_n_steps=1,
        accumulate_grad_batches=2,
    )
    trainer.fit(model)
Enable custom apex and amp plugins (#4355) * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin 2020-10-25 21:11:07 +00:00			`import os`
			`from unittest import mock`
Simplify optimization Logic (#4984) * Rely on ddp plugin for blocking sync behaviour, and skip if we're using manual optimization * debug * Revert "debug" This reverts commit ccca6b6b * Expose manual reduce for automatic optimization * Add input arguments * Enable parity test * clean imports * Expose hook after to ensure we reset * Fix naming * add * fix test * uniformize optimizer logic * resolve test * resovle flake8 * resolve amp bug * update tests * remove bug * remove optimizer_step in accelerators * typo * update lightning optimizer * set doesn't work with ddp_spawn * resolve flake8 * update threshold * ignore pyright * correct codeFactor * remove useless if * remove zer_grad function * simplify step * remove typo * resolve bug * Apply suggestions from code review * update on comments * resolve bugs * remove tests * Update pytorch_lightning/trainer/configuration_validator.py Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * simplify testing * add more tests Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-12-07 12:55:49 +00:00
			`import pytest`
Enable custom apex and amp plugins (#4355) * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin 2020-10-25 21:11:07 +00:00			`import torch`

Simplify optimization Logic (#4984) * Rely on ddp plugin for blocking sync behaviour, and skip if we're using manual optimization * debug * Revert "debug" This reverts commit ccca6b6b * Expose manual reduce for automatic optimization * Add input arguments * Enable parity test * clean imports * Expose hook after to ensure we reset * Fix naming * add * fix test * uniformize optimizer logic * resolve test * resovle flake8 * resolve amp bug * update tests * remove bug * remove optimizer_step in accelerators * typo * update lightning optimizer * set doesn't work with ddp_spawn * resolve flake8 * update threshold * ignore pyright * correct codeFactor * remove useless if * remove zer_grad function * simplify step * remove typo * resolve bug * Apply suggestions from code review * update on comments * resolve bugs * remove tests * Update pytorch_lightning/trainer/configuration_validator.py Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * simplify testing * add more tests Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> 2020-12-07 12:55:49 +00:00			`from pytorch_lightning import Trainer`
			`from pytorch_lightning.callbacks import Callback`
			`from pytorch_lightning.plugins.native_amp import NativeAMPPlugin`
			`from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE`
			`from tests.base.boring_model import BoringModel`

Enable custom apex and amp plugins (#4355) * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin 2020-10-25 21:11:07 +00:00
fix import and typo in AMP (#4871) * fix import and typo * docs * apex * fix * typo 2020-11-26 22:45:52 +00:00			`@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")`
Enable custom apex and amp plugins (#4355) * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin 2020-10-25 21:11:07 +00:00			`@mock.patch.dict(os.environ, {`
			`"CUDA_VISIBLE_DEVICES": "0,1",`
			`"SLURM_NTASKS": "2",`
			`"SLURM_JOB_NAME": "SOME_NAME",`
			`"SLURM_NODEID": "0",`
			`"LOCAL_RANK": "0",`
			`"SLURM_LOCALID": "0"`
			`})`
			`@mock.patch('torch.cuda.device_count', return_value=2)`
			`@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'],`
			`[('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)])`
			`def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):`

			`class CB(Callback):`
			`def on_fit_start(self, trainer, pl_module):`
			`assert isinstance(trainer.precision_connector.backend, NativeAMPPlugin)`
			`raise SystemExit()`

			`model = BoringModel()`
			`trainer = Trainer(`
			`fast_dev_run=True,`
			`precision=16,`
			`amp_backend='native',`
			`gpus=gpus,`
			`num_processes=num_processes,`
			`distributed_backend=ddp_backend,`
			`callbacks=[CB()]`
			`)`

			`with pytest.raises(SystemExit):`
			`trainer.fit(model)`


fix import and typo in AMP (#4871) * fix import and typo * docs * apex * fix * typo 2020-11-26 22:45:52 +00:00			`@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")`
Enable custom apex and amp plugins (#4355) * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin * enable custom apex, amp plugin 2020-10-25 21:11:07 +00:00			`@mock.patch.dict(os.environ, {`
			`"CUDA_VISIBLE_DEVICES": "0,1",`
			`"SLURM_NTASKS": "2",`
			`"SLURM_JOB_NAME": "SOME_NAME",`
			`"SLURM_NODEID": "0",`
			`"LOCAL_RANK": "0",`
			`"SLURM_LOCALID": "0"`
			`})`
			`@mock.patch('torch.cuda.device_count', return_value=2)`
			`@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'],`
			`[('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)])`
			`def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):`
			`class MyNativeAMP(NativeAMPPlugin):`
			`pass`

			`class CB(Callback):`
			`def on_fit_start(self, trainer, pl_module):`
			`assert isinstance(trainer.precision_connector.backend, MyNativeAMP)`
			`raise SystemExit()`

			`model = BoringModel()`
			`trainer = Trainer(`
			`fast_dev_run=True,`
			`precision=16,`
			`amp_backend='native',`
			`gpus=gpus,`
			`num_processes=num_processes,`
			`distributed_backend=ddp_backend,`
			`plugins=[MyNativeAMP()],`
			`callbacks=[CB()]`
			`)`

			`with pytest.raises(SystemExit):`
			`trainer.fit(model)`
[BUGFIX] AMP + Precision unscale grad (#4441) * move unscale within Native plugin * remove gradient tracking from lightning backward * forgot trainer.fit * typo * update * cleanup * set to 1.6 * typo * skip if below 1.6 strict * update changelog * remove useless code * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * update changelog * Update CHANGELOG.md Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Jeff Yang <ydcjeff@outlook.com> 2020-11-02 16:36:48 +00:00

Switch to PyTorch 1.6 in Drone CI (#4393) * switch to 1.6 * readme * 1.7 * back to normal [ci skip] * horovodrun --verbose * try with apex * add apex test * change base * description * test with 1.7 * back to 1.6 * no gradient_clip_val * re-add gradient_clip_val * no amp * temp skip torch.cuda.amp + horovod test * Apply suggestion from code review Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Fix formatting * ddp * Moved extended model outside of function to prevent pickling issue for drone * typo * resolve bug * extract automatic_automization Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: chaton <thomas@grid.ai> 2020-11-03 18:01:51 +00:00			`class GradientUnscaleBoringModel(BoringModel):`
			`def on_after_backward(self):`
			`norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)`
			`if not (torch.isinf(norm) or torch.isnan(norm)):`
			`assert norm.item() < 15.`


fix import and typo in AMP (#4871) * fix import and typo * docs * apex * fix * typo 2020-11-26 22:45:52 +00:00			`@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")`
[BUGFIX] AMP + Precision unscale grad (#4441) * move unscale within Native plugin * remove gradient tracking from lightning backward * forgot trainer.fit * typo * update * cleanup * set to 1.6 * typo * skip if below 1.6 strict * update changelog * remove useless code * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * update changelog * Update CHANGELOG.md Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Jeff Yang <ydcjeff@outlook.com> 2020-11-02 16:36:48 +00:00			`@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")`
			`def test_amp_gradient_unscale(tmpdir):`
Switch to PyTorch 1.6 in Drone CI (#4393) * switch to 1.6 * readme * 1.7 * back to normal [ci skip] * horovodrun --verbose * try with apex * add apex test * change base * description * test with 1.7 * back to 1.6 * no gradient_clip_val * re-add gradient_clip_val * no amp * temp skip torch.cuda.amp + horovod test * Apply suggestion from code review Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Fix formatting * ddp * Moved extended model outside of function to prevent pickling issue for drone * typo * resolve bug * extract automatic_automization Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: chaton <thomas@grid.ai> 2020-11-03 18:01:51 +00:00			`model = GradientUnscaleBoringModel()`
[BUGFIX] AMP + Precision unscale grad (#4441) * move unscale within Native plugin * remove gradient tracking from lightning backward * forgot trainer.fit * typo * update * cleanup * set to 1.6 * typo * skip if below 1.6 strict * update changelog * remove useless code * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * update changelog * Update CHANGELOG.md Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Jeff Yang <ydcjeff@outlook.com> 2020-11-02 16:36:48 +00:00
			`trainer = Trainer(`
			`max_epochs=2,`
			`default_root_dir=os.getcwd(),`
			`limit_train_batches=2,`
			`limit_test_batches=2,`
			`limit_val_batches=2,`
			`amp_backend='native',`
			`distributed_backend='ddp_spawn',`
			`gpus=2,`
			`precision=16,`
			`track_grad_norm=2,`
			`log_every_n_steps=1`
			`)`
			`trainer.fit(model)`


Switch to PyTorch 1.6 in Drone CI (#4393) * switch to 1.6 * readme * 1.7 * back to normal [ci skip] * horovodrun --verbose * try with apex * add apex test * change base * description * test with 1.7 * back to 1.6 * no gradient_clip_val * re-add gradient_clip_val * no amp * temp skip torch.cuda.amp + horovod test * Apply suggestion from code review Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Fix formatting * ddp * Moved extended model outside of function to prevent pickling issue for drone * typo * resolve bug * extract automatic_automization Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: chaton <thomas@grid.ai> 2020-11-03 18:01:51 +00:00			`class UnscaleAccumulateGradBatchesBoringModel(BoringModel):`

			`def on_after_backward(self):`
			`norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)`
			`if not (torch.isinf(norm) or torch.isnan(norm)):`
			`assert norm.item() < 15.`


fix import and typo in AMP (#4871) * fix import and typo * docs * apex * fix * typo 2020-11-26 22:45:52 +00:00			`@pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")`
[BUGFIX] AMP + Precision unscale grad (#4441) * move unscale within Native plugin * remove gradient tracking from lightning backward * forgot trainer.fit * typo * update * cleanup * set to 1.6 * typo * skip if below 1.6 strict * update changelog * remove useless code * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * update changelog * Update CHANGELOG.md Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Jeff Yang <ydcjeff@outlook.com> 2020-11-02 16:36:48 +00:00			`@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")`
			`def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir):`
Switch to PyTorch 1.6 in Drone CI (#4393) * switch to 1.6 * readme * 1.7 * back to normal [ci skip] * horovodrun --verbose * try with apex * add apex test * change base * description * test with 1.7 * back to 1.6 * no gradient_clip_val * re-add gradient_clip_val * no amp * temp skip torch.cuda.amp + horovod test * Apply suggestion from code review Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Fix formatting * ddp * Moved extended model outside of function to prevent pickling issue for drone * typo * resolve bug * extract automatic_automization Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: chaton <thomas@grid.ai> 2020-11-03 18:01:51 +00:00			`model = UnscaleAccumulateGradBatchesBoringModel()`
[BUGFIX] AMP + Precision unscale grad (#4441) * move unscale within Native plugin * remove gradient tracking from lightning backward * forgot trainer.fit * typo * update * cleanup * set to 1.6 * typo * skip if below 1.6 strict * update changelog * remove useless code * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * update changelog * Update CHANGELOG.md Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Jeff Yang <ydcjeff@outlook.com> 2020-11-02 16:36:48 +00:00
			`trainer = Trainer(`
			`max_epochs=2,`
			`default_root_dir=os.getcwd(),`
			`limit_train_batches=2,`
			`limit_test_batches=2,`
			`limit_val_batches=2,`
			`amp_backend='native',`
			`distributed_backend='ddp_spawn',`
			`gpus=2,`
			`precision=16,`
			`track_grad_norm=2,`
			`log_every_n_steps=1,`
			`accumulate_grad_batches=2,`
			`)`
			`trainer.fit(model)`