# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
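
"""The training loop.

``TrainLoop`` drives epoch and batch iteration for the ``Trainer``: it routes
``training_step`` outputs, steps the optimizers, and calls the training hooks.
"""
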
from contextlib import contextmanager, suppress
from copy import copy, deepcopy

import numpy as np
import torch

from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.core.optimizer import LightningOptimizer
from pytorch_lightning.core.step_result import Result
from pytorch_lightning.plugins import ParallelPlugin
from pytorch_lightning.trainer.states import RunningStage, TrainerState
from pytorch_lightning.trainer.supporters import Accumulator, TensorRunningAccum
from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType, parsing
from pytorch_lightning.utilities.distributed import rank_zero_info
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.memory import recursive_detach
from pytorch_lightning.utilities.model_helpers import is_overridden
from pytorch_lightning.utilities.parsing import AttributeDict
from pytorch_lightning.utilities.warnings import WarningCache

class TrainLoop:
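    """Runs the fit loop for a ``Trainer``: epoch/batch bookkeeping, ``training_step``
    output handling, optimizer stepping, and the training-related hooks."""
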
    def __init__(self, trainer, multiple_trainloader_mode):
        self.trainer = trainer
        self.early_stopping_accumulator = None
        self.checkpoint_accumulator = None
        self.accumulated_loss = None
        self.warning_cache = WarningCache()
        self._teardown_already_run = False
        self.running_loss = TensorRunningAccum(window_length=20)
        self.automatic_optimization = True
        self._curr_step_result = None
        self._cur_grad_norm_dict = None
        self._multiple_trainloader_mode = multiple_trainloader_mode
        self._skip_backward = False
        self.trainer._multiple_trainloader_mode = multiple_trainloader_mode

    def on_trainer_init(
        self,
        max_epochs,
        min_epochs,
        max_steps,
        min_steps,
        num_sanity_val_steps,
        automatic_optimization,
        weights_summary,
    ):
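        """Initialize the training-loop state on the trainer from the ``Trainer`` constructor arguments."""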
        self.trainer.global_step = 0
        self.trainer.current_epoch = 0
        self.trainer.interrupted = False
        self.trainer.should_stop = False
        self.trainer._state = TrainerState.INITIALIZING

        self.trainer.total_batch_idx = 0
        self.trainer.batch_idx = 0
        self.trainer.num_training_batches = 0
        self.trainer.train_dataloader = None
        self.automatic_optimization = automatic_optimization

        # If neither max_epochs nor max_steps is set, then use existing default of max_epochs = 1000
        self.trainer.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs
        # If neither min_epochs nor min_steps is set, then use existing default of min_epochs = 1
        self.trainer.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs
        self.trainer.max_steps = max_steps
        self.trainer.min_steps = min_steps

        if num_sanity_val_steps == -1:
            self.trainer.num_sanity_val_steps = float("inf")
        else:
            self.trainer.num_sanity_val_steps = num_sanity_val_steps

        self.trainer.weights_summary = weights_summary
        if weights_summary is not None and weights_summary not in ModelSummary.MODES:
            raise MisconfigurationException(
                f"`weights_summary` can be None, {', '.join(ModelSummary.MODES)}, got {weights_summary}"
            )

    @property
    def num_optimizers(self):
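        """Number of optimizers to step for the current batch."""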
        num_optimizers = len(self.get_optimizers_iterable())
        return num_optimizers

    def should_skip_training(self):
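        """Return True if training can be skipped (max steps/epochs already reached or no training batches)."""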
        should_by_max_steps = self.trainer.max_steps is not None and self.trainer.global_step >= self.trainer.max_steps
        should_by_epoch = self.trainer.max_epochs is not None and self.trainer.current_epoch >= self.trainer.max_epochs
        return should_by_max_steps or should_by_epoch or self.trainer.num_training_batches == 0

    def on_train_start(self):
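        """Run the ``on_train_start`` hook and notify the profiler connector that training has started."""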
        # hook
        self.trainer.call_hook("on_train_start")

        # provide rank to profiler
        self.trainer.profile_connector.on_train_start(self.trainer)

    def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule):
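        """Prepare for fitting: clean hparams, attach data and model logging functions, and validate the loop configuration."""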
        # clean hparams
        if hasattr(model, "hparams"):
            parsing.clean_namespace(model.hparams)

        # links data to the trainer
        self.trainer.data_connector.attach_data(model, train_dataloader, val_dataloaders, datamodule)

        # check that model is configured correctly
        self.trainer.config_validator.verify_loop_configurations(model)

        # attach model log function to callback
        self.trainer.callback_connector.attach_model_logging_functions(model)

    def on_train_end(self):
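        """Tear down at the end of training: final checkpoint check, ``on_train_end`` hook, and logger/profiler/accelerator cleanup."""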
        if self._teardown_already_run:
            return

        self._teardown_already_run = True

        # trigger checkpoint check. need to temporarily decrease the global step to avoid saving duplicates
        # when a checkpoint was saved at the last step
        self.trainer.global_step -= 1
        self.check_checkpoint_callback(should_update=True, is_last=True)
        self.trainer.global_step += 1

        # hook
        self.trainer.call_hook("on_train_end")
        # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers.
        # It might be related to xla tensors blocked when moving to the cpu
        # kill loggers
        if self.trainer.logger is not None:
            self.trainer.logger.finalize("success")

        # summarize profile results
        if self.trainer.global_rank == 0:
            self.trainer.profiler.describe()

        # give accelerators a chance to finish
        self.trainer.accelerator.on_train_end()

    def check_checkpoint_callback(self, should_update, is_last=False):
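        """If anything has been trained, run the checkpoint callbacks via their ``on_validation_end`` logic."""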
        # TODO bake this logic into the ModelCheckpoint callback
        if should_update and self.trainer.checkpoint_connector.has_trained:
            callbacks = self.trainer.checkpoint_callbacks

            if is_last and any(cb.save_last for cb in callbacks):
                rank_zero_info("Saving latest checkpoint...")

            model = self.trainer.get_model()

            for cb in callbacks:
                cb.on_validation_end(self.trainer, model)

    def check_early_stopping_callback(self, should_update):
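        """If anything has been trained, run the ``EarlyStopping`` callbacks via their ``on_validation_end`` logic."""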
        # TODO bake this logic into the EarlyStopping callback
        if should_update and self.trainer.checkpoint_connector.has_trained:
            callbacks = [c for c in self.trainer.callbacks if isinstance(c, EarlyStopping)]
            model = self.trainer.get_model()

            for cb in callbacks:
                cb.on_validation_end(self.trainer, model)

    def on_train_epoch_start(self, epoch):
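        """Set up the new epoch (dataloaders, samplers, accumulators) and run the epoch-start hooks."""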
        # update training progress in trainer
        self.trainer.current_epoch = epoch

        model = self.trainer.get_model()

        # reset train dataloader
        if epoch != 0 and self.trainer.reload_dataloaders_every_epoch:
            self.trainer.reset_train_dataloader(model)

        # todo: specify the possible exception
        with suppress(Exception):
            # set seed for distributed sampler (enables shuffling for each epoch)
            self.trainer.train_dataloader.sampler.set_epoch(epoch)

        # change the gradient accumulation according to the accumulation_scheduler
        self.trainer.accumulation_scheduler.on_epoch_start(self.trainer, self.trainer.get_model())

        # stores accumulated grad fractions per batch
        self.accumulated_loss = TensorRunningAccum(window_length=self.trainer.accumulate_grad_batches)

        # structured result accumulators for callbacks
        self.early_stopping_accumulator = Accumulator()
        self.checkpoint_accumulator = Accumulator()

        # hook
        self.trainer.call_hook("on_epoch_start")
        self.trainer.call_hook("on_train_epoch_start")

    def on_train_batch_end(self, epoch_output, batch_end_outputs, batch, batch_idx, dataloader_idx):
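        """Run the batch-end hooks, track outputs for the epoch end, and reset the batch logger internals."""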
        # hook
        self.trainer.call_hook('on_train_batch_end', batch_end_outputs, batch, batch_idx, dataloader_idx)
        self.trainer.call_hook('on_batch_end')

        # figure out what to track for epoch end
        self.track_epoch_end_reduce_metrics(epoch_output, batch_end_outputs)

        # reset batch logger internals
        self.trainer.logger_connector.on_train_batch_end()

    def reset_train_val_dataloaders(self, model):
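        """Attach the train and val dataloaders to the trainer, unless they are set to reload every epoch."""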
        if self.trainer.train_dataloader is None or not self.trainer.reload_dataloaders_every_epoch:
            self.trainer.reset_train_dataloader(model)

        if self.trainer.val_dataloaders is None and not self.trainer.reload_dataloaders_every_epoch:
            self.trainer.reset_val_dataloader(model)

    def track_epoch_end_reduce_metrics(self, epoch_output, batch_end_outputs):
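        """Append the batch outputs to ``epoch_output`` when they should be reduced at the end of the epoch."""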
        # track the outputs to reduce at the end of the epoch
        for opt_idx, opt_outputs in enumerate(batch_end_outputs):
            sample_output = opt_outputs[-1]

            # decide if we need to reduce at the end of the epoch automatically
            auto_reduce_tng_result = isinstance(sample_output, Result) and sample_output.should_reduce_on_epoch_end
            hook_overridden = (
                is_overridden("training_epoch_end", model=self.trainer.get_model())
                or is_overridden("on_train_epoch_end", model=self.trainer.get_model())
            )

            # only track when a) it needs to be autoreduced OR b) the user wants to manually reduce on epoch end
            if not (hook_overridden or auto_reduce_tng_result):
                continue

            # with 1 step (no tbptt) don't use a sequence at epoch end
            if isinstance(opt_outputs, list) and len(opt_outputs) == 1 and not isinstance(opt_outputs[0], Result):
                opt_outputs = opt_outputs[0]

            epoch_output[opt_idx].append(opt_outputs)

    def get_optimizers_iterable(self):
        """Generates an iterable with (idx, optimizer) for each optimizer."""
        if not self.trainer.optimizer_frequencies:
            # call training_step once per optimizer
            return list(enumerate(self.trainer.optimizers))

        optimizer_freq_cumsum = np.cumsum(self.trainer.optimizer_frequencies)
        optimizers_loop_length = optimizer_freq_cumsum[-1]
        current_place_in_loop = self.trainer.total_batch_idx % optimizers_loop_length

        # find the optimizer index by looking for the first {item > current_place} in the cumsum list
        opt_idx = np.argmax(optimizer_freq_cumsum > current_place_in_loop)
        return [[opt_idx, self.trainer.optimizers[opt_idx]]]

    def on_after_backward(self, training_step_output, batch_idx, untouched_loss):
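        """Detach the step output after backward and run the ``on_after_backward`` hook."""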
        is_result_obj = isinstance(training_step_output, Result)

        if is_result_obj:
            training_step_output.detach()
        else:
            training_step_output.batch_loss = training_step_output.batch_loss.detach()

        # insert after step hook
        self.trainer.call_hook("on_after_backward")

        # when in dev debugging track the losses
        self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach())

    def _check_training_step_output(self, training_step_output):
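        """In manual optimization, raise if ``training_step`` returned a Tensor that is detached from the graph."""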
        if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization:
            if training_step_output.grad_fn is None:
                # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ...
                raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor")

    def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
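        """Run a single training step through the accelerator and wrap the processed output in an ``AttributeDict``."""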
        # give the PL module a result for logging
        model_ref = self.trainer.get_model()

        with self.trainer.profiler.profile("model_forward"):
            args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)

            # manually capture logged metrics
            model_ref._current_fx_name = 'training_step'
            model_ref._results = Result()
            with self.trainer.profiler.profile("training_step"):
                training_step_output = self.trainer.accelerator_backend.training_step(args)
                self.trainer.accelerator_backend.post_training_step()

            self.trainer.logger_connector.cache_logged_metrics()

            self._check_training_step_output(training_step_output)

            training_step_output = self.trainer.call_hook("training_step_end", training_step_output)

            training_step_output_for_epoch_end, training_step_output = self._process_training_step_output(
                training_step_output, split_batch
            )
            is_result_obj = isinstance(training_step_output, Result)

            if training_step_output_for_epoch_end is None:
                return None

        # enable empty loss when using manual opt
        closure_loss = None
        untouched_loss = None

        if self.trainer.train_loop.automatic_optimization:
            # accumulate loss
            # (if accumulate_grad_batches = 1 no effect)
            if is_result_obj:
                closure_loss = training_step_output.minimize
            else:
                closure_loss = training_step_output.batch_loss

            closure_loss = closure_loss / self.trainer.accumulate_grad_batches

            # the loss will get scaled for amp. avoid any modifications to it
            untouched_loss = closure_loss.detach().clone()

        # result
        result = AttributeDict(
            closure_loss=closure_loss,
            loss=untouched_loss,
            training_step_output=training_step_output,
            training_step_output_for_epoch_end=training_step_output_for_epoch_end,
            hiddens=training_step_output.hiddens,
        )
        return result

    def _process_training_step_output(self, training_step_output, split_batch):
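        """Split the raw ``training_step`` output into what is kept for the epoch end and what flows back into the loop."""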
        training_step_output_for_epoch_end = training_step_output

        # allow training_step to return None
        if training_step_output_for_epoch_end is None:
            return None, None

        # -----------------------------------------
        # process hybrid (1.0)
        # -----------------------------------------
        # no need for these checks in 1.0.0
        # TODO: remove checks in 1.0.0
        is_tensor = isinstance(training_step_output_for_epoch_end, torch.Tensor)
        is_1_0_output = is_tensor or ("log" not in training_step_output and "progress_bar" not in training_step_output)
        if is_1_0_output:
            return self._process_training_step_output_1_0(training_step_output, split_batch)

        # -----------------------------------------
        # process old dict (deprecate 1.0)
        # -----------------------------------------
        training_step_output = self.trainer.process_dict_result(training_step_output, train=True)

        training_step_output = AttributeDict(
            batch_loss=training_step_output[0],
            pbar_on_batch_end=training_step_output[1],
            log_metrics=training_step_output[2],
            callback_metrics=training_step_output[3],
            hiddens=training_step_output[4],
        )
        # if the user decides to finally reduce things in epoch_end, save raw output without graphs
        if isinstance(training_step_output_for_epoch_end, torch.Tensor):
            training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
        else:
            training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end)

        return training_step_output_for_epoch_end, training_step_output

    def _process_training_step_output_1_0(self, training_step_output, split_batch):
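        """Map a 1.0-style ``training_step`` return (dict or Tensor) onto the ``Result`` collected from the module."""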
        result = self.trainer.get_model()._results

        loss = None
        hiddens = None

        # handle dict return
        if isinstance(training_step_output, dict):
            loss = training_step_output.pop("loss", None)
            hiddens = training_step_output.pop("hiddens", None)
            result["extra"] = training_step_output

        # handle scalar return
        elif isinstance(training_step_output, torch.Tensor):
            loss = training_step_output
            result["extra"] = {}

        # map to results under the hood
        result.minimize = loss
        result.hiddens = hiddens

        # track batch for manual reduction with result
        result.track_batch_size(len(split_batch))

        # track metrics without grads for epoch reduction
        training_step_output_for_epoch_end = copy(result)
        training_step_output_for_epoch_end.detach()
        if self.trainer.move_metrics_to_cpu:
            training_step_output_for_epoch_end.cpu()

        # what flows back into the system
        training_step_output = result

        return training_step_output_for_epoch_end, training_step_output

    def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure):
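        """Call ``LightningModule.optimizer_step`` with the closure that runs the training step and backward pass."""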
        model_ref = self.trainer.get_model()

        is_lbfgs = isinstance(optimizer, torch.optim.LBFGS)
        using_native_amp = self.trainer.amp_backend == AMPType.NATIVE

        # native amp + lbfgs is a no go right now
        if using_native_amp and is_lbfgs:
            raise MisconfigurationException(
                'native PyTorch amp and lbfgs are not compatible.'
                ' To request, please file a Github issue in PyTorch and tag @mcarilli'
            )

        # wraps into LightningOptimizer only for running step
        optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx)

        # model hook
        model_ref.optimizer_step(
            self.trainer.current_epoch,
            batch_idx,
            optimizer,
            opt_idx,
            train_step_and_backward_closure,
            on_tpu=self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE,
            using_native_amp=using_native_amp,
            using_lbfgs=is_lbfgs,
        )

    def on_before_zero_grad(self, optimizer):
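        """Run the ``on_before_zero_grad`` hook for the given optimizer."""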
        self.trainer.call_hook('on_before_zero_grad', optimizer)

    def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):
        self.trainer.accelerator_backend.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)

    def track_and_norm_grad(self, optimizer):
        # track gradient norms
        grad_norm_dic = self._track_gradient_norm()

        # clip gradients
        self.trainer.accelerator_backend.clip_gradients(optimizer, self.trainer.gradient_clip_val)
        self._cur_grad_norm_dict = grad_norm_dic

    def _track_gradient_norm(self):
        grad_norm_dict = {}
        if (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0:
            if float(self.trainer.track_grad_norm) > 0:
                model = self.trainer.get_model()
                grad_norm_dict = model.grad_norm(self.trainer.track_grad_norm)
        return grad_norm_dict
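
    # Editor's note -- illustrative, not part of the original module: the gradient-norm tracking
    # above is enabled through the public ``track_grad_norm`` Trainer flag, e.g.
    #
    #     trainer = pl.Trainer(track_grad_norm=2, log_every_n_steps=50)
    #
    # which makes ``model.grad_norm(2)`` report per-parameter L2 gradient norms on logging steps.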

    def process_hiddens(self, opt_closure_result):
        hiddens = opt_closure_result.hiddens
        if isinstance(opt_closure_result.training_step_output, Result):
            opt_closure_result.training_step_output_for_epoch_end.drop_hiddens()
        return hiddens

    def tbptt_split_batch(self, batch):
        splits = [batch]
        if self.trainer.truncated_bptt_steps is not None:
            model_ref = self.trainer.get_model()
            with self.trainer.profiler.profile("tbptt_split_batch"):
                splits = model_ref.tbptt_split_batch(batch, self.trainer.truncated_bptt_steps)
        return splits
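
    # Editor's note -- illustrative, not part of the original module: this path is only taken when
    # truncated backpropagation through time is enabled, e.g.
    #
    #     trainer = pl.Trainer(truncated_bptt_steps=10)
    #
    # in which case the default ``LightningModule.tbptt_split_batch`` slices each batch along the
    # time dimension into chunks of 10 steps, and ``training_step`` additionally receives the
    # ``hiddens`` carried between splits.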

    def run_training_epoch(self):
        # modify dataloader if needed (ddp, etc...)
        train_dataloader = self.trainer.accelerator.process_dataloader(self.trainer.train_dataloader)

        # track epoch output
        epoch_output = [[] for _ in range(self.num_optimizers)]

        train_dataloader = self.trainer.data_connector.get_profiled_train_dataloader(train_dataloader)
        dataloader_idx = 0
        should_check_val = False

        for batch_idx, (batch, is_last_batch) in train_dataloader:

            self.trainer.batch_idx = batch_idx

            # ------------------------------------
            # TRAINING_STEP + TRAINING_STEP_END
            # ------------------------------------
            with self.trainer.profiler.profile("run_training_batch"):
                batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)

            # when returning -1 from train_step, we end epoch early
            if batch_output.signal == -1:
                break

            batch_end_outputs = self.process_train_step_outputs(
                batch_output.training_step_output_for_epoch_end,
                self.early_stopping_accumulator,
                self.checkpoint_accumulator,
            )

            # hook
            # TODO: add outputs to batches
            self.on_train_batch_end(epoch_output, batch_end_outputs, batch, batch_idx, dataloader_idx)

            # -----------------------------------------
            # SAVE METRICS TO LOGGERS
            # -----------------------------------------
            self.trainer.logger_connector.log_train_step_metrics(batch_output)

            # -----------------------------------------
            # VALIDATE IF NEEDED + CHECKPOINT CALLBACK
            # -----------------------------------------
            should_check_val = self.should_check_val_fx(batch_idx, is_last_batch)
            if should_check_val:
                self.trainer.run_evaluation()

                # reset stage to train
                self.trainer._set_running_stage(RunningStage.TRAINING, self.trainer.lightning_module)

            # -----------------------------------------
            # SAVE LOGGERS (ie: Tensorboard, etc...)
            # -----------------------------------------
            self.save_loggers_on_train_batch_end()

            # update LR schedulers
            monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
            self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
            self.trainer.checkpoint_connector.has_trained = True

            # max steps reached, end training
            if (
                self.trainer.max_steps is not None and self.trainer.max_steps == self.trainer.global_step + 1
                and self._accumulated_batches_reached()
            ):
                break

            # end epoch early
            # stop when the flag is changed or we've gone past the amount
            # requested in the batches
            if self.trainer.should_stop:
                break

            self.trainer.total_batch_idx += 1

            # stop epoch if we limited the number of training batches
            if self._num_training_batches_reached(is_last_batch):
                break

            # progress global step according to grads progress
            self.increment_accumulated_grad_global_step()

        # epoch end hook
        self.run_on_epoch_end_hook(epoch_output)

        # log epoch metrics
        self.trainer.logger_connector.log_train_epoch_end_metrics(
            epoch_output, self.checkpoint_accumulator, self.early_stopping_accumulator, self.num_optimizers
        )

        should_check_val = self.should_check_val_fx(batch_idx, is_last_batch, on_epoch=True)
        if should_check_val:
            self.trainer.run_evaluation(on_epoch=True)

            # reset stage to train
            self.trainer._set_running_stage(RunningStage.TRAINING, self.trainer.lightning_module)

        should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches)
        should_train_only = self.trainer.disable_validation or should_skip_eval

        if should_train_only:
            # update epoch level lr_schedulers
            self.trainer.optimizer_connector.update_learning_rates(interval='epoch')
            self.check_checkpoint_callback(True)
            self.check_early_stopping_callback(True)

        # increment the global step once
        # progress global step according to grads progress
        self.increment_accumulated_grad_global_step()
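
    # Editor's note -- illustrative, not part of the original module: ``should_check_val_fx`` above
    # is driven by the validation-frequency Trainer flags, e.g.
    #
    #     trainer = pl.Trainer(val_check_interval=0.25, check_val_every_n_epoch=1)
    #
    # validates four times per training epoch, so ``run_evaluation()`` can fire mid-epoch as well
    # as at the epoch boundary handled further down.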

    def run_training_batch(self, batch, batch_idx, dataloader_idx):
        # track grad norms
        grad_norm_dic = {}

        # bookkeeping
        self.trainer.hiddens = None

        # track all outputs across time and num of optimizers
        batch_outputs = [[] for _ in range(len(self.get_optimizers_iterable()))]

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook("on_batch_start")
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # lightning module hook
        splits = self.tbptt_split_batch(batch)

        for split_idx, split_batch in enumerate(splits):

            # create an iterable for optimizers and loop over them
            for opt_idx, optimizer in self.prepare_optimizers():

                # toggle model params + set info to logger_connector
                self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer)

                if self.should_accumulate():
                    # For gradient accumulation

                    # -------------------
                    # calculate loss (train step + train step end)
                    # -------------------

                    # automatic_optimization=True: perform ddp sync only when performing optimizer_step
                    # automatic_optimization=False: don't block synchronization here
                    with self.block_ddp_sync_behaviour():
                        self.training_step_and_backward(
                            split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                        )

                    batch_outputs = self._process_closure_result(
                        batch_outputs=batch_outputs,
                        opt_idx=opt_idx,
                    )

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients

                else:
                    if self.automatic_optimization:

                        def train_step_and_backward_closure():
                            result = self.training_step_and_backward(
                                split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                            )
                            return None if result is None else result.loss

                        # optimizer step
                        self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

                    else:
                        self._curr_step_result = self.training_step(
                            split_batch, batch_idx, opt_idx, self.trainer.hiddens
                        )

                    if self._curr_step_result is None:
                        # user decided to skip optimization
                        # make sure to zero grad.
                        continue

                    batch_outputs = self._process_closure_result(
                        batch_outputs=batch_outputs,
                        opt_idx=opt_idx,
                    )

                    # todo: properly aggregate grad_norm across opt_idx and split_idx
                    grad_norm_dic = self._cur_grad_norm_dict
                    self._cur_grad_norm_dict = None

                    # update running loss + reset accumulated loss
                    self.update_running_loss()

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            training_step_output_for_epoch_end=batch_outputs,
        )
        return result
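
    # Editor's note -- illustrative, not part of the original module: ``should_accumulate()`` above
    # reflects the ``accumulate_grad_batches`` Trainer flag, e.g. with
    #
    #     trainer = pl.Trainer(accumulate_grad_batches=4)
    #
    # three out of every four batches take the accumulation branch (loss + backward only, inside
    # ``block_ddp_sync_behaviour``) and only the fourth reaches ``optimizer_step``.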

    @contextmanager
    def block_ddp_sync_behaviour(self, should_block_sync: bool = False):
        """
        automatic_optimization = True
        Blocks ddp gradient sync behaviour on the backward pass.
        This is useful for skipping the sync while accumulating gradients, reducing communication overhead.

        automatic_optimization = False
        Do not block ddp gradient sync when using manual optimization,
        as gradients are needed within the training step.

        Returns:
            context manager with sync behaviour off
        """
        if (
            isinstance(self.trainer.training_type_plugin, ParallelPlugin)
            and (self.automatic_optimization or should_block_sync)
        ):
            with self.trainer.training_type_plugin.block_backward_sync():
                yield None
        else:
            yield None
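
    # Editor's note -- sketch with hypothetical variable names, not part of the original module:
    # under DDP, ``block_backward_sync`` maps onto ``DistributedDataParallel.no_sync()``, which is
    # roughly equivalent to
    #
    #     with ddp_model.no_sync():
    #         ddp_model(batch_a).sum().backward()   # gradients accumulate locally, no all-reduce
    #     ddp_model(batch_b).sum().backward()       # this backward triggers the synchronization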

    def _process_closure_result(self, batch_outputs: list, opt_idx: int) -> list:
        opt_closure_result = self._curr_step_result

        if opt_closure_result is not None:

            # cache metrics
            self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result)

            # track hiddens
            self.trainer.hiddens = self.process_hiddens(opt_closure_result)

            # check if loss or model weights are nan
            if self.trainer.terminate_on_nan:
                self.trainer.detect_nan_tensors(opt_closure_result.loss)

            # track all the outputs across all steps
            batch_opt_idx = opt_idx if len(batch_outputs) > 1 else 0
            batch_outputs[batch_opt_idx].append(opt_closure_result.training_step_output_for_epoch_end)

            if self.automatic_optimization:
                # track total loss for logging (avoid mem leaks)
                self.accumulated_loss.append(opt_closure_result.loss)

        self._curr_step_result = None

        return batch_outputs

    def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens):
        """
        wrap the forward step in a closure so second order methods work
        """
        with self.trainer.profiler.profile("training_step_and_backward"):
            # lightning module hook
            result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
            self._curr_step_result = result

            if not self._skip_backward and self.trainer.train_loop.automatic_optimization:
                # backward pass
                if result is not None:
                    with self.trainer.profiler.profile("model_backward"):
                        self.backward(result, optimizer, opt_idx)

                    # hook - call this hook only
                    # when gradients have finished to accumulate
                    if not self.should_accumulate():
                        self.on_after_backward(result.training_step_output, batch_idx, result.loss)

                    # check if loss or model weights are nan
                    if self.trainer.terminate_on_nan:
                        self.trainer.detect_nan_tensors(result.loss)

                else:
                    self.warning_cache.warn("training_step returned None. If this was on purpose, ignore this warning...")

            if len(self.trainer.optimizers) > 1:
                # revert back to previous state
                self.trainer.get_model().untoggle_optimizer(opt_idx)

        return result
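
    # Editor's note -- illustrative, not part of the original module: ``self.backward`` above is
    # ultimately routed to the user-overridable ``LightningModule.backward`` hook, whose default
    # simply calls ``loss.backward()``. A model could override it along these lines (signature
    # assumed from the documentation of this era; extra args are forwarded to Tensor.backward):
    #
    #     def backward(self, loss, optimizer, optimizer_idx, *args, **kwargs):
    #         loss.backward(*args, **kwargs)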

    def backward(self, result, optimizer, opt_idx, *args, **kwargs):
        self.trainer.dev_debugger.track_event("backward_call")
        should_accumulate = self.should_accumulate()

        # backward can be called manually in the training loop
        if isinstance(result, torch.Tensor):
            self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs)
        else:
            result.closure_loss = self.trainer.accelerator_backend.backward(
                result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs
            )

        if not self.should_accumulate():
            # track gradients
            self.track_and_norm_grad(optimizer=optimizer)

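    # Illustrative sketch (not taken from this file): the `isinstance(result, torch.Tensor)`
    # branch above is there because, in manual optimization, the user calls backward on a raw
    # loss tensor from inside the LightningModule rather than returning a Result object.
    # Roughly (the loss computation is a placeholder, and the exact `manual_backward`
    # signature varies between versions):
    #
    #     import torch
    #     from pytorch_lightning import LightningModule
    #
    #     class ManualOptimModel(LightningModule):
    #         def __init__(self):
    #             super().__init__()
    #             self.layer = torch.nn.Linear(32, 2)
    #             self.automatic_optimization = False
    #
    #         def training_step(self, batch, batch_idx):
    #             opt = self.optimizers()
    #             loss = self.layer(batch).sum()   # placeholder loss
    #             self.manual_backward(loss, opt)  # the raw tensor path handled above
    #             opt.step()
    #             opt.zero_grad()
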
    def update_train_loop_lr_schedulers(self, monitor_metrics=None):
        num_accumulated_batches_reached = self._accumulated_batches_reached()
        num_training_batches_reached = self._num_training_batches_reached()

        if num_accumulated_batches_reached or num_training_batches_reached:
            # update lr
            self.trainer.optimizer_connector.update_learning_rates(interval="step", monitor_metrics=monitor_metrics)

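    # Illustrative sketch (not taken from this file): `interval="step"` above only affects
    # schedulers that were configured with a step-wise interval in `configure_optimizers`,
    # e.g. (the optimizer/scheduler choices here are arbitrary):
    #
    #     def configure_optimizers(self):
    #         optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    #         scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)
    #         return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
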
    def run_on_epoch_end_hook(self, epoch_output):
        # inform logger the batch loop has finished
        self.trainer.logger_connector.on_train_epoch_end()

        self.trainer.call_hook('on_train_epoch_end', epoch_output)
        self.trainer.call_hook('on_epoch_end')

    def increment_accumulated_grad_global_step(self):
        num_accumulated_batches_reached = self._accumulated_batches_reached()
        num_training_batches_reached = self._num_training_batches_reached()

        # progress the global step according to gradient accumulation progress
        if num_accumulated_batches_reached or num_training_batches_reached:
            self.trainer.global_step += 1

    def _accumulated_batches_reached(self):
        return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0

    def _num_training_batches_reached(self, is_last_batch=False):
        return (self.trainer.batch_idx + 1) == self.trainer.num_training_batches or is_last_batch

    def should_accumulate(self):
        # checks if backward or backward + optimizer step (via closure)
        accumulation_done = self._accumulated_batches_reached()
        is_final_batch = self._num_training_batches_reached()
        return not (accumulation_done or is_final_batch)

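    # Worked example of the accumulation logic above: with accumulate_grad_batches=4 and
    # 10 training batches, the optimizer steps on batch_idx 3, 7 and 9 (the final batch);
    # every other batch only accumulates gradients.
    #
    #     accumulate_grad_batches = 4
    #     num_training_batches = 10
    #     for batch_idx in range(num_training_batches):
    #         accumulation_done = (batch_idx + 1) % accumulate_grad_batches == 0
    #         is_final_batch = (batch_idx + 1) == num_training_batches
    #         should_accumulate = not (accumulation_done or is_final_batch)
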
    def should_check_val_fx(self, batch_idx, is_last_batch, on_epoch=False):
        # decide if we should run validation
        is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0
        is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch == 0
        can_check_val = self.trainer.enable_validation and is_val_check_epoch
        is_last_batch_for_infinite_dataset = is_last_batch and self.trainer.val_check_batch == float("inf")
        epoch_end_val_check = self.trainer.val_check_batch == self.trainer.num_training_batches

        should_check_val = ((is_val_check_batch and epoch_end_val_check) or self.trainer.should_stop
                            or is_last_batch_for_infinite_dataset
                            ) if on_epoch else (is_val_check_batch and not epoch_end_val_check)

        return should_check_val and can_check_val

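    # Illustrative sketch (not taken from this file): `val_check_batch` above is derived from
    # the Trainer's `val_check_interval` argument, and `check_val_every_n_epoch` is likewise a
    # Trainer argument, e.g.:
    #
    #     # run validation every 100 training batches, but only on every 2nd epoch
    #     trainer = Trainer(val_check_interval=100, check_val_every_n_epoch=2)
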
    def build_train_args(self, batch, batch_idx, opt_idx, hiddens):
        # enable not needing to add opt_idx to training_step
        args = [batch, batch_idx]

        if len(self.trainer.optimizers) > 1:
            if self.trainer.has_arg("training_step", "optimizer_idx"):
                args.append(opt_idx)
            else:
                num_opts = len(self.trainer.optimizers)
                raise ValueError(
                    f"Your LightningModule defines {num_opts} optimizers but "
                    'training_step is missing the "optimizer_idx" argument.'
                )

        # pass hiddens if using tbptt
        if self.trainer.truncated_bptt_steps is not None:
            args.append(hiddens)

        return args

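    # Illustrative sketch (not taken from this file): the argument list built above matches
    # the `training_step` signatures a LightningModule can declare, e.g.:
    #
    #     def training_step(self, batch, batch_idx):                  # single optimizer
    #         ...
    #
    #     def training_step(self, batch, batch_idx, optimizer_idx):   # multiple optimizers
    #         ...
    #
    #     def training_step(self, batch, batch_idx, hiddens):         # truncated BPTT
    #         ...
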
    def save_loggers_on_train_batch_end(self):
        # when loggers should save to disk
        should_flush_logs = self.trainer.logger_connector.should_flush_logs
        if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None:
            self.trainer.logger.save()

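    # Illustrative sketch (not taken from this file): how often `should_flush_logs` fires is
    # controlled on the Trainer, e.g.:
    #
    #     trainer = Trainer(flush_logs_every_n_steps=100)  # write logger data to disk every 100 steps
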
    def process_train_step_outputs(self, all_train_step_outputs, early_stopping_accumulator, checkpoint_accumulator):
        """
        Figure out what needs to be tracked/logged at the end of the epoch.
        """
        # the training step outputs a list per optimizer. The list contains the outputs at each time step
        # when no TBPTT is used, then the list has 1 item per batch
        # when TBPTT IS used, then the list has n items (1 per time step)
        batch_end_outputs = []
        for optimizer_idx_outputs in all_train_step_outputs:
            # extract one representative sample from each time step (1 if no tbptt) and 0th optimizer
            if len(optimizer_idx_outputs) == 0:
                continue

            sample_output = optimizer_idx_outputs[-1]

            # pull out callback info if available (ie: Results object)
            if isinstance(sample_output, dict) and "early_stop_on" in sample_output:
                early_stopping_accumulator.accumulate(sample_output["early_stop_on"])

            if isinstance(sample_output, dict) and "checkpoint_on" in sample_output:
                checkpoint_accumulator.accumulate(sample_output["checkpoint_on"])

            batch_end_outputs.append(optimizer_idx_outputs)

        return batch_end_outputs

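    # Illustrative sketch (not taken from this file): shape of `all_train_step_outputs` for a
    # batch trained with 2 optimizers and 3 truncated-BPTT splits (the `out_*` names stand in
    # for whatever `training_step` returned):
    #
    #     all_train_step_outputs = [
    #         [out_opt0_t0, out_opt0_t1, out_opt0_t2],  # optimizer 0, one entry per time step
    #         [out_opt1_t0, out_opt1_t1, out_opt1_t2],  # optimizer 1
    #     ]
    #
    # Without truncated BPTT each inner list holds a single entry for the batch.
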
    def prepare_optimizers(self):
        # in manual optimization we loop over all optimizers at once
        optimizers = self.get_optimizers_iterable()
        if not self.automatic_optimization:
            optimizers = [optimizers[0]]
        return optimizers

    def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer):
        # set split_idx on the trainer for tracking
        self.trainer.split_idx = split_idx

        # make sure only the gradients of the current optimizer's parameters are calculated
        # in the training step to prevent dangling gradients in a multiple-optimizer setup.
        if self.automatic_optimization and len(self.trainer.optimizers) > 1:
            model = self.trainer.get_model()
            model.toggle_optimizer(optimizer, opt_idx)

        # used to track metrics internally
        self.trainer.logger_connector.on_train_split_start(split_idx, opt_idx, split_batch)

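    # Illustrative sketch (not taken from this file): toggling matters in multi-optimizer
    # setups such as GANs, where each optimizer should only update its own parameters
    # (the generator/discriminator attributes below are assumed for illustration):
    #
    #     class GAN(LightningModule):
    #         def configure_optimizers(self):
    #             opt_g = torch.optim.Adam(self.generator.parameters(), lr=2e-4)
    #             opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=2e-4)
    #             return [opt_g, opt_d]
    #
    #     # while opt_g (optimizer_idx == 0) is active, toggle_optimizer() disables
    #     # requires_grad on the discriminator parameters and untoggle_optimizer() restores it
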
    def update_running_loss(self):
        accumulated_loss = self.accumulated_loss.mean()

        if accumulated_loss is not None:
            # calculate running loss for display
            self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)

        # reset for next set of accumulated grads
        self.accumulated_loss.reset()
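
    # Worked example of the scaling above (assuming each tracked loss was already divided by
    # the accumulation factor earlier in the loop): multiplying the mean by
    # `accumulate_grad_batches` restores the per-batch scale for display. With a factor of 4:
    #
    #     accumulated = [0.225, 0.275, 0.250, 0.300]             # already divided by 4
    #     displayed = sum(accumulated) / len(accumulated) * 4    # == 1.05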