[fix] Add barriers before and after setup hook is run (#7202)
* Update data_connector.py
* move-barrier
* Update trainer.py
* Update ddp.py
* changelog
* Spacing

Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com>
parent f920ba29f2
commit bab7225507
CHANGELOG.md
@@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/PyTorchLightning/pytorch-lightning/pull/7202))
+
 - Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942))
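For context, a hedged sketch of the kind of user-defined `setup` hook these synchronization points guard; the DataModule and its attributes are invented for illustration:

```python
import pytorch_lightning as pl


class MyDataModule(pl.LightningDataModule):
    def setup(self, stage=None):
        # Per-rank work such as building datasets and splits happens here.
        # With #7202, every rank enters and leaves this hook between the
        # "pre_setup" and "post_setup" barriers, so a slow rank can no
        # longer be left behind while the others move on to training.
        self.train_split = ...
```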
ddp.py
@@ -282,7 +282,7 @@ class DDPPlugin(ParallelPlugin):
         self.cluster_environment.teardown()
 
     def barrier(self, *args, **kwargs):
-        if torch_distrib.is_initialized():
+        if torch_distrib.is_available() and torch_distrib.is_initialized():
            torch_distrib.barrier()
 
     def broadcast(self, obj: object, src: int = 0) -> object:
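The added `is_available()` guard matters because PyTorch builds compiled without distributed support expose `torch.distributed.is_available()` (returning `False`) but not the rest of the distributed API, so calling `is_initialized()` alone could fail outright. A minimal standalone sketch of the guarded pattern; the helper name is ours, not part of the plugin:

```python
import torch.distributed as torch_distrib


def safe_barrier() -> None:
    # Check is_available() first: on builds without distributed support,
    # the rest of the torch.distributed API is not usable at all.
    # is_initialized() then makes single-process runs a silent no-op,
    # since no process group exists to synchronize.
    if torch_distrib.is_available() and torch_distrib.is_initialized():
        torch_distrib.barrier()
```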
trainer.py
@@ -1114,6 +1114,8 @@ class Trainer(
         assert self.state.running, f"TrainerState: {self.state}"
         state = self._setup_state
 
+        self.accelerator.barrier("pre_setup")
+
         if self.datamodule is not None:
             called = getattr(self.datamodule, f'has_setup_{state}')
             if not called:
@@ -1122,6 +1124,8 @@ class Trainer(
         self.setup(model, stage=state)
         model.setup(stage=state)
 
+        self.accelerator.barrier("post_setup")
+
     def call_configure_sharded_model(self, model: LightningModule) -> None:
         # Call configure sharded model hook if accelerator requests. In some cases
         # we will not call the hook; the hook has initialized the sharded model for example.
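To see what the two barriers buy, here is a self-contained sketch using raw `torch.distributed` with the gloo backend on CPU; the `worker` and `setup` functions are stand-ins for illustration, not Trainer internals. No rank can start `setup` until every rank has arrived at "pre_setup", and no rank can move on to training until `setup` has completed everywhere:

```python
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def setup(rank: int) -> None:
    # Stand-in for the LightningModule/DataModule `setup` hook: imagine
    # per-rank dataset construction or file preparation happening here.
    print(f"rank {rank}: running setup")


def worker(rank: int, world_size: int) -> None:
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    dist.barrier()  # "pre_setup": all ranks reach this line before any runs setup
    setup(rank)
    dist.barrier()  # "post_setup": no rank proceeds until setup finished on all ranks

    print(f"rank {rank}: proceeding to training")
    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(worker, args=(world_size,), nprocs=world_size)
```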