From 9924c76faa7789294811a27c392ba6b33e07f3f1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 4 Jul 2020 22:52:49 -0400 Subject: [PATCH] Amp2 (#2505) * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang * fix tpu hang --- pl_examples/models/lightning_template.py | 11 ++++++++--- pytorch_lightning/core/hooks.py | 1 - pytorch_lightning/trainer/distrib_data_parallel.py | 1 - pytorch_lightning/trainer/training_loop.py | 7 ++++++- tests/base/deterministic_model.py | 5 ++++- tests/trainer/test_trainer_steps.py | 11 ++++++++++- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/pl_examples/models/lightning_template.py b/pl_examples/models/lightning_template.py index c79405c04b..0cbf99bca1 100644 --- a/pl_examples/models/lightning_template.py +++ b/pl_examples/models/lightning_template.py @@ -31,6 +31,7 @@ class LightningTemplateModel(LightningModule): ... optimizer_name='adam', ... data_root='./datasets', ... out_features=10, + ... num_workers=4, ... hidden_dim=1000, ... ) >>> model = LightningTemplateModel(**params) @@ -44,11 +45,14 @@ class LightningTemplateModel(LightningModule): optimizer_name: str = 'adam', data_root: str = './datasets', out_features: int = 10, + num_workers: int = 4, hidden_dim: int = 1000, **kwargs ): # init superclass super().__init__() + + self.num_workers = num_workers self.drop_prob = drop_prob self.batch_size = batch_size self.in_features = in_features @@ -150,13 +154,13 @@ class LightningTemplateModel(LightningModule): self.mnist_test = MNIST(self.data_root, train=False, download=False, transform=transform) def train_dataloader(self): - return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4) + return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers) def val_dataloader(self): - return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4) + return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers) def test_dataloader(self): - return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4) + return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers) @staticmethod def add_model_specific_args(parent_parser, root_dir): # pragma: no-cover @@ -175,6 +179,7 @@ class LightningTemplateModel(LightningModule): parser.add_argument('--hidden_dim', default=50000, type=int) parser.add_argument('--drop_prob', default=0.2, type=float) parser.add_argument('--learning_rate', default=0.001, type=float) + parser.add_argument('--num_workers', default=4, type=int) # data parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 9c329def39..223cf6925d 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -191,7 +191,6 @@ class ModelHooks(Module): def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx): if NATIVE_AMP_AVALAIBLE: scaled_loss = self.trainer.scaler.scale(unscaled_loss) - else: scaled_loss = amp.scale_loss(unscaled_loss, optimizer) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 46dcdbbb0a..3b4732ead5 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -436,7 +436,6 @@ class TrainerDDPMixin(ABC): env_copy = os.environ.copy() env_copy['LOCAL_RANK'] = f'{local_rank}' - # import pdb; pdb.set_trace() # start process proc = subprocess.Popen(command, env=env_copy) self.interactive_ddp_procs.append(proc) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index aa41087d29..5340c57581 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -636,6 +636,7 @@ class TrainerTrainLoopMixin(ABC): # ------------------------------ batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics) batch_log_metrics.append(opt_closure_result.training_step_output.log_metrics) + self.add_progress_bar_metrics(opt_closure_result.training_step_output.pbar_on_batch_end) # track hiddens @@ -741,6 +742,7 @@ class TrainerTrainLoopMixin(ABC): model.optimizer_step(self.current_epoch, batch_idx, optimizer, opt_idx, lambda_closure, using_lbfgs=True) + # when using 16-bit else: native_amp = self.use_amp and NATIVE_AMP_AVALAIBLE @@ -796,6 +798,9 @@ class TrainerTrainLoopMixin(ABC): # (if accumulate_grad_batches = 1 no effect) closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches + # the loss will get scaled for amp. avoid any modifications to it + untouched_loss = closure_loss.detach().clone() + # backward pass model_ref = self.get_model() with self.profiler.profile('model_backward'): @@ -834,7 +839,7 @@ class TrainerTrainLoopMixin(ABC): model_ref.on_after_backward() result = AttributeDict( - loss=closure_loss, + loss=untouched_loss, training_step_output=training_step_output, training_step_output_for_epoch_end=training_step_output_for_epoch_end, hiddens=training_step_output.hiddens, diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index d4c1bff648..caeb9e882a 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -134,7 +134,10 @@ class DeterministicModel(LightningModule): return torch.optim.Adam(self.parameters(), lr=0) def backward(self, trainer, loss, optimizer, optimizer_idx): - assert loss == 171.0 + if self.trainer.precision == 16: + assert loss > 171 * 1000 + else: + assert loss == 171.0 loss.backward() diff --git a/tests/trainer/test_trainer_steps.py b/tests/trainer/test_trainer_steps.py index 44bdd2271c..05627f8e1e 100644 --- a/tests/trainer/test_trainer_steps.py +++ b/tests/trainer/test_trainer_steps.py @@ -1,7 +1,11 @@ from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel +import pytest +import torch +@pytest.mark.spawn +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_training_step_dict(tmpdir): """ Tests that only training_step can be used @@ -13,6 +17,8 @@ def test_training_step_dict(tmpdir): trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, + precision=16, + gpus=1, weights_summary=None, ) trainer.fit(model) @@ -33,13 +39,16 @@ def test_training_step_dict(tmpdir): train_step_out = out.training_step_output_for_epoch_end pbar_metrics = train_step_out['progress_bar'] - assert 'loss' in train_step_out assert 'log' in train_step_out assert 'progress_bar' in train_step_out assert train_step_out['train_step_test'] == 549 assert pbar_metrics['pbar_acc1'] == 17.0 assert pbar_metrics['pbar_acc2'] == 19.0 + # make sure the optimizer closure returns the correct things + opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) + assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + def training_step_with_step_end(tmpdir): """