Amp2 (#2505)
* fix tpu hang
parent 39a6435726
commit 9924c76faa
@@ -31,6 +31,7 @@ class LightningTemplateModel(LightningModule):
    ...     optimizer_name='adam',
    ...     data_root='./datasets',
    ...     out_features=10,
+   ...     num_workers=4,
    ...     hidden_dim=1000,
    ...     )
    >>> model = LightningTemplateModel(**params)
@@ -44,11 +45,14 @@ class LightningTemplateModel(LightningModule):
                 optimizer_name: str = 'adam',
                 data_root: str = './datasets',
                 out_features: int = 10,
+                num_workers: int = 4,
                 hidden_dim: int = 1000,
                 **kwargs
                 ):
        # init superclass
        super().__init__()

+       self.num_workers = num_workers
        self.drop_prob = drop_prob
        self.batch_size = batch_size
        self.in_features = in_features
@@ -150,13 +154,13 @@ class LightningTemplateModel(LightningModule):
        self.mnist_test = MNIST(self.data_root, train=False, download=False, transform=transform)

    def train_dataloader(self):
-       return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)
+       return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)

    def val_dataloader(self):
-       return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
+       return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
-       return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
+       return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)

    @staticmethod
    def add_model_specific_args(parent_parser, root_dir):  # pragma: no-cover
@@ -175,6 +179,7 @@ class LightningTemplateModel(LightningModule):
        parser.add_argument('--hidden_dim', default=50000, type=int)
        parser.add_argument('--drop_prob', default=0.2, type=float)
        parser.add_argument('--learning_rate', default=0.001, type=float)
+       parser.add_argument('--num_workers', default=4, type=int)

        # data
        parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
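Note: taken together, the template-model hunks above thread a configurable num_workers value from the constructor and the argument parser into every DataLoader, replacing the hard-coded worker count of 4. A minimal, self-contained sketch of that pattern follows; the ToyModel class and the random dataset are illustrative only and are not part of this commit.

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    class ToyModel:
        """Illustrative only: forwards a user-supplied num_workers to each loader."""

        def __init__(self, batch_size: int = 32, num_workers: int = 4):
            self.batch_size = batch_size
            self.num_workers = num_workers  # configurable instead of hard-coded
            self.dataset = TensorDataset(torch.randn(64, 3), torch.randn(64, 1))

        def train_dataloader(self) -> DataLoader:
            return DataLoader(self.dataset, batch_size=self.batch_size,
                              num_workers=self.num_workers)

    loader = ToyModel(num_workers=2).train_dataloader()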
@@ -191,7 +191,6 @@ class ModelHooks(Module):
    def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):
        if NATIVE_AMP_AVALAIBLE:
            scaled_loss = self.trainer.scaler.scale(unscaled_loss)

        else:
            scaled_loss = amp.scale_loss(unscaled_loss, optimizer)
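Note: this hook picks between the two loss-scaling backends. With native PyTorch AMP, torch.cuda.amp.GradScaler.scale() returns the scaled loss tensor directly; with NVIDIA apex, amp.scale_loss() is a context manager. A standalone sketch of the branch, assuming the caller owns the scaler and optimizer (the helper name is made up; this is not the Lightning API):

    import torch

    def scale_loss_sketch(unscaled_loss: torch.Tensor,
                          optimizer: torch.optim.Optimizer,
                          scaler: "torch.cuda.amp.GradScaler" = None):
        if scaler is not None:
            # Native AMP: returns unscaled_loss multiplied by the current scale factor.
            return scaler.scale(unscaled_loss)
        # Apex path: scale_loss() yields the scaled loss inside a context manager.
        from apex import amp  # only importable when NVIDIA apex is installed
        return amp.scale_loss(unscaled_loss, optimizer)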
@@ -436,7 +436,6 @@ class TrainerDDPMixin(ABC):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

-       # import pdb; pdb.set_trace()
        # start process
        proc = subprocess.Popen(command, env=env_copy)
        self.interactive_ddp_procs.append(proc)
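Note: this path launches one child process per local rank and hands each child its rank through the LOCAL_RANK environment variable. A hedged sketch of that launch pattern (the function name, script path, and process count are placeholders, not the trainer's actual code):

    import os
    import subprocess
    import sys

    def spawn_local_ranks(num_processes: int, script: str = "train.py"):
        """Illustrative only: start one worker per local rank, as the trainer does above."""
        procs = []
        for local_rank in range(num_processes):
            env = os.environ.copy()
            env['LOCAL_RANK'] = str(local_rank)  # each child reads its rank from the environment
            procs.append(subprocess.Popen([sys.executable, script], env=env))
        return procs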
@@ -636,6 +636,7 @@ class TrainerTrainLoopMixin(ABC):
        # ------------------------------
        batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics)
        batch_log_metrics.append(opt_closure_result.training_step_output.log_metrics)

        self.add_progress_bar_metrics(opt_closure_result.training_step_output.pbar_on_batch_end)

        # track hiddens
@@ -741,6 +742,7 @@ class TrainerTrainLoopMixin(ABC):
            model.optimizer_step(self.current_epoch, batch_idx, optimizer, opt_idx, lambda_closure,
                                 using_lbfgs=True)

        # when using 16-bit
        else:
            native_amp = self.use_amp and NATIVE_AMP_AVALAIBLE
@@ -796,6 +798,9 @@ class TrainerTrainLoopMixin(ABC):
        # (if accumulate_grad_batches = 1 no effect)
        closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches

+       # the loss will get scaled for amp. avoid any modifications to it
+       untouched_loss = closure_loss.detach().clone()

        # backward pass
        model_ref = self.get_model()
        with self.profiler.profile('model_backward'):
@@ -834,7 +839,7 @@ class TrainerTrainLoopMixin(ABC):
            model_ref.on_after_backward()

        result = AttributeDict(
-           loss=closure_loss,
+           loss=untouched_loss,
            training_step_output=training_step_output,
            training_step_output_for_epoch_end=training_step_output_for_epoch_end,
            hiddens=training_step_output.hiddens,
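Note: the two preceding hunks make the training loop take a detached copy of the unscaled closure loss and return that copy in the closure's AttributeDict, so downstream consumers see the real loss value even when AMP has scaled the tensor used for backward(). A hedged sketch of the idea, with made-up names and a plain dict standing in for AttributeDict:

    import torch

    def optimizer_closure_sketch(batch_loss: torch.Tensor,
                                 accumulate_grad_batches: int = 1,
                                 scaler: "torch.cuda.amp.GradScaler" = None):
        # Spread the loss over the accumulation window (no effect when the window is 1).
        closure_loss = batch_loss / accumulate_grad_batches

        # The loss will get scaled for AMP below; keep an untouched copy to report.
        untouched_loss = closure_loss.detach().clone()

        if scaler is not None:
            closure_loss = scaler.scale(closure_loss)  # native AMP path
        closure_loss.backward()

        return {'loss': untouched_loss}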
@@ -134,7 +134,10 @@ class DeterministicModel(LightningModule):
        return torch.optim.Adam(self.parameters(), lr=0)

    def backward(self, trainer, loss, optimizer, optimizer_idx):
-       assert loss == 171.0
+       if self.trainer.precision == 16:
+           assert loss > 171 * 1000
+       else:
+           assert loss == 171.0
        loss.backward()
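Note: the fp16 branch in this test asserts that the loss seen by backward() is far larger than its raw value of 171.0, which is consistent with native AMP's default loss scale. A hedged check of that magnitude, assuming a CUDA device and PyTorch's default GradScaler settings:

    import torch

    if torch.cuda.is_available():
        scaler = torch.cuda.amp.GradScaler()   # default init_scale is 2.0 ** 16 = 65536
        raw_loss = torch.tensor(171.0, device='cuda')
        scaled = scaler.scale(raw_loss)        # roughly 171 * 65536
        assert scaled.item() > 171 * 1000      # same bound the test above uses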
@@ -1,7 +1,11 @@
from pytorch_lightning import Trainer
from tests.base.deterministic_model import DeterministicModel
+import pytest
+import torch


+@pytest.mark.spawn
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_training_step_dict(tmpdir):
    """
    Tests that only training_step can be used
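Note: the new pytest markers gate the test on hardware: it is skipped unless at least two CUDA devices are visible, while pytest.mark.spawn looks like a project-specific marker for running it in a spawned process. A small sketch of the skip pattern as a reusable marker (the marker variable name is made up):

    import pytest
    import torch

    requires_multi_gpu = pytest.mark.skipif(
        torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")

    @requires_multi_gpu
    def test_needs_two_gpus():
        assert torch.cuda.device_count() >= 2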
@@ -13,6 +17,8 @@ def test_training_step_dict(tmpdir):
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
+       precision=16,
+       gpus=1,
        weights_summary=None,
    )
    trainer.fit(model)
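Note: the test now runs with 16-bit precision on a single GPU, which is what exercises the AMP loss-scaling paths changed above. A hedged restatement of the same Trainer configuration with inline comments (argument values are taken from the hunk; the wrapper function is illustrative):

    from pytorch_lightning import Trainer

    def build_fast_fp16_trainer(tmpdir):
        return Trainer(
            default_root_dir=tmpdir,   # write logs/checkpoints under the pytest tmpdir
            fast_dev_run=True,         # run a single batch of train/val as a smoke test
            precision=16,              # mixed precision; exercises the AMP loss scaling above
            gpus=1,                    # AMP here needs a CUDA device
            weights_summary=None,      # silence the layer summary in test output
        )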
@@ -33,13 +39,16 @@ def test_training_step_dict(tmpdir):
    train_step_out = out.training_step_output_for_epoch_end
    pbar_metrics = train_step_out['progress_bar']
    assert 'loss' in train_step_out
    assert 'log' in train_step_out
    assert 'progress_bar' in train_step_out
    assert train_step_out['train_step_test'] == 549
    assert pbar_metrics['pbar_acc1'] == 17.0
    assert pbar_metrics['pbar_acc2'] == 19.0

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)


def training_step_with_step_end(tmpdir):
    """