* fix tpu hang
William Falcon 2020-07-04 22:52:49 -04:00 committed by GitHub
parent 39a6435726
commit 9924c76faa
6 changed files with 28 additions and 8 deletions


@@ -31,6 +31,7 @@ class LightningTemplateModel(LightningModule):
... optimizer_name='adam',
... data_root='./datasets',
... out_features=10,
... num_workers=4,
... hidden_dim=1000,
... )
>>> model = LightningTemplateModel(**params)
@@ -44,11 +45,14 @@ class LightningTemplateModel(LightningModule):
optimizer_name: str = 'adam',
data_root: str = './datasets',
out_features: int = 10,
num_workers: int = 4,
hidden_dim: int = 1000,
**kwargs
):
# init superclass
super().__init__()
self.num_workers = num_workers
self.drop_prob = drop_prob
self.batch_size = batch_size
self.in_features = in_features
@@ -150,13 +154,13 @@ class LightningTemplateModel(LightningModule):
self.mnist_test = MNIST(self.data_root, train=False, download=False, transform=transform)
def train_dataloader(self):
return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)
return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)
def val_dataloader(self):
return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)
def test_dataloader(self):
return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)
@staticmethod
def add_model_specific_args(parent_parser, root_dir): # pragma: no-cover
@@ -175,6 +179,7 @@ class LightningTemplateModel(LightningModule):
parser.add_argument('--hidden_dim', default=50000, type=int)
parser.add_argument('--drop_prob', default=0.2, type=float)
parser.add_argument('--learning_rate', default=0.001, type=float)
parser.add_argument('--num_workers', default=4, type=int)
# data
parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
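The change in this file replaces the hardcoded num_workers=4 with a constructor argument that every dataloader reuses, plus a matching CLI flag. A minimal sketch of the same pattern, using an illustrative stand-in module and an in-memory dataset rather than the MNIST code from the diff:

import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningModule

class TinyTemplate(LightningModule):
    """Illustrative stand-in for LightningTemplateModel; not part of this commit."""

    def __init__(self, batch_size: int = 32, num_workers: int = 4):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers  # stored once at construction time

    def train_dataloader(self):
        ds = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
        # every loader reads the attribute instead of hardcoding 4
        return DataLoader(ds, batch_size=self.batch_size, num_workers=self.num_workers)

model = TinyTemplate(num_workers=8)  # e.g. forwarded from the --num_workers flag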


@@ -191,7 +191,6 @@ class ModelHooks(Module):
def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):
if NATIVE_AMP_AVALAIBLE:
scaled_loss = self.trainer.scaler.scale(unscaled_loss)
else:
scaled_loss = amp.scale_loss(unscaled_loss, optimizer)
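For context, the native branch above defers to the trainer's torch.cuda.amp.GradScaler. A standalone PyTorch sketch of that scaling flow (requires a CUDA device; illustrative only, not Lightning internals):

import torch

model = torch.nn.Linear(8, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(4, 8, device='cuda')
with torch.cuda.amp.autocast():
    loss = model(x).pow(2).mean()

scaled_loss = scaler.scale(loss)  # loss is inflated by the current scale factor
scaled_loss.backward()
scaler.step(optimizer)            # gradients are unscaled (and inf/nan-checked) before stepping
scaler.update()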


@@ -436,7 +436,6 @@ class TrainerDDPMixin(ABC):
env_copy = os.environ.copy()
env_copy['LOCAL_RANK'] = f'{local_rank}'
# import pdb; pdb.set_trace()
# start process
proc = subprocess.Popen(command, env=env_copy)
self.interactive_ddp_procs.append(proc)
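The surrounding code (minus the leftover pdb comment this hunk removes) launches one child process per additional local rank, each with its own LOCAL_RANK in a copied environment. A simplified sketch of that pattern; the script name below is a placeholder, not the command Lightning actually builds:

import os
import subprocess
import sys

command = [sys.executable, 'train.py']  # placeholder command for illustration
procs = []
for local_rank in range(1, 2):          # rank 0 keeps running in the current process
    env_copy = os.environ.copy()
    env_copy['LOCAL_RANK'] = f'{local_rank}'
    procs.append(subprocess.Popen(command, env=env_copy))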


@@ -636,6 +636,7 @@ class TrainerTrainLoopMixin(ABC):
# ------------------------------
batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics)
batch_log_metrics.append(opt_closure_result.training_step_output.log_metrics)
self.add_progress_bar_metrics(opt_closure_result.training_step_output.pbar_on_batch_end)
# track hiddens
@@ -741,6 +742,7 @@ class TrainerTrainLoopMixin(ABC):
model.optimizer_step(self.current_epoch, batch_idx, optimizer, opt_idx, lambda_closure,
using_lbfgs=True)
# when using 16-bit
else:
native_amp = self.use_amp and NATIVE_AMP_AVALAIBLE
@@ -796,6 +798,9 @@ class TrainerTrainLoopMixin(ABC):
# (if accumulate_grad_batches = 1 no effect)
closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches
# the loss will get scaled for amp. avoid any modifications to it
untouched_loss = closure_loss.detach().clone()
# backward pass
model_ref = self.get_model()
with self.profiler.profile('model_backward'):
@@ -834,7 +839,7 @@ class TrainerTrainLoopMixin(ABC):
model_ref.on_after_backward()
result = AttributeDict(
loss=closure_loss,
loss=untouched_loss,
training_step_output=training_step_output,
training_step_output_for_epoch_end=training_step_output_for_epoch_end,
hiddens=training_step_output.hiddens,
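The key fix in this file is taking a detached copy of the loss before AMP scaling touches it, then returning that copy from the closure, so logged and returned values keep their true magnitude. A minimal sketch of the idea, with a plain GradScaler standing in for the trainer's scaler:

import torch

def backward_and_report(loss: torch.Tensor, scaler: torch.cuda.amp.GradScaler) -> torch.Tensor:
    untouched_loss = loss.detach().clone()  # unmodified value for metrics/results
    scaler.scale(loss).backward()           # the scaled tensor is only used for backward
    return untouched_loss                   # callers see the unscaled loss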


@@ -134,7 +134,10 @@ class DeterministicModel(LightningModule):
return torch.optim.Adam(self.parameters(), lr=0)
def backward(self, trainer, loss, optimizer, optimizer_idx):
assert loss == 171.0
if self.trainer.precision == 16:
assert loss > 171 * 1000
else:
assert loss == 171.0
loss.backward()
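The 16-bit branch above works because torch.cuda.amp.GradScaler starts from a default init_scale of 2**16 = 65536, so the scaled loss reaching backward() is roughly 171 * 65536, far above the 171 * 1000 threshold the assertion uses:

init_scale = 2 ** 16                     # GradScaler's documented default init_scale
assert 171.0 * init_scale > 171 * 1000   # 11,206,656 >> 171,000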


@@ -1,7 +1,11 @@
from pytorch_lightning import Trainer
from tests.base.deterministic_model import DeterministicModel
import pytest
import torch
@pytest.mark.spawn
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_training_step_dict(tmpdir):
"""
Tests that only training_step can be used
@@ -13,6 +17,8 @@ def test_training_step_dict(tmpdir):
trainer = Trainer(
default_root_dir=tmpdir,
fast_dev_run=True,
precision=16,
gpus=1,
weights_summary=None,
)
trainer.fit(model)
@@ -33,13 +39,16 @@ def test_training_step_dict(tmpdir):
train_step_out = out.training_step_output_for_epoch_end
pbar_metrics = train_step_out['progress_bar']
assert 'loss' in train_step_out
assert 'log' in train_step_out
assert 'progress_bar' in train_step_out
assert train_step_out['train_step_test'] == 549
assert pbar_metrics['pbar_acc1'] == 17.0
assert pbar_metrics['pbar_acc2'] == 19.0
# make sure the optimizer closure returns the correct things
opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
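The expected closure loss is just the deterministic model's fixed per-batch value, and, thanks to the untouched_loss change in the train loop above, it stays unscaled even though this test now runs with precision=16:

assert (42.0 * 3) + (15.0 * 3) == 171.0  # same value the backward() hook asserts without AMP scaling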
def training_step_with_step_end(tmpdir):
"""