removed reduce on non-loss outputs from dp (#78)

* removed reduce on non-loss outputs from dp

* fixed val reduce
William Falcon 2019-08-08 12:06:29 -04:00 committed by GitHub
parent fcea3971a8
commit 8cd764a151
2 changed files with 31 additions and 9 deletions
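
Background for the change below: when a LightningModule runs under dp (torch.nn.DataParallel), whatever the model returns is gathered across replicas, so a scalar such as a loss comes back as a tensor with one element per GPU. A minimal sketch of that behaviour, using an illustrative ToyModule that is not part of this repo:

import torch
import torch.nn as nn

class ToyModule(nn.Module):
    # illustrative module only; each replica returns a scalar for its shard
    def forward(self, x):
        return x.mean()

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(ToyModule().cuda(), device_ids=[0, 1])
    out = model(torch.randn(8, 3).cuda())
    # `out` now has one element per GPU rather than being a scalar,
    # so it must be reduced manually (e.g. torch.mean) before it is
    # logged or accumulated as a single value
    loss = torch.mean(out)

This is why the template model and the trainer below reduce dp outputs explicitly instead of assuming scalars.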


@@ -151,12 +151,23 @@ class LightningTemplateModel(LightningModule):
         val_loss_mean = 0
         val_acc_mean = 0
         for output in outputs:
-            val_loss_mean += output['val_loss']
-            val_acc_mean += output['val_acc']
+            val_loss = output['val_loss']
+            # reduce manually when using dp
+            if self.trainer.use_dp:
+                val_loss = torch.mean(val_loss)
+            val_loss_mean += val_loss
+            # reduce manually when using dp
+            val_acc = output['val_acc']
+            if self.trainer.use_dp:
+                val_acc = torch.mean(val_acc)
+            val_acc_mean += val_acc
         val_loss_mean /= len(outputs)
         val_acc_mean /= len(outputs)
-        tqdm_dic = {'val_loss': val_loss_mean.item(), 'val_acc': val_acc_mean.item()}
+        tqdm_dic = {'val_loss': val_loss_mean, 'val_acc': val_acc_mean}
         return tqdm_dic
     # ---------------------
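
Read as a whole, the updated validation_end hook follows this pattern; a self-contained sketch of the method after the change (surrounding class omitted):

import torch

def validation_end(self, outputs):
    # aggregate per-batch validation outputs into epoch-level metrics
    val_loss_mean = 0
    val_acc_mean = 0
    for output in outputs:
        val_loss = output['val_loss']
        val_acc = output['val_acc']
        # under dp each metric arrives with one value per GPU,
        # so reduce it to a scalar before accumulating
        if self.trainer.use_dp:
            val_loss = torch.mean(val_loss)
            val_acc = torch.mean(val_acc)
        val_loss_mean += val_loss
        val_acc_mean += val_acc
    val_loss_mean /= len(outputs)
    val_acc_mean /= len(outputs)
    return {'val_loss': val_loss_mean, 'val_acc': val_acc_mean}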


@@ -399,13 +399,14 @@ class Trainer(TrainerIO):
             output = model(data_batch, batch_i)
         elif self.use_dp:
             output = model(data_batch, batch_i)
-            output = reduce_distributed_output(output, len(self.data_parallel_device_ids))
         elif self.single_gpu:
             # put inputs on gpu manually
             gpu_id = self.data_parallel_device_ids[0]
             for i, x in enumerate(data_batch):
                 if isinstance(x, torch.Tensor):
                     data_batch[i] = x.cuda(gpu_id)
             # do non dp, ddp step
             output = model.validation_step(data_batch, batch_i)
         else:
@@ -862,7 +863,6 @@ We recommend you switch to ddp if you want to use amp
             output = self.model(data_batch, batch_nb)
         elif self.use_dp:
             output = self.model(data_batch, batch_nb)
-            output = reduce_distributed_output(output, len(self.data_parallel_device_ids))
         elif self.single_gpu:
             gpu_id = self.data_parallel_device_ids[0]
             for i, x in enumerate(data_batch):
@@ -874,7 +874,14 @@ We recommend you switch to ddp if you want to use amp
             output = self.model.training_step(data_batch, batch_nb)
         try:
-            model_specific_tqdm_metrics_dic = output['prog']
+            prog_output = output['prog']
+            # reduce prog metrics for tqdm when using dp
+            if self.use_dp:
+                nb_gpus = len(self.data_parallel_device_ids)
+                prog_output = reduce_distributed_output(prog_output, nb_gpus)
+            model_specific_tqdm_metrics_dic = prog_output
         except Exception:
             model_specific_tqdm_metrics_dic = {}
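
reduce_distributed_output is a helper defined elsewhere in the trainer module and its implementation is not shown in this diff. As a rough mental model only, a simplified sketch that averages dp-gathered tensors and recurses into dicts could look like this (the real helper may differ):

import torch

def reduce_distributed_output(output, nb_gpus):
    # simplified sketch: collapse per-GPU values down to single scalars
    if nb_gpus <= 1:
        return output
    if isinstance(output, dict):
        # recurse into dict outputs such as {'loss': ..., 'prog': {...}}
        return {k: reduce_distributed_output(v, nb_gpus) for k, v in output.items()}
    if isinstance(output, torch.Tensor) and output.dim() > 0:
        # a tensor with one entry per GPU becomes its mean
        return output.mean()
    return output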
@@ -886,6 +893,10 @@ We recommend you switch to ddp if you want to use amp
         if type(output) is torch.Tensor:
             loss = output
+        # when using dp need to reduce the loss
+        if self.use_dp:
+            loss = reduce_distributed_output(loss, len(self.data_parallel_device_ids))
         self.__add_tqdm_metrics(model_specific_tqdm_metrics_dic)
         # backward pass
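
The reason the loss has to be reduced before the backward pass: Tensor.backward() without an explicit gradient argument only works on scalar tensors, and under dp the gathered loss has one element per GPU. A minimal illustration (values are made up):

import torch

per_gpu_loss = torch.tensor([0.9, 1.1], requires_grad=True)  # one value per GPU
# per_gpu_loss.backward()  # RuntimeError: grad can be implicitly created only for scalar outputs
loss = torch.mean(per_gpu_loss)  # reduce to a scalar first
loss.backward()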
@@ -968,12 +979,12 @@ We recommend you switch to ddp if you want to use amp
         # use full val set on end of epoch
         # use a small portion otherwise
         max_batches = None if not self.fast_dev_run else 1
-        model_specific_tqdm_metrics_dic = self.validate(
+        validation_results = self.validate(
             self.model,
             self.val_dataloader,
             max_batches
         )
-        self.__add_tqdm_metrics(model_specific_tqdm_metrics_dic)
+        self.__add_tqdm_metrics(validation_results)
         # hook
         if self.__is_function_implemented('on_post_performance_check'):