From b0fae555718a14fac66b70b5f5e20e0b8ed5d65b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 7 Aug 2019 07:42:14 -0400 Subject: [PATCH] fixed restore location --- pytorch_lightning/models/trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 7aa1c037a7..cbec522817 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -621,9 +621,6 @@ class Trainer(TrainerIO): ref_model.trainer = self ref_model.experiment = self.experiment - # restore training and model - self.restore_state_if_existing_checkpoint() - # run tiny validation to make sure program won't crash during val _ = self.validate(model, self.val_dataloader, max_batches=self.nb_sanity_val_steps) @@ -635,8 +632,12 @@ class Trainer(TrainerIO): # if cluster resets state, the model will update with the saved weights self.model = model + # restore training and model before hpc call + self.restore_state_if_existing_checkpoint() + # enable cluster checkpointing # also restores training state + # hpc checkpoint overrides any other checkpoints loaded before if self.cluster is not None: # pragma: no cover self.enable_auto_hpc_walltime_manager()