Register the HPC save function on proc 0 only; register the HPC load function on all procs

This commit is contained in:
William Falcon 2019-08-01 16:19:04 -04:00
parent 00e851958c
commit ef6d5a412c
2 changed files with 11 additions and 8 deletions

View File

@@ -612,7 +612,7 @@ class Trainer(TrainerIO):
# enable cluster checkpointing
# also restores training state
-if self.cluster is not None and self.proc_rank == 0: # pragma: no cover
+if self.cluster is not None: # pragma: no cover
self.enable_auto_hpc_walltime_manager()
# ---------------------------

View File

@@ -102,6 +102,8 @@ class TrainerIO(object):
return
# allow test tube to handle model checkpointing automatically
# only on proc 0 so we don't trigger world_size resubmits
if self.proc_rank == 0:
self.cluster.set_checkpoint_save_function(
self.hpc_save,
kwargs={
@@ -109,6 +111,7 @@ class TrainerIO(object):
'experiment': self.experiment
}
)
self.cluster.set_checkpoint_load_function(
self.hpc_load,
kwargs={