From 51a5cc36e3e7e5f0ed99371c996797de7506a549 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 11:50:02 -0400 Subject: [PATCH 1/7] added checkpoint test on cpu --- tests/test_models.py | 53 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/test_models.py b/tests/test_models.py index c20c927d6a..2dc05489de 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -43,6 +43,59 @@ def test_dp_output_reduce(): assert reduced['b']['c'] == out['b']['c'] +def test_cpu_slurm_managed(): + """ + SLURM checkpointing works + :return: + """ + hparams = get_hparams() + model = LightningTestModel(hparams) + + trainer_options = dict( + max_nb_epochs=1, + ) + + save_dir = init_save_dir() + + # exp file to get meta + exp = get_exp(False) + exp.argparse(hparams) + exp.save() + + # exp file to get weights + checkpoint = ModelCheckpoint(save_dir) + + # add these to the trainer options + trainer_options['checkpoint_callback'] = checkpoint + trainer_options['experiment'] = exp + + # fit model + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + + # correct result and ok accuracy + assert result == 1, 'amp + ddp model failed to complete' + + # test model loading with a map_location + pretrained_model = load_model(exp, save_dir, True) + + # test model preds + run_prediction(model.test_dataloader, pretrained_model) + + trainer.model = pretrained_model + trainer.optimizers = pretrained_model.configure_optimizers() + + # test HPC loading / saving + trainer.hpc_save(save_dir, exp) + trainer.hpc_load(save_dir, on_gpu=False) + + # test freeze on gpu + model.freeze() + model.unfreeze() + + clear_save_dir() + + def test_amp_gpu_ddp_slurm_managed(): """ Make sure DDP + AMP work From 2ee8f157ce851eaf5ae9597397aae22d57429a13 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 11:51:25 -0400 Subject: [PATCH 2/7] added checkpoint test on cpu --- tests/test_models.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 2dc05489de..fd2140092f 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -45,7 +45,7 @@ def test_dp_output_reduce(): def test_cpu_slurm_managed(): """ - SLURM checkpointing works + Verify model save/load/checkpoint on CPU :return: """ hparams = get_hparams() From 1a835969a6d76e31aed12e4abdbf0b9267b8f01c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 12:14:58 -0400 Subject: [PATCH 3/7] added saving tests to cpu --- pytorch_lightning/root_module/model_saving.py | 4 +- tests/test_models.py | 39 ++++++++++++------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/root_module/model_saving.py b/pytorch_lightning/root_module/model_saving.py index 3d242adadc..2537785175 100644 --- a/pytorch_lightning/root_module/model_saving.py +++ b/pytorch_lightning/root_module/model_saving.py @@ -86,7 +86,7 @@ class TrainerIO(object): # -------------------- # HPC IO # -------------------- - def enable_auto_hpc_walltime_manager(self): # pragma: no cover + def enable_auto_hpc_walltime_manager(self): if self.cluster is None: return @@ -157,6 +157,8 @@ class TrainerIO(object): # do the actual save torch.save(checkpoint_dict, filepath) + return filepath + def hpc_load(self, folderpath, on_gpu): filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath)) diff --git a/tests/test_models.py b/tests/test_models.py index fd2140092f..455111d460 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -3,7 +3,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.examples.new_project_templates.lightning_module_template import LightningTemplateModel from pytorch_lightning.testing_models.lm_test_module import LightningTestModel from argparse import Namespace -from test_tube import Experiment +from test_tube import Experiment, SlurmCluster from pytorch_lightning.callbacks import ModelCheckpoint, 
EarlyStopping from pytorch_lightning.utils.debugging import MisconfigurationException from pytorch_lightning.root_module import memory @@ -43,7 +43,7 @@ def test_dp_output_reduce(): assert reduced['b']['c'] == out['b']['c'] -def test_cpu_slurm_managed(): +def test_cpu_slurm_saving_loading(): """ Verify model save/load/checkpoint on CPU :return: @@ -51,10 +51,6 @@ def test_cpu_slurm_managed(): hparams = get_hparams() model = LightningTestModel(hparams) - trainer_options = dict( - max_nb_epochs=1, - ) - save_dir = init_save_dir() # exp file to get meta @@ -62,20 +58,28 @@ def test_cpu_slurm_managed(): exp.argparse(hparams) exp.save() - # exp file to get weights - checkpoint = ModelCheckpoint(save_dir) - - # add these to the trainer options - trainer_options['checkpoint_callback'] = checkpoint - trainer_options['experiment'] = exp + trainer_options = dict( + max_nb_epochs=1, + cluster=SlurmCluster(), + experiment=exp, + checkpoint_callback=ModelCheckpoint(save_dir) + ) # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) + real_global_step = trainer.global_step - # correct result and ok accuracy + # training complete assert result == 1, 'amp + ddp model failed to complete' + # test saving checkpoint + ckpt_test = os.path.join(save_dir, 'test.ckpt') + trainer.save_checkpoint(ckpt_test) + + # test registering a save function + trainer.enable_auto_hpc_walltime_manager() + # test model loading with a map_location pretrained_model = load_model(exp, save_dir, True) @@ -85,9 +89,14 @@ def test_cpu_slurm_managed(): trainer.model = pretrained_model trainer.optimizers = pretrained_model.configure_optimizers() - # test HPC loading / saving - trainer.hpc_save(save_dir, exp) + # test HPC saving + saved_filepath = trainer.hpc_save(save_dir, exp) + assert os.path.exists(saved_filepath) + + # test HPC loading + trainer.global_step = 20000000 trainer.hpc_load(save_dir, on_gpu=False) + assert trainer.global_step == real_global_step and trainer.global_step != 
20000000 # test freeze on gpu model.freeze() From 84f03a133512a5ae43253b1fad66b333bea910b3 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 12:29:19 -0400 Subject: [PATCH 4/7] added saving tests to cpu --- tests/test_models.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_models.py b/tests/test_models.py index 455111d460..5aa452bc8f 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -8,11 +8,13 @@ from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping from pytorch_lightning.utils.debugging import MisconfigurationException from pytorch_lightning.root_module import memory from pytorch_lightning.models.trainer import reduce_distributed_output +from pytorch_lightning.root_module import model_saving import numpy as np import warnings import torch import os import shutil +import pdb SEED = 2334 torch.manual_seed(SEED) @@ -22,6 +24,25 @@ np.random.seed(SEED) # ------------------------------------------------------------------------ # TESTS # ------------------------------------------------------------------------ +def test_loading_meta_tags(): + hparams = get_hparams() + + save_dir = init_save_dir() + + # save tags + exp = get_exp(False) + exp.tag({'some_str':'a_str', 'an_int': 1, 'a_float': 2.0}) + exp.argparse(hparams) + exp.save() + + # load tags + tags_path = exp.get_data_path(exp.name, exp.version) + '/meta_tags.csv' + tags = model_saving.load_hparams_from_tags_csv(tags_path) + + pdb.set_trace() + assert len(tags) >=3 + + def test_dp_output_reduce(): # test identity when we have a single gpu From fbc1bbd1619189db90bf5cba09d03ee27a059a55 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 12:31:26 -0400 Subject: [PATCH 5/7] added saving tests to cpu --- tests/test_models.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 5aa452bc8f..ff5b4037a0 100644 --- a/tests/test_models.py +++ 
b/tests/test_models.py @@ -27,8 +27,6 @@ np.random.seed(SEED) def test_loading_meta_tags(): hparams = get_hparams() - save_dir = init_save_dir() - # save tags exp = get_exp(False) exp.tag({'some_str':'a_str', 'an_int': 1, 'a_float': 2.0}) @@ -39,8 +37,9 @@ def test_loading_meta_tags(): tags_path = exp.get_data_path(exp.name, exp.version) + '/meta_tags.csv' tags = model_saving.load_hparams_from_tags_csv(tags_path) - pdb.set_trace() - assert len(tags) >=3 + assert tags['batch_size'] == 32 and tags['hidden_dim'] == 1000 + + clear_save_dir() def test_dp_output_reduce(): From a374a7ea00b1ca384413404642a60f71427c31dd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 12:33:35 -0400 Subject: [PATCH 6/7] added saving tests to cpu --- tests/test_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index ff5b4037a0..f2faad0d5b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -27,6 +27,8 @@ np.random.seed(SEED) def test_loading_meta_tags(): hparams = get_hparams() + save_dir = init_save_dir() + # save tags exp = get_exp(False) exp.tag({'some_str':'a_str', 'an_int': 1, 'a_float': 2.0}) @@ -41,7 +43,6 @@ def test_loading_meta_tags(): clear_save_dir() - def test_dp_output_reduce(): # test identity when we have a single gpu From 84edf35f3383c8481395f9cada6b248c639e48a3 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 12:35:28 -0400 Subject: [PATCH 7/7] added saving tests to cpu --- tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index f2faad0d5b..e5781d3211 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -39,7 +39,7 @@ def test_loading_meta_tags(): tags_path = exp.get_data_path(exp.name, exp.version) + '/meta_tags.csv' tags = model_saving.load_hparams_from_tags_csv(tags_path) - assert tags['batch_size'] == 32 and tags['hidden_dim'] == 1000 + assert 
tags.batch_size == 32 and tags.hidden_dim == 1000 clear_save_dir()