From b1fdde5daf80243d81f4d875ffd2805a2401462f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 27 Jun 2019 11:22:13 -0400 Subject: [PATCH] prog bar option --- docs/Trainer/Training Loop.md | 34 +++++++++++++++++++++++++++ docs/Trainer/index.md | 36 +---------------------------- pytorch_lightning/models/trainer.py | 14 +++++------ pytorch_lightning/trainer_main.py | 2 +- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/docs/Trainer/Training Loop.md b/docs/Trainer/Training Loop.md index e69de29bb2..be7eefd283 100644 --- a/docs/Trainer/Training Loop.md +++ b/docs/Trainer/Training Loop.md @@ -0,0 +1,34 @@ +The following options let you customize the Lightning training loop. + +--- +#### Accumulated gradients +Accumulated gradients runs K small batches of size N before doing a backwards pass. The effect is a large effective batch size of size KxN. + +``` {.python} +# default 1 (ie: no accumulated grads) +trainer = Trainer(accumulate_grad_batches=1) +``` + +--- +#### Check GPU usage +Lightning automatically logs gpu usage to the test tube logs. It'll only do it at the metric logging interval, so it doesn't slow down training. + +--- +#### Check which gradients are nan +This option prints a list of tensors with nan gradients.
+``` {.python} +trainer = Trainer(check_grad_nans=False) +``` + +--- +#### Check validation every n epochs +If you have a small dataset you might want to check validation every n epochs +``` {.python} +trainer = Trainer(check_val_every_n_epoch=1) +``` + +--- +#### Display metrics in progress bar +``` {.python} +trainer = Trainer(progress_bar=True) +``` \ No newline at end of file diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 4e51b13c9e..ac04f17c50 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -16,41 +16,6 @@ trainer.fit(model) But of course the fun is in all the advanced things it can do: -``` {.python} -from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from test_tube import Experiment, SlurmCluster - -trainer = Trainer( - experiment=Experiment, - checkpoint_callback=ModelCheckpoint, - early_stop_callback=EarlyStopping, - cluster=SlurmCluster, - process_position=0, - current_gpu_name=0, - gpus=None, - enable_tqdm=True, - overfit_pct=0.0, - track_grad_norm=-1, - check_val_every_n_epoch=1, - fast_dev_run=False, - accumulate_grad_batches=1, - enable_early_stop=True, max_nb_epochs=5, min_nb_epochs=1, - train_percent_check=1.0, - val_percent_check=1.0, - test_percent_check=1.0, - val_check_interval=0.95, - log_save_interval=1, add_log_row_interval=1, - lr_scheduler_milestones=None, - use_amp=False, - check_grad_nans=False, - amp_level='O2', - nb_sanity_val_steps=5): -) -``` - - -Things you can do with the trainer module: - **Training loop** - Accumulate gradients @@ -58,6 +23,7 @@ Things you can do with the trainer module: - Check which gradients are nan - Check validation every n epochs - Display metrics in progress bar +- Display the parameter count by layer - Force training for min or max epochs - Inspect gradient norms - Learning rate annealing diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 984dcb3836..b2f9bdc13d 100644 --- a/pytorch_lightning/models/trainer.py 
+++ b/pytorch_lightning/models/trainer.py @@ -37,7 +37,7 @@ class Trainer(TrainerIO): process_position=0, current_gpu_name=0, gpus=None, - enable_tqdm=True, + progress_bar=True, overfit_pct=0.0, track_grad_norm=-1, check_val_every_n_epoch=1, @@ -58,7 +58,7 @@ class Trainer(TrainerIO): self.track_grad_norm = track_grad_norm self.fast_dev_run = fast_dev_run self.on_gpu = gpus is not None and torch.cuda.is_available() - self.enable_tqdm = enable_tqdm + self.progress_bar = progress_bar self.experiment = experiment self.exp_save_path = experiment.get_data_path(experiment.name, experiment.version) self.cluster = cluster @@ -206,7 +206,7 @@ class Trainer(TrainerIO): outputs.append(output) # batch done - if self.enable_tqdm and self.prog_bar is not None: + if self.progress_bar and self.prog_bar is not None: self.prog_bar.update(1) # give model a chance to do something with the outputs @@ -307,7 +307,7 @@ class Trainer(TrainerIO): self.batch_loss_value = 0 # accumulated grads # init progbar when requested - if self.enable_tqdm: + if self.progress_bar: self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position) for batch_nb, data_batch in enumerate(self.tng_dataloader): @@ -403,7 +403,7 @@ class Trainer(TrainerIO): if response == -1: return -1 - if self.enable_tqdm: + if self.progress_bar: self.prog_bar.update(1) # forward pass @@ -453,7 +453,7 @@ class Trainer(TrainerIO): self.avg_loss = np.mean(self.running_loss[-100:]) # update progbar - if self.enable_tqdm: + if self.progress_bar: # add model specific metrics tqdm_metrics = self.__tng_tqdm_dic self.prog_bar.set_postfix(**tqdm_metrics) @@ -495,7 +495,7 @@ class Trainer(TrainerIO): print(e) print(traceback.print_exc()) - if self.enable_tqdm: + if self.progress_bar: # add model specific metrics tqdm_metrics = self.__tng_tqdm_dic self.prog_bar.set_postfix(**tqdm_metrics) diff --git a/pytorch_lightning/trainer_main.py b/pytorch_lightning/trainer_main.py index d10e148809..8389b5ec48 100644 --- 
a/pytorch_lightning/trainer_main.py +++ b/pytorch_lightning/trainer_main.py @@ -97,7 +97,7 @@ def main(hparams, cluster, results_dict): experiment=exp, on_gpu=on_gpu, cluster=cluster, - enable_tqdm=hparams.enable_tqdm, + progress_bar=hparams.enable_tqdm, overfit_pct=hparams.overfit, track_grad_norm=hparams.track_grad_norm, fast_dev_run=hparams.fast_dev_run,