From e44644e4ba983b12f32767f7a9c61e92c25dd59c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 27 Jun 2019 13:58:13 -0400 Subject: [PATCH] added val loop options --- docs/Trainer/Logging.md | 39 +++++++++++---- docs/Trainer/index.md | 77 +++++++++++++++-------------- pytorch_lightning/models/trainer.py | 2 +- 3 files changed, 71 insertions(+), 47 deletions(-) diff --git a/docs/Trainer/Logging.md b/docs/Trainer/Logging.md index e575b833d0..1f57439ab2 100644 --- a/docs/Trainer/Logging.md +++ b/docs/Trainer/Logging.md @@ -9,15 +9,6 @@ trainer = Trainer(progress_bar=True) ``` - ---- -#### Print which gradients are nan -This option prints a list of tensors with nan gradients. -``` {.python} -# DEFAULT -trainer = Trainer(print_nan_grads=False) -``` - --- #### Process position When running multiple models on the same machine we want to decide which progress bar to use. @@ -29,3 +20,33 @@ trainer = Trainer(process_position=0) # if this is the second model on the node, show the second progress bar below trainer = Trainer(process_position=1) ``` + +--- +#### Print which gradients are nan +This option prints a list of tensors with nan gradients. +``` {.python} +# DEFAULT +trainer = Trainer(print_nan_grads=False) +``` + +--- +#### Save a snapshot of all hyperparameters +Whenever you call .save() on the test-tube experiment it logs all the hyperparameters in current use. +Give lightning a test-tube Experiment object to automate this for you. 
+ +--- +#### Log metric row every k batches +Every k batches lightning will make an entry in the metrics log +``` {.python} +# DEFAULT (ie: add a row to the metrics log every 10 batches) +trainer = Trainer(add_log_row_interval=10) +``` + +--- +#### Write logs file to csv every k batches +Every k batches, lightning will write the new logs to disk +``` {.python} +# DEFAULT (ie: save a .csv log file every 100 batches) +trainer = Trainer(log_save_interval=100) +``` + diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index ff5a96c01a..f30684db91 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -16,6 +16,46 @@ trainer.fit(model) But of course the fun is in all the advanced things it can do: + +**Checkpointing** + +- Model saving +- Model loading + +**Computing cluster (SLURM)** + +- Automatic checkpointing +- Automatic saving, loading +- Running grid search on a cluster +- Walltime auto-resubmit + +**Debugging** + +- [Fast dev run](Debugging/#fast-dev-run) +- [Inspect gradient norms](Debugging/#inspect-gradient-norms) +- [Log GPU usage](Debugging/#log-gpu-usage) +- [Make model overfit on subset of data](Debugging/#make-model-overfit-on-subset-of-data) +- [Print the parameter count by layer](Debugging/#print-the-parameter-count-by-layer) +- [Print which gradients are nan](Debugging/#print-which-gradients-are-nan) + + +**Distributed training** + +- 16-bit mixed precision +- Single-gpu +- Multi-gpu +- Multi-node + +**Experiment Logging** + +- [Display metrics in progress bar](Logging/#display-metrics-in-progress-bar) +- Log arbitrary metrics +- [Process position](Logging/#process-position) +- [Write logs file to csv every k batches](Logging/#write-logs-file-to-csv-every-k-batches) +- [Log metric row every k batches](Logging/#log-metric-row-every-k-batches) +- Save a snapshot of all hyperparameters +- Save a snapshot of the code for a particular model run + **Training loop** - [Accumulate gradients](Training%20Loop/#accumulated-gradients) @@ -32,40 +72,3 
@@ But of course the fun is in all the advanced things it can do: - [Set how much of the test set to check](Validation%20Loop/#set-how-much-of-the-test-set-to-check) - [Set validation check frequency within 1 training epoch](Validation%20Loop/#set-validation-check-frequency-within-1-training-epoch) - [Set the number of validation sanity steps](Validation%20Loop/#set-the-number-of-validation-sanity-steps) - -**Debugging** - -- [Fast dev run](Debugging/#fast-dev-run) -- [Inspect gradient norms](Debugging/#inspect-gradient-norms) -- [Log GPU usage](Debugging/#Log-gpu-usage) -- [Make model overfit on subset of data](Debugging/#make-model-overfit-on-subset-of-data) -- [Print the parameter count by layer](Debugging/#print-the-parameter-count-by-layer) -- [Pring which gradients are nan](Debugging/#print-which-gradients-are-nan) - - -**Experiment Logging** - -- [Display metrics in progress bar](Logging/#display-metrics-in-progress-bar) -- Log arbitrary metrics -- [Process position](Logging/#process-position) -- Save a snapshot of all hyperparameters -- Save a snapshot of the code for a particular model run - -**Distributed training** - -- 16-bit mixed precision -- Single-gpu -- Multi-gpu -- Multi-node - -**Checkpointing** - -- Model saving -- Model loading - -**Computing cluster (SLURM)** - -- Automatic checkpointing -- Automatic saving, loading -- Running grid search on a cluster -- Walltime auto-resubmit \ No newline at end of file diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 9cdb7d782b..059eb09abd 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -45,7 +45,7 @@ class Trainer(TrainerIO): accumulate_grad_batches=1, enable_early_stop=True, max_nb_epochs=1000, min_nb_epochs=1, train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, val_check_interval=0.95, - log_save_interval=1, add_log_row_interval=1, + log_save_interval=100, add_log_row_interval=10, 
lr_scheduler_milestones=None, use_amp=False, print_nan_grads=False,