From acb4ebea56de4c467c5a8bc88bc4b6c6a3818b9c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Thu, 26 Sep 2019 12:02:03 -0400
Subject: [PATCH] added docs for cluster grid search

---
 docs/Trainer/Distributed training.md | 107 +++++++++++++++++++++------
 1 file changed, 86 insertions(+), 21 deletions(-)

diff --git a/docs/Trainer/Distributed training.md b/docs/Trainer/Distributed training.md
index d1c52bd93e..f7d1f0a20f 100644
--- a/docs/Trainer/Distributed training.md
+++ b/docs/Trainer/Distributed training.md
@@ -8,6 +8,8 @@ None of the flags below require changing anything about your lightningModel defi
 Lightning supports two backends. DataParallel and DistributedDataParallel. Both can be used for single-node multi-GPU training.
 For multi-node training you must use DistributedDataParallel.
 
+**Warning: Your cluster must have NCCL installed and you must load it when submitting your SLURM script**
+
 You can toggle between each mode by setting this flag.
 ``` {.python}
 # DEFAULT (when using single GPU or no GPUs)
@@ -117,39 +119,50 @@ trainer = Trainer(gpus=8, distributed_backend='ddp')
 ---
 #### Multi-node
-Multi-node training is easily done by specifying these flags. 
+Multi-node training is easily done by specifying these flags.
 
 ```python
 # train on 12*8 GPUs
 trainer = Trainer(gpus=8, nb_gpu_nodes=12, distributed_backend='ddp')
 ```
 
-In addition, make sure to set up your SLURM job correctly via the [SlurmClusterObject](https://williamfalcon.github.io/test-tube/hpc/SlurmCluster/). In particular, specify the number of tasks per node correctly.
+You must configure your job submission script correctly for the trainer to work. Here is an example
+script for the above trainer configuration.
 
-```python
-cluster = SlurmCluster(
-    hyperparam_optimizer=test_tube.HyperOptArgumentParser(),
-    log_path='/some/path/to/save',
-)
+```sh
+#!/bin/bash -l
 
-# OPTIONAL FLAGS WHICH MAY BE CLUSTER DEPENDENT
-# which interface your nodes use for communication
-cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
+# SLURM SUBMIT SCRIPT
+#SBATCH --nodes=12
+#SBATCH --gres=gpu:8
+#SBATCH --ntasks-per-node=8
+#SBATCH --mem=0
+#SBATCH --time=0-02:00:00
 
-# see output of the NCCL connection process
-# NCCL is how the nodes talk to each other
-cluster.add_command('export NCCL_DEBUG=INFO')
+# activate conda env
+conda activate my_env
 
-# setting a master port here is a good idea.
-cluster.add_command('export MASTER_PORT=%r' % PORT)
+# REQUIRED: Load the latest NCCL version
+# the NCCL version must match the CUDA version used to build your PyTorch distribution
+# (i.e., which instructions did you follow when installing PyTorch)
+# module load NCCL/2.4.7-1-cuda.10.0
 
-# good to load the latest NCCL version
-cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
+# -------------------------
+# OPTIONAL
+# -------------------------
+# debugging flags (optional)
+# export NCCL_DEBUG=INFO
+# export PYTHONFAULTHANDLER=1
 
-# configure cluster
-cluster.per_experiment_nb_nodes = 12
-cluster.per_experiment_nb_gpus = 8
+# on your cluster you might need these:
+# set the network interface
+# export NCCL_SOCKET_IFNAME=^docker0,lo
+# -------------------------
 
-cluster.add_slurm_cmd(cmd='ntasks-per-node', value=8, comment='1 task per gpu')
+# random port between 12k and 32k
+export MASTER_PORT=$((12000 + RANDOM % 20000))
+
+# run the training script
+python my_main_file.py
 ```
 
 **NOTE:** When running in DDP mode, any errors in your code will show up as an NCCL issue.
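The submit script in the hunk above ends by launching `python my_main_file.py`, which is never shown. A minimal sketch of such an entry point, assuming a hypothetical `MyLightningModule` in a hypothetical `my_project` package (any LightningModule works), and reusing the Trainer flags from the multi-node example above:

```python
# my_main_file.py -- minimal sketch of the entry point launched by the submit script above
# (my_project and MyLightningModule are hypothetical placeholders for your own code)
from pytorch_lightning import Trainer

from my_project import MyLightningModule


def main():
    model = MyLightningModule()

    # mirror the multi-node example: 12 nodes x 8 GPUs with the DDP backend
    trainer = Trainer(gpus=8, nb_gpu_nodes=12, distributed_backend='ddp')
    trainer.fit(model)


if __name__ == '__main__':
    main()
```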
@@ -169,6 +182,58 @@ dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
 dataloader = Dataloader(dataset, sampler=dist_sampler)
 ```
 
+#### Auto-slurm-job-submission
+Instead of manually building SLURM scripts, you can use the [SlurmCluster object](https://williamfalcon.github.io/test-tube/hpc/SlurmCluster/) to
+do this for you. The SlurmCluster can also run a grid search if you pass in a [HyperOptArgumentParser](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/).
+
+Here is an example that runs a grid search over 9 combinations of hyperparameters.
+[The full examples are here](https://github.com/williamFalcon/pytorch-lightning/tree/master/examples/new_project_templates/multi_node_examples).
+```python
+# grid search 3 values of learning rate and 3 values of number of layers for your net
+# this generates 9 experiments (lr=1e-3, layers=16), (lr=1e-3, layers=32), (lr=1e-3, layers=64), ... (lr=1e-1, layers=64)
+parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
+parser.opt_list('--learning_rate', default=0.001, type=float, options=[1e-3, 1e-2, 1e-1], tunable=True)
+parser.opt_list('--layers', default=1, type=int, options=[16, 32, 64], tunable=True)
+hyperparams = parser.parse_args()
+
+# SLURM cluster submits 9 jobs, each with its own set of hyperparams
+cluster = SlurmCluster(
+    hyperparam_optimizer=hyperparams,
+    log_path='/some/path/to/save',
+)
+
+# OPTIONAL FLAGS WHICH MAY BE CLUSTER DEPENDENT
+# which interface your nodes use for communication
+cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
+
+# see the output of the NCCL connection process
+# NCCL is how the nodes talk to each other
+cluster.add_command('export NCCL_DEBUG=INFO')
+
+# setting a master port here is a good idea.
+cluster.add_command('export MASTER_PORT=%r' % PORT)
+
+# ************** DON'T FORGET THIS ***************
+# MUST load the latest NCCL version
+cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
+
+# configure cluster
+cluster.per_experiment_nb_nodes = 12
+cluster.per_experiment_nb_gpus = 8
+
+cluster.add_slurm_cmd(cmd='ntasks-per-node', value=8, comment='1 task per gpu')
+
+# submit a script with 9 combinations of hyperparams
+# (lr=1e-3, layers=16), (lr=1e-3, layers=32), (lr=1e-3, layers=64), ... (lr=1e-1, layers=64)
+cluster.optimize_parallel_cluster_gpu(
+    main,
+    nb_trials=9,  # how many permutations of the grid search to run
+    job_name='name_for_squeue'
+)
+```
+
+The other option is to generate the scripts yourself via a bash command or with another library.
+
 ---
 #### Self-balancing architecture
 Here lightning distributes parts of your module across available GPUs to optimize for speed and memory.
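The grid-search example above passes a `main` function to `cluster.optimize_parallel_cluster_gpu` without defining it. A rough per-trial sketch, assuming a hypothetical `MyLightningModule` in a hypothetical `my_project` package, and assuming test-tube may pass extra positional arguments (such as the cluster object) after the trial's hyperparameters, which the catch-all `*cluster_args` absorbs:

```python
# hypothetical per-trial training function for the grid-search example above
# (my_project and MyLightningModule are placeholders for your own code)
from pytorch_lightning import Trainer

from my_project import MyLightningModule


def main(hparams, *cluster_args):
    # each SLURM job calls this once with one hyperparameter combination,
    # e.g. hparams.learning_rate and hparams.layers from the opt_list calls above
    model = MyLightningModule(hparams)

    # same multi-node settings configured on the cluster: 12 nodes x 8 GPUs, DDP backend
    trainer = Trainer(gpus=8, nb_gpu_nodes=12, distributed_backend='ddp')
    trainer.fit(model)
```

Each of the 9 submitted SLURM jobs runs this function once with its own hyperparameter combination from the grid.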