From e739c79819018b24c6d2fb541d8e9bfd947d2af6 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 5 Oct 2019 14:21:12 -0400
Subject: [PATCH] cleaned up demos

---
 examples/multi_node_examples/README.md        | 108 +----------
 examples/multi_node_examples/demo_script.sh   |  66 -------
 ...ulti_node_demo_script.sh => job_submit.sh} |  11 +-
 .../minimal_multi_node_demo.py                |  24 ---
 .../multi_node_cluster_auto_slurm.py          | 172 ------------------
 ...own_slurm_script.py => multi_node_demo.py} |  28 +--
 6 files changed, 14 insertions(+), 395 deletions(-)
 delete mode 100644 examples/multi_node_examples/demo_script.sh
 rename examples/multi_node_examples/{minimal_multi_node_demo_script.sh => job_submit.sh} (71%)
 delete mode 100644 examples/multi_node_examples/minimal_multi_node_demo.py
 delete mode 100644 examples/multi_node_examples/multi_node_cluster_auto_slurm.py
 rename examples/multi_node_examples/{multi_node_own_slurm_script.py => multi_node_demo.py} (58%)

diff --git a/examples/multi_node_examples/README.md b/examples/multi_node_examples/README.md
index 03f1926cfd..da4086f852 100644
--- a/examples/multi_node_examples/README.md
+++ b/examples/multi_node_examples/README.md
@@ -1,107 +1,7 @@
-# Multi-node examples
-Use these templates for multi-node training.
-The main complexity around cluster training is how you submit the SLURM jobs.
+# Multi-node example
 
-## Test-tube
-Lightning uses test-tube to submit SLURM jobs and to run hyperparameter searches on a cluster.
+Run this module to launch a job which runs on 2 nodes each using 2 GPUs.
 
-To run a hyperparameter search, we normally add the values to search to the Hyperparameter optimizer
-```python
-from test_tube import HyperOptArgumentParser
-
-parser = HyperOptArgumentParser(strategy='grid_search')
-parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
-parser.opt_list('--learning_rate', default=0.001, type=float,
-                options=[0.0001, 0.0005, 0.001],
-                tunable=True)
-
-# give your model a chance to add its own parameters
-parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-
-# parse args
-hyperparams = parser.parse_args()
-```
-
-The above sets up a grid search on learning rate and drop probability. You can now add this object to the
-cluster object to perform the grid search:
-```python
-cluster = SlurmCluster(
-    hyperparam_optimizer=hyperparams,
-    log_path='/path/to/log/slurm/files',
-)
-
-# ... configure cluster options
-
-# run grid search on cluster
-nb_trials = 6 # (2 drop probs * 3 lrs)
-cluster.optimize_parallel_cluster_gpu(
-    YourMainFunction,
-    nb_trials=nb_trials,
-    job_name=hyperparams.experiment_name
-)
-```
-
-Running the above will launch 6 jobs, each with a different drop prob and learning rate combination.
-The ```tunable``` parameter must be set to True to add that argument to the space of options, otherwise
-Test-Tube will use the ```default=value```.
-
-
-## SLURM Flags
-However you decide to submit your jobs, debugging requires a few flags. Without these flags, you'll
-see a nccl error instead of the actual error which caused the bug.
-
-```sh
-export NCCL_DEBUG=INFO
-export PYTHONFAULTHANDLER=1
-```
-
-On some clusters you might need to set the network interface with this flag.
-```sh
-export NCCL_SOCKET_IFNAME=^docker0,lo
-```
-
-You might also need to load the latest version of NCCL
-```sh
-module load NCCL/2.4.7-1-cuda.10.0
-```
-
-Finally, you must set the master port (usually a random number between 12k and 20k).
-```sh
-# random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))$
-```
-
-## Simplest example.
-1. Modify this script with your CoolModel file.
-2. Update and submit [this bash script](https://github.com/williamFalcon/pytorch-lightning/blob/master/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh)
 ```bash
-squeue minimal_multi_node_demo_script.sh
-```
-
-## Grid search on a cluster
-
-#### Option 1: Run on cluster using your own SLURM script
-The trainer and model will work on a cluster if you configure your SLURM script correctly.
-
-1. Update [this demo slurm script](https://github.com/williamFalcon/pytorch-lightning/blob/master/examples/new_project_templates/multi_node_examples/demo_script.sh).
-2. Submit the script
-```bash
-$ squeue demo_script.sh
-```
-
-Most people have some way they automatically generate their own scripts.
-To run a grid search this way, you'd need a way to automatically generate scripts using all the combinations of
-hyperparameters to search over.
-
-#### Option 2: Use test-tube for SLURM script
-With test tube we can automatically generate slurm scripts for different hyperparameter options.
-
-To run this demo:
-```bash
-source activate YourCondaEnv
-
-python multi_node_cluster_auto_slurm.py --email your@email.com --gpu_partition your_partition --conda_env YourCondaEnv
-```
-
-That will submit 6 jobs. Each job will have a specific combination of hyperparams. Each job will also run on 2 nodes
-where each node has 8 gpus.
+bash job_submit.sh
+```
\ No newline at end of file
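Worth flagging in the README section removed above: the `MASTER_PORT` snippet carries a stray trailing `$`, and `12000 + RANDOM % 20000` actually yields values in 12000-31999, not the advertised 12k-20k. A minimal corrected sketch in Python, mirroring the `PORT = np.random.randint(12000, 20000, 1)[0]` line of the `multi_node_cluster_auto_slurm.py` file deleted below; the `8000` modulus is my correction, not code from this patch:

```python
import os
import random

# a port in [12000, 20000), matching the comment's intent;
# the bash equivalent would be $((12000 + RANDOM % 8000)),
# with no trailing "$" appended to the value
port = 12000 + random.randrange(8000)
os.environ['MASTER_PORT'] = str(port)
```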
diff --git a/examples/multi_node_examples/demo_script.sh b/examples/multi_node_examples/demo_script.sh
deleted file mode 100644
index 96741e7ae6..0000000000
--- a/examples/multi_node_examples/demo_script.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-#
-# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)
-#################
-
-# set a job name
-#SBATCH --job-name=lightning_test
-#################
-
-# a file for job output, you can check job progress
-#SBATCH --output=/slurm_output_%j.out
-#################
-
-# a file for errors
-#SBATCH --error=/slurm_output_%j.err
-#################
-
-# time needed for job
-#SBATCH --time=01:00:00
-#################
-
-# gpus per node
-#SBATCH --gres=gpu:8
-#################
-
-# cpus per job
-#SBATCH --cpus-per-task=10
-#################
-
-# number of requested nodes
-#SBATCH --nodes=2
-#################
-
-# memory per node (0 means all)
-#SBATCH --mem=0
-#################
-
-# slurm will send a signal this far out before it kills the job
-#SBATCH --signal=USR1@300
-#################
-
-# comment
-#SBATCH --comment=lightning_demo
-#################
-
-# 1 task per gpu
-#SBATCH --ntasks-per-node=8
-#################
-
-source activate YourEnv
-
-# debugging flags (optional)
-export NCCL_DEBUG=INFO
-export PYTHONFAULTHANDLER=1
-
-# on your cluster you might need these:
-# set the network interface
-export NCCL_SOCKET_IFNAME=^docker0,lo
-
-# might need the latest cuda
-module load NCCL/2.4.7-1-cuda.10.0
-
-# random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))$
-
-srun python multi_node_own_slurm_script.py
\ No newline at end of file
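The deleted script above pairs `#SBATCH --gres=gpu:8` with `#SBATCH --ntasks-per-node=8` because DDP runs one process per GPU, so across `--nodes=2` SLURM launches a world of 16 tasks. A sketch of the matching trainer call, using the 0.5-era API that appears throughout this patch (`nb_gpu_nodes`, `distributed_backend`) and the repo's own `LightningTemplateModel`; the `'.'` root dir is a placeholder:

```python
from argparse import ArgumentParser

from pytorch_lightning import Trainer
from examples.basic_examples.lightning_module_template import LightningTemplateModel

# the template model contributes its own CLI arguments
parser = LightningTemplateModel.add_model_specific_args(ArgumentParser(add_help=False), '.')
hparams = parser.parse_args()
model = LightningTemplateModel(hparams)

# 2 nodes x 8 GPUs -> 16 DDP processes; --ntasks-per-node in the
# SBATCH header must equal the per-node GPU count for this to line up
trainer = Trainer(gpus=8, nb_gpu_nodes=2, distributed_backend='ddp')
trainer.fit(model)
```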
--- a/examples/multi_node_examples/minimal_multi_node_demo_script.sh
+++ b/examples/multi_node_examples/job_submit.sh
@@ -1,9 +1,9 @@
 #!/bin/bash -l
 
 # SLURM SUBMIT SCRIPT
-#SBATCH --nodes=4
-#SBATCH --gres=gpu:4
-#SBATCH --ntasks-per-node=4
+#SBATCH --nodes=2
+#SBATCH --gres=gpu:2
+#SBATCH --ntasks-per-node=2
 #SBATCH --mem=0
 #SBATCH --time=0-02:00:00
@@ -23,8 +23,5 @@ conda activate my_env
 # module load NCCL/2.4.7-1-cuda.10.0
 # -------------------------
 
-# random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))
-
 # run script from above
-python minimal_multi_node_demo.py
\ No newline at end of file
+python multi_node_demo.py
\ No newline at end of file

diff --git a/examples/multi_node_examples/minimal_multi_node_demo.py b/examples/multi_node_examples/minimal_multi_node_demo.py
deleted file mode 100644
index cdfaa62927..0000000000
--- a/examples/multi_node_examples/minimal_multi_node_demo.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from pytorch_lightning import Trainer
-from test_tube import Experiment
-import os
-
-
-def main():
-    # use the cool model from the main README.md
-    model = CoolModel()  # noqa: F821
-    exp = Experiment(save_dir=os.getcwd())
-
-    # train on 4 GPUs across 4 nodes
-    trainer = Trainer(
-        experiment=exp,
-        distributed_backend='ddp',
-        max_nb_epochs=10,
-        gpus=4,
-        nb_gpu_nodes=4
-    )
-
-    trainer.fit(model)
-
-
-if __name__ == '__main__':
-    main()
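The deleted `minimal_multi_node_demo.py` instantiated a `CoolModel` it never defined (hence the `# noqa: F821`), pointing readers at the main README of that era. For context, a minimal sketch of what such a module might have looked like under the fast-moving 0.5-era hooks (dict-returning `training_step`, the `pl.data_loader` decorator); the layer size, MNIST data, and hyperparameters here are illustrative assumptions, not code from this patch:

```python
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

import pytorch_lightning as pl


class CoolModel(pl.LightningModule):
    def __init__(self):
        super(CoolModel, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        # 0.5-era training_step returned a dict with the loss
        x, y = batch
        return {'loss': F.cross_entropy(self.forward(x), y)}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)

    @pl.data_loader  # dataloader hooks were decorated in the 0.5-era API
    def train_dataloader(self):
        dataset = MNIST('./', train=True, download=True, transform=transforms.ToTensor())
        return DataLoader(dataset, batch_size=32)
```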
diff --git a/examples/multi_node_examples/multi_node_cluster_auto_slurm.py b/examples/multi_node_examples/multi_node_cluster_auto_slurm.py
deleted file mode 100644
index 352547d710..0000000000
--- a/examples/multi_node_examples/multi_node_cluster_auto_slurm.py
+++ /dev/null
@@ -1,172 +0,0 @@
-"""
-Multi-node example (GPU)
-"""
-import os
-import numpy as np
-from time import sleep
-import torch
-
-from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster
-from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
-
-from examples.basic_examples.lightning_module_template import LightningTemplateModel
-
-PORT = np.random.randint(12000, 20000, 1)[0]
-SEED = 2334
-torch.manual_seed(SEED)
-np.random.seed(SEED)
-
-
-def main_local(hparams):
-    main(hparams, None, None)
-
-
-def main(hparams, cluster):
-    """
-    Main training routine specific for this project
-    :param hparams:
-    :return:
-    """
-    # ------------------------
-    # 1 INIT LIGHTNING MODEL
-    # ------------------------
-    print('loading model...')
-    model = LightningTemplateModel(hparams)
-    print('model built')
-
-    # ------------------------
-    # 2 INIT TEST TUBE EXP
-    # ------------------------
-    # when using grid search, it's possible for all models to start at once
-    # and use the same test tube experiment version
-    relative_node_id = int(os.environ['SLURM_NODEID'])
-    sleep(relative_node_id + 1)
-
-    # init experiment
-    exp = Experiment(
-        name=hyperparams.experiment_name,
-        save_dir=hyperparams.test_tube_save_path,
-        autosave=False,
-        version=hparams.hpc_exp_number,  # match the slurm job version number
-        description='test demo'
-    )
-
-    exp.argparse(hparams)
-    exp.save()
-
-    # ------------------------
-    # 4 INIT TRAINER
-    # ------------------------
-    trainer = Trainer(
-        experiment=exp,
-        gpus=hparams.per_experiment_nb_gpus,
-        nb_gpu_nodes=hyperparams.nb_gpu_nodes,
-        distributed_backend=hyperparams.distributed_backend
-    )
-
-    # ------------------------
-    # 5 START TRAINING
-    # ------------------------
-    trainer.fit(model)
-
-
-def optimize_on_cluster(hyperparams):
-    # enable cluster training
-    # log all scripts to the test tube folder
-    cluster = SlurmCluster(
-        hyperparam_optimizer=hyperparams,
-        log_path=hyperparams.slurm_log_path,
-    )
-
-    # email for cluster coms
-    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)
-
-    # configure cluster
-    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
-    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
-    cluster.job_time = '2:00:00'
-    cluster.gpu_type = hyperparams.gpu_type
-    cluster.memory_mb_per_node = 0
-
-    # any modules for code to run in env
-    cluster.add_command(f'source activate {hyperparams.conda_env}')
-
-    # set DDP master port
-    cluster.add_command(f'export MASTER_PORT={PORT}')
-
-    # OPTIONAL for debugging
-    # without these flags errors in your code will
-    # appear to be nccl errors
-    cluster.add_command('export NCCL_DEBUG=INFO')
-    cluster.add_command('export PYTHONFAULTHANDLER=1')
-
-    # depending on your cluster config, you probably want
-    # to limit the wired connection device
-    # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
-
-    # depending on your cluster, you might need to load
-    # the latest NCCL version
-    # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
-
-    # run only on 32GB voltas
-    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
-                          comment='your cluster might need this argument')
-
-    # run hopt
-    # creates and submits jobs to slurm
-    cluster.optimize_parallel_cluster_gpu(
-        main,
-        nb_trials=hyperparams.num_hyperparam_trials,
-        job_name=hyperparams.experiment_name
-    )
-
-
-if __name__ == '__main__':
-
-    # use default args
-    root_dir = os.path.dirname(os.path.realpath(__file__))
-    demo_log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')
-
-    checkpoint_dir = os.path.join(demo_log_dir, 'model_weights')
-    test_tube_dir = os.path.join(demo_log_dir, 'test_tube_data')
-    slurm_out_dir = os.path.join(demo_log_dir, 'slurm_scripts')
-
-    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
-
-    # cluster args not defined inside the model
-
-    parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
-                               default=8, help='how many gpus to use in a node')
-    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=2,
-                               help='how many nodes to use in a cluster')
-    parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
-                               help='where to save logs')
-    parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir,
-                               help='where to save slurm meta')
-    parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
-                               help='where to save model')
-    parent_parser.add_argument('--distributed_backend', type=str, default='ddp',
-                               help='ddp or ddp2')
-    parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
-                               help='test tube exp name')
-    parent_parser.add_argument('--num_hyperparam_trials', type=int, default=6,
-                               help='how many grid search trials to run')
-
-    parent_parser.add_argument('--email', type=str, default='add@email.com',
-                               help='email for jobs')
-    parent_parser.add_argument('--conda_env', type=str, default='base',
-                               help='email for jobs')
-    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
-    parent_parser.add_argument('--gpu_type', type=str, default='2080ti', help='consult your cluster manual')
-
-    # allow model to overwrite or extend args
-    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-    hyperparams = parser.parse_args()
-
-    # ---------------------
-    # RUN TRAINING
-    # ---------------------
-    # run on HPC cluster
-    print('RUNNING ON SLURM CLUSTER')
-    optimize_on_cluster(hyperparams)
\ No newline at end of file
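Condensing what the file deleted above (and the removed README section) demonstrated: test-tube builds one SLURM script per hyperparameter combination and submits them all. A sketch using the same test-tube calls the deleted code used; the `train_fn` stub, log path, and job name are placeholders:

```python
from test_tube import HyperOptArgumentParser, SlurmCluster


def train_fn(hparams, cluster):
    # one SLURM job lands here per hyperparameter combination
    print(hparams)


parser = HyperOptArgumentParser(strategy='grid_search')
# tunable=True puts an argument into the search space;
# otherwise test-tube just uses its default value
parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
parser.opt_list('--learning_rate', default=0.001, type=float,
                options=[0.0001, 0.0005, 0.001], tunable=True)
hyperparams = parser.parse_args()

cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path='/path/to/log/slurm/files')

# 2 drop probs x 3 learning rates -> 6 submitted jobs
cluster.optimize_parallel_cluster_gpu(train_fn, nb_trials=6, job_name='lightning_demo')
```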
diff --git a/examples/multi_node_examples/multi_node_own_slurm_script.py b/examples/multi_node_examples/multi_node_demo.py
similarity index 58%
rename from examples/multi_node_examples/multi_node_own_slurm_script.py
rename to examples/multi_node_examples/multi_node_demo.py
index 43662d3f61..577336b888 100644
--- a/examples/multi_node_examples/multi_node_own_slurm_script.py
+++ b/examples/multi_node_examples/multi_node_demo.py
@@ -5,7 +5,7 @@ import os
 
 import numpy as np
 import torch
-from test_tube import HyperOptArgumentParser, Experiment
+from argparse import ArgumentParser
 
 from pytorch_lightning import Trainer
 from examples.basic_examples.lightning_module_template import LightningTemplateModel
@@ -25,42 +25,26 @@ def main(hparams):
     # ------------------------
     model = LightningTemplateModel(hparams)
 
-    # ------------------------
-    # 2 INIT TEST TUBE EXP
-    # ------------------------
-    # init experiment
-    exp = Experiment(
-        name='test_exp',
-        save_dir=hyperparams.log_dir,
-        autosave=False,
-        description='test demo'
-    )
-
     # ------------------------
     # 2 INIT TRAINER
     # ------------------------
     trainer = Trainer(
-        experiment=exp,
-        gpus=8,
+        gpus=2,
         nb_gpu_nodes=2
     )
 
     # ------------------------
-    # 5 START TRAINING
+    # 3 START TRAINING
     # ------------------------
     trainer.fit(model)
 
 
 if __name__ == '__main__':
-    # use current dir for logging
+
     root_dir = os.path.dirname(os.path.realpath(__file__))
-    log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')
+    parent_parser = ArgumentParser(add_help=False)
 
-    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
-    parent_parser.add_argument('--log_dir', type=str, default=log_dir,
-                               help='where to save logs')
-
-    # allow model to overwrite or extend args
+    # each LightningModule defines arguments relevant to it
     parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
     hyperparams = parser.parse_args()
 
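After this patch, the old README's debugging advice survives only as comments in `job_submit.sh` (and the script's SBATCH header means it is normally handed to `sbatch` on most SLURM setups). If a run on the new 2-node x 2-GPU layout hangs or crashes in NCCL, a quick first check is whether SLURM actually launched the task grid the trainer expects. A small sketch, assuming only standard SLURM environment variables:

```python
import os

# printed once per task; for the job that job_submit.sh requests,
# expect SLURM_NNODES=2, SLURM_NTASKS=4, and SLURM_LOCALID in {0, 1}
for var in ('SLURM_JOB_ID', 'SLURM_NNODES', 'SLURM_NODEID',
            'SLURM_NTASKS', 'SLURM_PROCID', 'SLURM_LOCALID'):
    print(var, '=', os.environ.get(var))
```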