From e739c79819018b24c6d2fb541d8e9bfd947d2af6 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 5 Oct 2019 14:21:12 -0400
Subject: [PATCH] cleaned up demos

---
 examples/multi_node_examples/README.md        | 108 +----------
 examples/multi_node_examples/demo_script.sh   |  66 -------
 ...ulti_node_demo_script.sh => job_submit.sh} |  11 +-
 .../minimal_multi_node_demo.py                |  24 ---
 .../multi_node_cluster_auto_slurm.py          | 172 ------------------
 ...own_slurm_script.py => multi_node_demo.py} |  28 +--
 6 files changed, 14 insertions(+), 395 deletions(-)
 delete mode 100644 examples/multi_node_examples/demo_script.sh
 rename examples/multi_node_examples/{minimal_multi_node_demo_script.sh => job_submit.sh} (71%)
 delete mode 100644 examples/multi_node_examples/minimal_multi_node_demo.py
 delete mode 100644 examples/multi_node_examples/multi_node_cluster_auto_slurm.py
 rename examples/multi_node_examples/{multi_node_own_slurm_script.py => multi_node_demo.py} (58%)

diff --git a/examples/multi_node_examples/README.md b/examples/multi_node_examples/README.md
index 03f1926cfd..da4086f852 100644
--- a/examples/multi_node_examples/README.md
+++ b/examples/multi_node_examples/README.md
@@ -1,107 +1,7 @@
-# Multi-node examples
-Use these templates for multi-node training.
-The main complexity around cluster training is how you submit the SLURM jobs.
+# Multi-node example
 
-## Test-tube
-Lightning uses test-tube to submit SLURM jobs and to run hyperparameter searches on a cluster.
+Run this module to launch a job which runs on 2 nodes each using 2 GPUs.
 
-To run a hyperparameter search, we normally add the values to search to the Hyperparameter optimizer
-```python
-from test_tube import HyperOptArgumentParser
-
-parser = HyperOptArgumentParser(strategy='grid_search')
-parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
-parser.opt_list('--learning_rate', default=0.001, type=float,
-                options=[0.0001, 0.0005, 0.001],
-                tunable=True)
-
-# give your model a chance to add its own parameters
-parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-
-# parse args
-hyperparams = parser.parse_args()
-```
-
-The above sets up a grid search on learning rate and drop probability. You can now add this object to the
-cluster object to perform the grid search:
-```python
-cluster = SlurmCluster(
-    hyperparam_optimizer=hyperparams,
-    log_path='/path/to/log/slurm/files',
-)
-
-# ... configure cluster options
-
-# run grid search on cluster
-nb_trials = 6 # (2 drop probs * 3 lrs)
-cluster.optimize_parallel_cluster_gpu(
-    YourMainFunction,
-    nb_trials=nb_trials,
-    job_name=hyperparams.experiment_name
-)
-```
-
-Running the above will launch 6 jobs, each with a different drop prob and learning rate combination.
-The ```tunable``` parameter must be set to True to add that argument to the space of options, otherwise
-Test-Tube will use the ```default=value```.
-
-
-## SLURM Flags
-However you decide to submit your jobs, debugging requires a few flags. Without these flags, you'll
-see a nccl error instead of the actual error which caused the bug.
-
-```sh
-export NCCL_DEBUG=INFO
-export PYTHONFAULTHANDLER=1
-```
-
-On some clusters you might need to set the network interface with this flag.
-```sh
-export NCCL_SOCKET_IFNAME=^docker0,lo
-```
-
-You might also need to load the latest version of NCCL
-```sh
-module load NCCL/2.4.7-1-cuda.10.0
-```
-
-Finally, you must set the master port (usually a random number between 12k and 20k).
-```sh
-# random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))$
-```
-
-## Simplest example.
-1. Modify this script with your CoolModel file.
-2. Update and submit [this bash script](https://github.com/williamFalcon/pytorch-lightning/blob/master/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh)
 ```bash
-squeue minimal_multi_node_demo_script.sh
-```
-
-## Grid search on a cluster
-
-#### Option 1: Run on cluster using your own SLURM script
-The trainer and model will work on a cluster if you configure your SLURM script correctly.
-
-1. Update [this demo slurm script](https://github.com/williamFalcon/pytorch-lightning/blob/master/examples/new_project_templates/multi_node_examples/demo_script.sh).
-2. Submit the script
-```bash
-$ squeue demo_script.sh
-```
-
-Most people have some way they automatically generate their own scripts.
-To run a grid search this way, you'd need a way to automatically generate scripts using all the combinations of
-hyperparameters to search over.
-
-#### Option 2: Use test-tube for SLURM script
-With test tube we can automatically generate slurm scripts for different hyperparameter options.
-
-To run this demo:
-```bash
-source activate YourCondaEnv
-
-python multi_node_cluster_auto_slurm.py --email your@email.com --gpu_partition your_partition --conda_env YourCondaEnv
-```
-
-That will submit 6 jobs. Each job will have a specific combination of hyperparams. Each job will also run on 2 nodes
-where each node has 8 gpus.
+bash job_submit.sh
+```
\ No newline at end of file
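Worth flagging in the README section removed above: the `MASTER_PORT` snippet carries a stray trailing `$`, and `12000 + RANDOM % 20000` actually yields values in 12000-31999, not the advertised 12k-20k. A minimal corrected sketch in Python, mirroring the `PORT = np.random.randint(12000, 20000, 1)[0]` line of the `multi_node_cluster_auto_slurm.py` file deleted below; the `8000` modulus is my correction, not code from this patch:

```python
import os
import random

# a port in [12000, 20000), matching the comment's intent;
# the bash equivalent would be $((12000 + RANDOM % 8000)),
# with no trailing "$" appended to the value
port = 12000 + random.randrange(8000)
os.environ['MASTER_PORT'] = str(port)
```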
diff --git a/examples/multi_node_examples/demo_script.sh b/examples/multi_node_examples/demo_script.sh
deleted file mode 100644
index 96741e7ae6..0000000000
--- a/examples/multi_node_examples/demo_script.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-#
-# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)
-#################
-
-# set a job name
-#SBATCH --job-name=lightning_test
-#################
-
-# a file for job output, you can check job progress
-#SBATCH --output=/slurm_output_%j.out
-#################
-
-# a file for errors
-#SBATCH --error=/slurm_output_%j.err
-#################
-
-# time needed for job
-#SBATCH --time=01:00:00
-#################
-
-# gpus per node
-#SBATCH --gres=gpu:8
-#################
-
-# cpus per job
-#SBATCH --cpus-per-task=10
-#################
-
-# number of requested nodes
-#SBATCH --nodes=2
-#################
-
-# memory per node (0 means all)
-#SBATCH --mem=0
-#################
-
-# slurm will send a signal this far out before it kills the job
-#SBATCH --signal=USR1@300
-#################
-
-# comment
-#SBATCH --comment=lightning_demo
-#################
-
-# 1 task per gpu
-#SBATCH --ntasks-per-node=8
-#################
-
-source activate YourEnv
-
-# debugging flags (optional)
-export NCCL_DEBUG=INFO
-export PYTHONFAULTHANDLER=1
-
-# on your cluster you might need these:
-# set the network interface
-export NCCL_SOCKET_IFNAME=^docker0,lo
-
-# might need the latest cuda
-module load NCCL/2.4.7-1-cuda.10.0
-
-# random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))$
-
-srun python multi_node_own_slurm_script.py
\ No newline at end of file
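The deleted script above pairs `#SBATCH --gres=gpu:8` with `#SBATCH --ntasks-per-node=8` because DDP runs one process per GPU, so across `--nodes=2` SLURM launches a world of 16 tasks. A sketch of the matching trainer call, using the 0.5-era API that appears throughout this patch (`nb_gpu_nodes`, `distributed_backend`) and the repo's own `LightningTemplateModel`; the `'.'` root dir is a placeholder:

```python
from argparse import ArgumentParser

from pytorch_lightning import Trainer
from examples.basic_examples.lightning_module_template import LightningTemplateModel

# the template model contributes its own CLI arguments
parser = LightningTemplateModel.add_model_specific_args(ArgumentParser(add_help=False), '.')
hparams = parser.parse_args()
model = LightningTemplateModel(hparams)

# 2 nodes x 8 GPUs -> 16 DDP processes; --ntasks-per-node in the
# SBATCH header must equal the per-node GPU count for this to line up
trainer = Trainer(gpus=8, nb_gpu_nodes=2, distributed_backend='ddp')
trainer.fit(model)
```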
--- a/examples/multi_node_examples/minimal_multi_node_demo_script.sh
+++ b/examples/multi_node_examples/job_submit.sh
@@ -1,9 +1,9 @@
 #!/bin/bash -l
 
 # SLURM SUBMIT SCRIPT
-#SBATCH --nodes=4
-#SBATCH --gres=gpu:4
-#SBATCH --ntasks-per-node=4
+#SBATCH --nodes=2
+#SBATCH --gres=gpu:2
+#SBATCH --ntasks-per-node=2
 #SBATCH --mem=0
 #SBATCH --time=0-02:00:00
@@ -23,8 +23,5 @@ conda activate my_env
 # module load NCCL/2.4.7-1-cuda.10.0
 # -------------------------
 
-# random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))
-
 # run script from above
-python minimal_multi_node_demo.py
\ No newline at end of file
+python multi_node_demo.py
\ No newline at end of file

diff --git a/examples/multi_node_examples/minimal_multi_node_demo.py b/examples/multi_node_examples/minimal_multi_node_demo.py
deleted file mode 100644
index cdfaa62927..0000000000
--- a/examples/multi_node_examples/minimal_multi_node_demo.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from pytorch_lightning import Trainer
-from test_tube import Experiment
-import os
-
-
-def main():
-    # use the cool model from the main README.md
-    model = CoolModel()  # noqa: F821
-    exp = Experiment(save_dir=os.getcwd())
-
-    # train on 4 GPUs across 4 nodes
-    trainer = Trainer(
-        experiment=exp,
-        distributed_backend='ddp',
-        max_nb_epochs=10,
-        gpus=4,
-        nb_gpu_nodes=4
-    )
-
-    trainer.fit(model)
-
-
-if __name__ == '__main__':
-    main()
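The deleted `minimal_multi_node_demo.py` instantiated a `CoolModel` it never defined (hence the `# noqa: F821`), pointing readers at the main README of that era. For context, a minimal sketch of what such a module might have looked like under the fast-moving 0.5-era hooks (dict-returning `training_step`, the `pl.data_loader` decorator); the layer size, MNIST data, and hyperparameters here are illustrative assumptions, not code from this patch:

```python
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

import pytorch_lightning as pl


class CoolModel(pl.LightningModule):
    def __init__(self):
        super(CoolModel, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        # 0.5-era training_step returned a dict with the loss
        x, y = batch
        return {'loss': F.cross_entropy(self.forward(x), y)}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)

    @pl.data_loader  # dataloader hooks were decorated in the 0.5-era API
    def train_dataloader(self):
        dataset = MNIST('./', train=True, download=True, transform=transforms.ToTensor())
        return DataLoader(dataset, batch_size=32)
```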
diff --git a/examples/multi_node_examples/multi_node_cluster_auto_slurm.py b/examples/multi_node_examples/multi_node_cluster_auto_slurm.py
deleted file mode 100644
index 352547d710..0000000000
--- a/examples/multi_node_examples/multi_node_cluster_auto_slurm.py
+++ /dev/null
@@ -1,172 +0,0 @@
-"""
-Multi-node example (GPU)
-"""
-import os
-import numpy as np
-from time import sleep
-import torch
-
-from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster
-from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
-
-from examples.basic_examples.lightning_module_template import LightningTemplateModel
-
-PORT = np.random.randint(12000, 20000, 1)[0]
-SEED = 2334
-torch.manual_seed(SEED)
-np.random.seed(SEED)
-
-
-def main_local(hparams):
-    main(hparams, None, None)
-
-
-def main(hparams, cluster):
-    """
-    Main training routine specific for this project
-    :param hparams:
-    :return:
-    """
-    # ------------------------
-    # 1 INIT LIGHTNING MODEL
-    # ------------------------
-    print('loading model...')
-    model = LightningTemplateModel(hparams)
-    print('model built')
-
-    # ------------------------
-    # 2 INIT TEST TUBE EXP
-    # ------------------------
-    # when using grid search, it's possible for all models to start at once
-    # and use the same test tube experiment version
-    relative_node_id = int(os.environ['SLURM_NODEID'])
-    sleep(relative_node_id + 1)
-
-    # init experiment
-    exp = Experiment(
-        name=hyperparams.experiment_name,
-        save_dir=hyperparams.test_tube_save_path,
-        autosave=False,
-        version=hparams.hpc_exp_number,  # match the slurm job version number
-        description='test demo'
-    )
-
-    exp.argparse(hparams)
-    exp.save()
-
-    # ------------------------
-    # 4 INIT TRAINER
-    # ------------------------
-    trainer = Trainer(
-        experiment=exp,
-        gpus=hparams.per_experiment_nb_gpus,
-        nb_gpu_nodes=hyperparams.nb_gpu_nodes,
-        distributed_backend=hyperparams.distributed_backend
-    )
-
-    # ------------------------
-    # 5 START TRAINING
-    # ------------------------
-    trainer.fit(model)
-
-
-def optimize_on_cluster(hyperparams):
-    # enable cluster training
-    # log all scripts to the test tube folder
-    cluster = SlurmCluster(
-        hyperparam_optimizer=hyperparams,
-        log_path=hyperparams.slurm_log_path,
-    )
-
-    # email for cluster coms
-    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)
-
-    # configure cluster
-    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
-    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
-    cluster.job_time = '2:00:00'
-    cluster.gpu_type = hyperparams.gpu_type
-    cluster.memory_mb_per_node = 0
-
-    # any modules for code to run in env
-    cluster.add_command(f'source activate {hyperparams.conda_env}')
-
-    # set DDP master port
-    cluster.add_command(f'export MASTER_PORT={PORT}')
-
-    # OPTIONAL for debugging
-    # without these flags errors in your code will
-    # appear to be nccl errors
-    cluster.add_command('export NCCL_DEBUG=INFO')
-    cluster.add_command('export PYTHONFAULTHANDLER=1')
-
-    # depending on your cluster config, you probably want
-    # to limit the wired connection device
-    # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
-
-    # depending on your cluster, you might need to load
-    # the latest NCCL version
-    # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
-
-    # run only on 32GB voltas
-    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
-                          comment='your cluster might need this argument')
-
-    # run hopt
-    # creates and submits jobs to slurm
-    cluster.optimize_parallel_cluster_gpu(
-        main,
-        nb_trials=hyperparams.num_hyperparam_trials,
-        job_name=hyperparams.experiment_name
-    )
-
-
-if __name__ == '__main__':
-
-    # use default args
-    root_dir = os.path.dirname(os.path.realpath(__file__))
-    demo_log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')
-
-    checkpoint_dir = os.path.join(demo_log_dir, 'model_weights')
-    test_tube_dir = os.path.join(demo_log_dir, 'test_tube_data')
-    slurm_out_dir = os.path.join(demo_log_dir, 'slurm_scripts')
-
-    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
-
-    # cluster args not defined inside the model
-
-    parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
-                               default=8, help='how many gpus to use in a node')
-    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=2,
-                               help='how many nodes to use in a cluster')
-    parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
-                               help='where to save logs')
-    parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir,
-                               help='where to save slurm meta')
-    parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
-                               help='where to save model')
-    parent_parser.add_argument('--distributed_backend', type=str, default='ddp',
-                               help='ddp or ddp2')
-    parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
-                               help='test tube exp name')
-    parent_parser.add_argument('--num_hyperparam_trials', type=int, default=6,
-                               help='how many grid search trials to run')
-
-    parent_parser.add_argument('--email', type=str, default='add@email.com',
-                               help='email for jobs')
-    parent_parser.add_argument('--conda_env', type=str, default='base',
-                               help='email for jobs')
-    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
-    parent_parser.add_argument('--gpu_type', type=str, default='2080ti', help='consult your cluster manual')
-
-    # allow model to overwrite or extend args
-    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-    hyperparams = parser.parse_args()
-
-    # ---------------------
-    # RUN TRAINING
-    # ---------------------
-    # run on HPC cluster
-    print('RUNNING ON SLURM CLUSTER')
-    optimize_on_cluster(hyperparams)
\ No newline at end of file
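Condensing what the file deleted above (and the removed README section) demonstrated: test-tube builds one SLURM script per hyperparameter combination and submits them all. A sketch using the same test-tube calls the deleted code used; the `train_fn` stub, log path, and job name are placeholders:

```python
from test_tube import HyperOptArgumentParser, SlurmCluster


def train_fn(hparams, cluster):
    # one SLURM job lands here per hyperparameter combination
    print(hparams)


parser = HyperOptArgumentParser(strategy='grid_search')
# tunable=True puts an argument into the search space;
# otherwise test-tube just uses its default value
parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
parser.opt_list('--learning_rate', default=0.001, type=float,
                options=[0.0001, 0.0005, 0.001], tunable=True)
hyperparams = parser.parse_args()

cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path='/path/to/log/slurm/files')

# 2 drop probs x 3 learning rates -> 6 submitted jobs
cluster.optimize_parallel_cluster_gpu(train_fn, nb_trials=6, job_name='lightning_demo')
```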
diff --git a/examples/multi_node_examples/multi_node_own_slurm_script.py b/examples/multi_node_examples/multi_node_demo.py
similarity index 58%
rename from examples/multi_node_examples/multi_node_own_slurm_script.py
rename to examples/multi_node_examples/multi_node_demo.py
index 43662d3f61..577336b888 100644
--- a/examples/multi_node_examples/multi_node_own_slurm_script.py
+++ b/examples/multi_node_examples/multi_node_demo.py
@@ -5,7 +5,7 @@ import os
 
 import numpy as np
 import torch
-from test_tube import HyperOptArgumentParser, Experiment
+from argparse import ArgumentParser
 
 from pytorch_lightning import Trainer
 from examples.basic_examples.lightning_module_template import LightningTemplateModel
@@ -25,42 +25,26 @@ def main(hparams):
     # ------------------------
     model = LightningTemplateModel(hparams)
 
-    # ------------------------
-    # 2 INIT TEST TUBE EXP
-    # ------------------------
-    # init experiment
-    exp = Experiment(
-        name='test_exp',
-        save_dir=hyperparams.log_dir,
-        autosave=False,
-        description='test demo'
-    )
-
     # ------------------------
     # 2 INIT TRAINER
     # ------------------------
     trainer = Trainer(
-        experiment=exp,
-        gpus=8,
+        gpus=2,
         nb_gpu_nodes=2
     )
 
     # ------------------------
-    # 5 START TRAINING
+    # 3 START TRAINING
     # ------------------------
     trainer.fit(model)
 
 
 if __name__ == '__main__':
-    # use current dir for logging
+
     root_dir = os.path.dirname(os.path.realpath(__file__))
-    log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')
+    parent_parser = ArgumentParser(add_help=False)
 
-    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
-    parent_parser.add_argument('--log_dir', type=str, default=log_dir,
-                               help='where to save logs')
-
-    # allow model to overwrite or extend args
+    # each LightningModule defines arguments relevant to it
     parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
     hyperparams = parser.parse_args()
 
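After this patch, the old README's debugging advice survives only as comments in `job_submit.sh` (and the script's SBATCH header means it is normally handed to `sbatch` on most SLURM setups). If a run on the new 2-node x 2-GPU layout hangs or crashes in NCCL, a quick first check is whether SLURM actually launched the task grid the trainer expects. A small sketch, assuming only standard SLURM environment variables:

```python
import os

# printed once per task; for the job that job_submit.sh requests,
# expect SLURM_NNODES=2, SLURM_NTASKS=4, and SLURM_LOCALID in {0, 1}
for var in ('SLURM_JOB_ID', 'SLURM_NNODES', 'SLURM_NODEID',
            'SLURM_NTASKS', 'SLURM_PROCID', 'SLURM_LOCALID'):
    print(var, '=', os.environ.get(var))
```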