Update multi_node_cluster_auto_slurm.py

2019-09-13 17:05:49 -04:00 · 2019-09-13 17:05:49 -04:00 · fe17d14ade
parent 9576dd28b2
commit fe17d14ade
1 changed files with 10 additions and 0 deletions
--- a/examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py
+++ b/examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py
@ -12,6 +12,7 @@ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

 from examples.new_project_templates.lightning_module_template import LightningTemplateModel

+PORT = np.random.randint(12000, 20000, 1)[0]
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)
@ -111,6 +112,15 @@ def optimize_on_cluster(hyperparams):
    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

+    # export the module-level PORT as MASTER_PORT so DDP uses it as the master port
+    cluster.add_command(f'export MASTER_PORT={PORT}')
+
+    # Optional: you might need some of these on your cluster (uncomment as required)
+#     cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
+#     cluster.add_command('export NCCL_DEBUG=INFO')
+#     cluster.add_command('export PYTHONFAULTHANDLER=1')
+#     cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
+
    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
                          comment='use 32gb gpus')