Update multi_node_cluster_auto_slurm.py

This commit is contained in:
William Falcon 2019-09-14 02:14:08 -04:00 committed by GitHub
parent fe17d14ade
commit f3221a5014
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 13 additions and 5 deletions

View File

@ -115,11 +115,19 @@ def optimize_on_cluster(hyperparams):
# set DDP master port
cluster.add_command(f'export MASTER_PORT={PORT}')
# YOU MIGHT NEED THESE
# cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
# cluster.add_command('export NCCL_DEBUG=INFO')
# cluster.add_command('export PYTHONFAULTHANDLER=1')
# cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
# OPTIONAL for debugging
# without these flags, errors in your own code will
# appear to be NCCL errors
cluster.add_command('export NCCL_DEBUG=INFO')
cluster.add_command('export PYTHONFAULTHANDLER=1')
# depending on your cluster config, you probably want
# to restrict which network interfaces NCCL may use
# (the example below excludes docker0 and loopback)
# cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
# depending on your cluster, you might need to load
# the latest NCCL version
# cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
# run only on 32GB voltas
cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',