diff --git a/examples/new_project_templates/multi_node_examples/demo_script.sh b/examples/new_project_templates/multi_node_examples/demo_script.sh
index 8188f9aced..96741e7ae6 100644
--- a/examples/new_project_templates/multi_node_examples/demo_script.sh
+++ b/examples/new_project_templates/multi_node_examples/demo_script.sh
@@ -53,6 +53,13 @@ source activate YourEnv
 export NCCL_DEBUG=INFO
 export PYTHONFAULTHANDLER=1
 
+# on your cluster you might need these:
+# set the network interface (the leading ^ tells NCCL to exclude docker0 and lo)
+export NCCL_SOCKET_IFNAME=^docker0,lo
+
+# might need to load the latest NCCL/CUDA module
+module load NCCL/2.4.7-1-cuda.10.0
+
 # random port between 12k and 20k
-export MASTER_PORT=$((12000 + RANDOM % 20000))$
+export MASTER_PORT=$((12000 + RANDOM % 8000))
 
diff --git a/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh b/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh
index ffd1b53213..c98cefedb0 100755
--- a/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh
+++ b/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh
@@ -10,5 +10,21 @@
 # activate conda env
 conda activate my_env
 
+# -------------------------
+# debugging flags (optional)
+# export NCCL_DEBUG=INFO
+# export PYTHONFAULTHANDLER=1
+
+# on your cluster you might need these:
+# set the network interface (the leading ^ tells NCCL to exclude docker0 and lo)
+# export NCCL_SOCKET_IFNAME=^docker0,lo
+
+# might need to load the latest NCCL/CUDA module
+# module load NCCL/2.4.7-1-cuda.10.0
+# -------------------------
+
+# random port in [12000, 20000): RANDOM % 8000 yields 0..7999
+export MASTER_PORT=$((12000 + RANDOM % 8000))
+
 # run script from above
 python minimal_multi_node_demo.py
\ No newline at end of file