#!/bin/bash -l
#
# SLURM SUBMIT SCRIPT
#
# Launches multi_node_ddp2_demo.py through srun on 2 nodes, with 2 GPUs per
# node and a single task per node.
#
# Usage:
#   sbatch <this-script> [CONDA_ENV]
#
#   CONDA_ENV  conda environment to activate before launching. When omitted
#              (or empty), `source activate` is invoked without an argument.
#
# NOTE: every #SBATCH directive must stay above the first executable command;
# sbatch stops reading directives once it reaches one.

# number of nodes to allocate
#SBATCH --nodes=2
# generic resources per node: 2 GPUs
#SBATCH --gres=gpu:2
# one task (process) launched by srun on each node
#SBATCH --ntasks-per-node=1
# 0 = request all of the memory available on each node
#SBATCH --mem=0
# wall-clock limit, days-hours:minutes:seconds (2 hours)
#SBATCH --time=0-02:00:00

# activate conda env
# The name is quoted so it is never word-split or glob-expanded. With no
# argument the bare `source activate` call of the original is preserved.
if [[ $# -gt 0 && -n "$1" ]]; then
  source activate "$1"
else
  source activate
fi

# -------------------------
# debugging flags (optional)
# NCCL_DEBUG=INFO makes NCCL log its setup and communication details.
export NCCL_DEBUG=INFO
# PYTHONFAULTHANDLER=1 makes Python dump tracebacks on fatal errors.
export PYTHONFAULTHANDLER=1

# on your cluster you might need these:
# set the network interface
# export NCCL_SOCKET_IFNAME=^docker0,lo

# might need the latest cuda
# module load NCCL/2.4.7-1-cuda.10.0
# -------------------------

# run the training script; srun starts one copy per allocated task
srun python3 multi_node_ddp2_demo.py