28 lines
565 B
Bash
Executable File
28 lines
565 B
Bash
Executable File
#!/bin/bash -l
# NOTE: -l makes bash behave as a login shell; presumably this is what puts
# profile-provided tools (conda, module) on PATH inside the job -- confirm on
# your cluster.

# SLURM SUBMIT SCRIPT
#
# Launches multi_node_ddp2_demo.py through srun under the resource request
# declared by the #SBATCH directives below.
#
# Usage:  sbatch <this-script> [CONDA_ENV]
#   CONDA_ENV  ($1, optional) environment handed to `source activate`.
#              When omitted, `activate` is sourced without an argument,
#              exactly as the unquoted original did.
#
# Resource request (see the sbatch man page for the exact semantics):
#   --nodes=2            two nodes
#   --gres=gpu:2         two "gpu" generic resources, counted per node
#   --ntasks-per-node=1  one task launched on each node
#   --mem=0              zero is the special value for "all memory on the node"
#   --time=0-02:00:00    wall-clock limit as days-hours:minutes:seconds
#
# Keep every #SBATCH line above the first executable command; sbatch stops
# reading directives once it reaches one.
#SBATCH --nodes=2
#SBATCH --gres=gpu:2
#SBATCH --ntasks-per-node=1
#SBATCH --mem=0
#SBATCH --time=0-02:00:00

# activate conda env
if [[ -n "${1:-}" ]]; then
  # Quoted so an environment name or path containing spaces or glob
  # characters reaches `activate` as a single, literal argument.
  # Abort on failure: continuing would run the job with whatever python3
  # happens to be on PATH while holding the multi-node GPU allocation.
  if ! source activate "$1"; then
    printf 'error: could not activate conda env: %s\n' "$1" >&2
    exit 1
  fi
else
  # No environment requested: keep the original best-effort behaviour.
  source activate
fi

# -------------------------
# debugging flags (optional)
# NCCL_DEBUG=INFO asks NCCL to log diagnostic information.
export NCCL_DEBUG=INFO
# PYTHONFAULTHANDLER=1 enables Python's faulthandler (traceback on a crash).
export PYTHONFAULTHANDLER=1

# on your cluster you might need these:
# set the network interface
# export NCCL_SOCKET_IFNAME=^docker0,lo

# might need the latest cuda
# module load NCCL/2.4.7-1-cuda.10.0
# -------------------------

# run the training script as a job step across the allocation
srun python3 multi_node_ddp2_demo.py