#!/bin/bash -l
# SLURM SUBMIT SCRIPT
# (lightning/pl_examples/basic_examples/submit_ddp2_job.sh)
#
# Runs simple_image_classifier.py with the 'ddp2' accelerator on 2 nodes
# with 2 GPUs each.
#
# Usage:
#   sbatch submit_ddp2_job.sh [CONDA_ENV]
#
#   CONDA_ENV  conda environment to activate before launching (optional;
#              when omitted, `activate` is sourced without an argument).
#
# NOTE: sbatch stops reading #SBATCH directives at the first line that is
# neither a comment nor blank, so no command may appear above this block.

#SBATCH --nodes=2
#SBATCH --gres=gpu:2
#SBATCH --ntasks-per-node=1
#SBATCH --mem=0
#SBATCH --time=0-02:00:00

# activate conda env
# ${1:+"$1"} passes the name as a single quoted word when one was given and
# expands to nothing otherwise, so a bare `source activate` still works.
if ! source activate ${1:+"$1"}; then
  # Fail fast: continuing would launch the job with whatever python3 happens
  # to be on PATH.
  printf 'error: could not activate conda environment "%s"\n' "${1:-}" >&2
  exit 1
fi

# -------------------------
# debugging flags (optional)
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1

# on your cluster you might need these:
# set the network interface
# export NCCL_SOCKET_IFNAME=^docker0,lo

# might need the latest cuda
# module load NCCL/2.4.7-1-cuda.10.0
# -------------------------

# Launch the training script through srun.
# --gpus and --num_nodes must stay in sync with --gres and --nodes above.
srun python3 simple_image_classifier.py --accelerator 'ddp2' --gpus 2 --num_nodes 2 --max_epochs 5