#!/bin/bash
# Multi-node distributed training launcher (SLURM batch script).
#
# Reserves 13 exclusive nodes with 8 GPUs each and runs a DeepSpeed
# training job via `accelerate launch` inside an enroot/pyxis container
# (one srun task per node; accelerate fans out to 8 processes per node).
#SBATCH -N 13
#SBATCH -p tp1-user
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=200
#SBATCH --mem=200G
#SBATCH --gres=gpu:8
#SBATCH --time=30-00:00:00
#SBATCH --output=/mnt/home/ntuspeechlabtaipei1/eric/result/%j-slurm.out
#SBATCH --exclude=cnode3-004,cnode3-019

# Fail fast: abort on unhandled errors, unset variables, and failed
# pipeline stages instead of silently launching a broken job.
set -euo pipefail

echo "START TIME: $(date)"

module purge
module load slurm

# Host-side conda env; the container re-activates its own env in
# PRE_LAUNCH below.
source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh
conda activate base

CONTAINER_IMAGE="./eric/trl.sqsh"
GPUS_PER_NODE=8

echo "SLURM_NNODES=${SLURM_NNODES}"
echo "NODELIST=${SLURM_JOB_NODELIST}"
# These two are generally unset in the batch-script context (NODEID is
# per-srun-task; ARRAY_TASK_ID only exists for array jobs) — default to
# empty so `set -u` does not abort.
echo "SLURM_NODEID=${SLURM_NODEID:-}"
echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID:-}"

# Rendezvous endpoint: first node in the allocation + a fixed port.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR
export MASTER_PORT=12345
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 serializes every kernel launch —
# it is a debugging aid and slows training; confirm it is intentional.
export CUDA_LAUNCH_BLOCKING=1

export LD_LIBRARY_PATH=/mnt/home/ntuspeechlabtaipei1/miniconda3/lib64:/mnt/home/ntuspeechlabtaipei1/miniconda3/lib64:/mnt/home/ntuspeechlabtaipei1/local/lib:/mnt/home/ntuspeechlabtaipei1/local/lib:/mnt/home/ntuspeechlabtaipei1/miniconda3/envs/whisper/lib:/usr/local/cuda/lib64:/usr/local/cuda/compat/lib.real:/usr/local/lib/python3.10/dist-packages/torch/lib:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

# srun options as an array so each argument survives quoting intact
# (a whitespace-joined string expanded unquoted is fragile).
SRUN_ARGS=(
    --wait=60
    --kill-on-bad-exit=1
    --mpi=pmix
    --container-image="${CONTAINER_IMAGE}"
    --container-writable
    --container-mounts=/mnt/home/ntuspeechlabtaipei1/:/mnt/home/ntuspeechlabtaipei1/,/mnt/home/ntuspeechlabtaipei1/.cache:/root/.cache
)

# Runs inside the container on every node before the launcher starts.
PRE_LAUNCH="export TORCH_DISTRIBUTED_TIMEOUT=7200; source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh; conda activate base;"

# \${SLURM_NODEID} is deliberately escaped: it must be expanded by the
# inner `bash -c` on each node (per-node rank), not here on the submit
# node. Everything else is expanded now.
LAUNCHER="accelerate launch \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --machine_rank \${SLURM_NODEID} \
    --rdzv_backend c10d \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --deepspeed_config_file /mnt/home/ntuspeechlabtaipei1/ds_config.json \
    --deepspeed_hostfile /mnt/home/ntuspeechlabtaipei1/eric/hostfile \
    --deepspeed_multinode_launcher standard \
    --dynamo_backend no \
    --use_deepspeed \
    --mixed_precision bf16 \
    "

CMD="/mnt/home/ntuspeechlabtaipei1/train_conv_slurm_full.py"

# `clear` was dropped: in a batch job it only writes terminal escape
# codes into the slurm.out log. Capture srun's status so the job's exit
# code reflects training success/failure (the original always exited 0
# via the trailing echo), while still printing END TIME on all paths.
rc=0
srun "${SRUN_ARGS[@]}" bash -c "${PRE_LAUNCH}${LAUNCHER} ${CMD}" || rc=$?
echo "END TIME: $(date)"
exit "$rc"