#!/bin/bash
# SLURM launcher: runs a user-supplied script under srun across 8 nodes
# (8 GPUs each) and automatically requeues the job on timeout or node failure.
# Usage: sbatch launch.sh <training_script>
#
# NOTE: #SBATCH directives are only honored by sbatch when each one sits on
# its own comment line directly after the shebang — they were previously
# fused onto the shebang line and therefore silently ignored.
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --time=3-0
#SBATCH --partition=learnfair
#SBATCH --error=logs/std-%j.err
#SBATCH --output=logs/std-%j.out
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=32
#SBATCH --exclude=learnfair[021,025,045,081,082,089,097,098,101,102,103,105]

set -x
ulimit -c 0

# Fail fast if no script was supplied instead of later chmod-ing nothing.
script_name=${1:?usage: sbatch launch.sh <script>}
CHK_DIR="logs/"   # checkpoint directory
LOG_DIR="logs/"   # log directory

##### Number of total processes
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= $SLURM_JOB_NODELIST"
echo "Number of nodes:= $SLURM_JOB_NUM_NODES"
echo "Ntasks per node:= $SLURM_NTASKS_PER_NODE"
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "

# ---- Distributed-training environment ----------------------------------
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export GLOO_SOCKET_IFNAME=            # Set to your network interface
export NCCL_SOCKET_IFNAME=            # Set to your network interface
export PYTHONUNBUFFERED=0
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_BLOCKING_WAIT=1           # deprecated alias of the TORCH_ variant below
export CUDA_LAUNCH_BLOCKING=1         # serializes CUDA kernels — debug aid, slows training
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_DEBUG=INFO
export NUM_GPU=8
export SLURM_NNODES=8
export SLURM_JOB_NUM_NODES=8
export SLURM_NODEID=${SLURM_NODEID:-0}   # default to 0 if not set
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1

# Find a free port for the rendezvous. NOTE(review): small TOCTOU race — the
# port could be claimed between this probe and the trainer binding it.
MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
DATE="$(date +'%d_%m_%Y_%H_%M_%S')_$$"

# Rank 0 lives on the first node of the allocation.
MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n1)
WORLD_SIZE=$(( NUM_GPU * SLURM_JOB_NUM_NODES ))
RANK=$SLURM_NODEID   # Ensure this is set correctly in your environment

# Export so the script launched by srun can actually see these values —
# previously only MASTER_ADDR was exported.
export MASTER_ADDR MASTER_PORT WORLD_SIZE RANK

echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
echo "WORLD_SIZE : ${WORLD_SIZE}"
echo "RANK : ${RANK}"

#######################################
# Run "$@" under srun, requeuing the SLURM job on timeout or node failure.
# Globals:   SLURM_JOB_ID (read)
# Arguments: command (and args) to hand to srun
# Exits:     0 on success or requeue; srun's exit code on real failure
#######################################
srun_with_requeue() {
  set +e   # we inspect srun's exit status ourselves

  # SLURM sends SIGTERM near the time limit and SIGUSR1 on node failure.
  trap 'echo "Caught SIGTERM signal. Requeuing..."; scontrol requeue "$SLURM_JOB_ID"; exit 0' SIGTERM
  trap 'echo "Caught SIGUSR1 signal (node failure). Requeuing..."; scontrol requeue "$SLURM_JOB_ID"; exit 0' SIGUSR1

  srun --cpu-bind=none "$@"
  local ret=$?

  # 124 = timeout(1) expiry, 143 = 128+SIGTERM: both indicate the time limit.
  if [ "$ret" -eq 124 ] || [ "$ret" -eq 143 ]; then
    echo "Job timed out. Requeuing..."
    scontrol requeue "$SLURM_JOB_ID"
    exit 0
  elif [ "$ret" -ne 0 ]; then
    echo "Error in $1 with exit code $ret. Not requeuing."
    exit "$ret"
  else
    echo "Job completed successfully."
    exit 0
  fi
  # (every branch above exits, so there is nothing to restore here)
}

# Append a one-line audit record for this submission.
{
  echo "--------------------------------------------------"
  echo "Slurm job id | job id | command"
  echo "$SLURM_JOB_ID | $JOB_ID | $script_name"
} >> ~/history.txt

chmod +x "$script_name"
srun_with_requeue "$script_name"