File size: 3,051 Bytes
246c106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/bash
# Slurm batch wrapper: launches the script given as the first argument
# through srun and requeues the job when the step times out or the batch
# shell receives SIGTERM/SIGUSR1.
#
# Usage: sbatch <this-file> <script>
#
# Resources: 8 exclusive nodes, one task per node, 8 GPUs and 32 CPUs per
# task/node, on the "learnfair" partition.
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
# Time limit in Slurm's "days-hours" format: 3 days, 0 hours.
#SBATCH --time=3-0
#SBATCH --partition=learnfair
# %j expands to the job id; stderr/stdout go to logs/ relative to the
# submission directory.
#SBATCH --error=logs/std-%j.err
#SBATCH --output=logs/std-%j.out
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=32
# Nodes that are never allocated to this job.
#SBATCH --exclude=learnfair[021,025,045,081,082,089,097,098,101,102,103,105]


# Trace every command (the trace goes to stderr) and disable core dumps.
set -x
ulimit -c 0


# First positional argument: the script that is later launched via srun.
script_name=$1


# Both directories point at logs/; neither variable is read again in the
# visible part of this script.
CHK_DIR="logs/"  # Define CHK_DIR
LOG_DIR="logs/"  # Define LOG_DIR

##### Number of total processes
# The Slurm values are quoted: a node list such as "learnfair[021,025]" is a
# glob bracket expression and would otherwise be subject to pathname
# expansion and word splitting before echo sees it.
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " "${SLURM_JOB_NODELIST}"
echo "Number of nodes:= " "${SLURM_JOB_NUM_NODES}"
echo "Ntasks per node:= "  "${SLURM_NTASKS_PER_NODE}"
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "

# pretrain
# Initial rendezvous address: the short hostname of the node running this
# batch script. It is replaced further down by the first host of the
# allocation when scontrol can resolve one.
MASTER_ADDR=$(hostname -s)
export MASTER_ADDR
export TORCH_DISTRIBUTED_DEBUG=DETAIL
# Placeholders: both are currently exported as EMPTY strings.
export GLOO_SOCKET_IFNAME=  # Set to your network interface
export NCCL_SOCKET_IFNAME=  # Set to your network interface

# NOTE: Python treats any non-empty PYTHONUNBUFFERED value (including "0")
# as "unbuffered", so this enables unbuffered output.
export PYTHONUNBUFFERED=0
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_BLOCKING_WAIT=1
export CUDA_LAUNCH_BLOCKING=1
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_DEBUG=INFO
export NUM_GPU=8
# Keep the node count Slurm reports for this allocation (it can differ from
# the #SBATCH default when --nodes is overridden at submission time) and only
# fall back to 8 when Slurm did not provide one.
export SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES:-${SLURM_NNODES:-8}}
export SLURM_NNODES=${SLURM_NNODES:-${SLURM_JOB_NUM_NODES}}
export SLURM_NODEID=${SLURM_NODEID:-0}  # Default to 0 if not set
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1

# find free port
# Binds to port 0 so the OS picks an unused port, prints it and closes the
# socket again; the port is not reserved after that.
MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
# Timestamp plus this shell's PID; not read again in the visible part of
# this script.
DATE="$(date +'%d_%m_%Y_%H_%M_%S')_$$"
# Prefer the first host of the allocation as rendezvous address, but keep the
# hostname-based value from above if scontrol yields nothing.
first_node=$(scontrol show hostnames "${SLURM_JOB_NODELIST:-${SLURM_NODELIST:-}}" | head -n1)
if [[ -n "${first_node}" ]]; then
    MASTER_ADDR=${first_node}
fi
WORLD_SIZE=$((NUM_GPU * SLURM_JOB_NUM_NODES))
RANK=$SLURM_NODEID  # Ensure this is set correctly in your environment
# NOTE(review): MASTER_PORT, WORLD_SIZE and RANK are plain shell variables
# (not exported), so they are not passed to the launched script through the
# environment - confirm the launched script derives them itself.

echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
echo "WORLD_SIZE : ${WORLD_SIZE}"
echo "RANK : ${RANK}"

# wrap with error catch and requeue

# run with error catch and requeue and srun
#######################################
# Launch a command through srun and requeue the Slurm job when the step
# times out or this shell is signalled. Never returns: every path ends in
# `exit`.
# Globals:
#   SLURM_JOB_ID (read) - job handed to `scontrol requeue`
# Arguments:
#   "$@" - command and arguments forwarded to `srun --cpu-bind=none`
# Outputs:
#   Status messages on stdout.
# Exit status:
#   0  srun succeeded, or the job was requeued (srun exit code 124 or 143,
#      or SIGTERM / SIGUSR1 received by this shell)
#   N  srun's own non-zero exit code in every other case
#######################################
function srun_with_requeue {
    local ret srun_pid

    # A failing srun must reach the status handling below instead of
    # aborting the shell when the caller enabled errexit.
    set +e

    # Trap SIGTERM to requeue if killed by timeout
    trap 'echo "Caught SIGTERM signal. Requeuing..."; scontrol requeue "$SLURM_JOB_ID"; exit 0' SIGTERM

    # Trap SIGUSR1 to requeue if node failure is detected
    trap 'echo "Caught SIGUSR1 signal (node failure). Requeuing..."; scontrol requeue "$SLURM_JOB_ID"; exit 0' SIGUSR1

    # Bash postpones trap handlers while a foreground command is running, so
    # srun is started in the background and awaited with `wait`, which is
    # interrupted as soon as a trapped signal arrives.
    srun --cpu-bind=none "$@" &
    srun_pid=$!
    wait "$srun_pid"
    ret=$?

    case "$ret" in
        0)
            echo "Job completed successfully."
            exit 0
            ;;
        124 | 143)
            # 124: exit code used by timeout(1); 143: 128 + SIGTERM.
            echo "Job timed out. Requeuing..."
            scontrol requeue "$SLURM_JOB_ID"
            exit 0
            ;;
        *)
            echo "Error in $1 with exit code $ret. Not requeuing."
            exit "$ret"
            ;;
    esac
}

# Append a record of this submission to the user's history file.
# NOTE(review): JOB_ID is not assigned anywhere in the visible part of this
# script; it is presumably expected from the submitting environment.
{
    echo "--------------------------------------------------"
    echo "Slurm job id | job id | command"
    echo "$SLURM_JOB_ID | $JOB_ID | $script_name"
} >> ~/history.txt

# Fail early with a clear message instead of invoking chmod/srun without a
# target when no script was passed.
if [[ -z "$script_name" ]]; then
    echo "error: no script given; usage: sbatch <this-file> <script>" >&2
    exit 2
fi

# Quoted so that a path containing spaces or glob characters stays a single
# argument.
chmod +x -- "$script_name"
srun_with_requeue "$script_name"