File size: 844 Bytes
32b542e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash

a=$(echo $HOSTNAME | cut  -c12-16)

CONFIG=$1
JOB_NAME=${2:-"experiments"}
GPUS=${3:-8}
  
partition=${4:-'local'} #

GPUS_PER_NODE=${GPUS:-8}
if [ $GPUS_PER_NODE -ge 8 ]; then
  GPUS_PER_NODE=8
fi
CPUS_PER_TASK=${CPUS_PER_TASK:-4}
SRUN_ARGS=${SRUN_ARGS:-""}

PY_ARGS=${@:5}

WORK_DIR=${CONFIG//configs/work_dirs}
WORK_DIR=${WORK_DIR//.yaml//$JOB_NAME}
echo $WORK_DIR
mkdir  -p $WORK_DIR
mkdir -p data/temp

# please change DATA_PATH where you put the training data
export DATA_PATH='/mnt/lustre/share_data/zhujinguo'

srun --partition=${partition}  $SRUN_ARGS \
--job-name=${JOB_NAME} -n$GPUS  --gres=gpu:${GPUS_PER_NODE} \
--ntasks-per-node=${GPUS_PER_NODE} \
--kill-on-bad-exit=1  --cpus-per-task 12 \
python -u main.py --num-gpus $GPUS \
--config-file ${CONFIG} --init_method slurm --resume \
${PY_ARGS} OUTPUT_DIR $WORK_DIR