unit_test / slurm_run.sh
herrius's picture
Upload 259 files
32b542e
#!/bin/bash
# Submit a run of main.py through Slurm `srun`.
#
# Usage: slurm_run.sh CONFIG [JOB_NAME] [GPUS] [SRUN] [PY_ARGS...]
#   CONFIG    config file handed to main.py via --config-file (required)
#   JOB_NAME  Slurm job name; also the last component of the work dir
#             (default: experiments)
#   GPUS      total number of tasks requested with -n (default: 8)
#   SRUN      quota keyword, translated per cluster further below
#             (default: reserved)
#   PY_ARGS   every argument from the 5th on, forwarded to main.py
#
# Environment (all optional): SRUN_ARGS (extra srun options), OTHERARGS
# (extra trailing main.py arguments), CPUS_PER_TASK (defaulted here; note
# that the srun call at the bottom of the script passes a fixed value).

CONFIG=$1
if [ -z "$CONFIG" ]; then
  # Without a config there is no work dir to create and nothing to run.
  echo "usage: $0 CONFIG [JOB_NAME] [GPUS] [SRUN] [PY_ARGS...]" >&2
  exit 1
fi
JOB_NAME=${2:-"experiments"}
GPUS=${3:-8}
SRUN=${4:-'reserved'}

# Per-node GPU request: same as GPUS, capped at 8.
GPUS_PER_NODE=${GPUS:-8}
if [ "$GPUS_PER_NODE" -ge 8 ]; then
  GPUS_PER_NODE=8
fi

CPUS_PER_TASK=${CPUS_PER_TASK:-4}
SRUN_ARGS=${SRUN_ARGS:-""}

# Kept as one space-joined string on purpose: the srun command expands it
# unquoted so that it splits back into separate words.
PY_ARGS="${*:5}"

# Derive the output directory from the config path:
# every "configs" becomes "work_dirs" and ".yaml" becomes "/<JOB_NAME>",
# e.g. configs/foo/bar.yaml -> work_dirs/foo/bar/experiments
WORK_DIR=${CONFIG//configs/work_dirs}
WORK_DIR=${WORK_DIR//.yaml//$JOB_NAME}
printf '%s\n' "$WORK_DIR"
mkdir -p -- "$WORK_DIR" || exit 1
mkdir -p data/temp

# Timestamp used in the srun log file name.
now=$(date +"%Y%m%d_%H%M%S")

# Characters 12-16 of the hostname identify the cluster
# (e.g. SH-IDC1-10-142-4-76 -> "142-4"); matched in the section below.
a=${HOSTNAME:11:5}
# Cluster-specific setup, keyed on the hostname slice in $a.
# Each supported cluster exports its runtime environment, picks a default
# partition and Ceph config, and then translates the SRUN keyword into the
# value given to --quotatype (SRUNreal) plus, for some keywords, a different
# partition. Keywords not listed are passed through unchanged.
case "$a" in
  '140-0')
    export DATA_PATH='/mnt/lustre/share_data/zhujinguo'
    export LD_LIBRARY_PATH=/mnt/cache/zhujinguo/anaconda3/envs/py36/lib:$LD_LIBRARY_PATH
    export TORCH_EXTENSIONS_DIR='/mnt/lustre/zhujinguo/.cache/torch_extensions'
    export NO_NVRTC=0
    partition='INTERN'
    CEPH_CONFIG='slurm_tools/petreloss_1400.config'
    SRUNreal=${SRUN}
    case "$SRUN" in
      vcspot)
        SRUNreal='spot --async'
        partition=VC
        ;;
      vcauto)
        SRUNreal='auto --async'
        partition=VC
        ;;
      vcreserved)
        SRUNreal='reserved'
        partition=VC
        ;;
      spot)
        SRUNreal='spot --async'
        ;;
      auto)
        SRUNreal='auto --async'
        ;;
    esac
    ;;
  '142-4')
    # 1424
    export DATA_PATH='/mnt/lustre/share_data/zhujinguo'
    export LD_LIBRARY_PATH=/mnt/cache/zhujinguo/anaconda3/envs/py36/lib:$LD_LIBRARY_PATH
    export TORCH_EXTENSIONS_DIR='/mnt/lustre/zhujinguo/.cache/torch_extensions'
    export NO_NVRTC=0
    partition='vc_research_5'
    CEPH_CONFIG='slurm_tools/petreloss_1424.config'
    SRUNreal=${SRUN}
    case "$SRUN" in
      vc4spot)
        SRUNreal='spot --async'
        partition=vc_research_4
        ;;
      vc4auto)
        # Additionally passes -x with one specific node name.
        SRUNreal='auto --async -x SH-IDC1-10-142-4-76'
        partition=vc_research_4
        ;;
      vc4reserved)
        SRUNreal='reserved'
        partition=vc_research_4
        ;;
      spot)
        SRUNreal='spot --async'
        ;;
      auto)
        SRUNreal='auto --async'
        ;;
    esac
    ;;
  *)
    # Unknown host: partition, quota type and Ceph config are all undefined,
    # so stop here instead of submitting a job with empty settings.
    echo 'only SH1424 and SH1400 supported now' >&2
    exit 1
    ;;
esac
# Launch main.py under srun with the settings computed above.
#   - srun output goes to a per-job log file inside the work dir.
#   - main.py receives the config, any forwarded PY_ARGS, and a fixed set of
#     KEY VALUE overrides (output dir, Ceph data loading, checkpointing),
#     followed by OTHERARGS from the environment.
#   - stderr of the whole command is merged into stdout.
# NOTE(review): --cpus-per-task is fixed at 12 here; the CPUS_PER_TASK
# variable defaulted earlier in this script is not used - confirm intent.
#
# SRUN_ARGS, SRUNreal, PY_ARGS and OTHERARGS may each hold several words
# (e.g. SRUNreal='spot --async') and are deliberately left unquoted so they
# split into separate arguments. Single-valued expansions are quoted.
# shellcheck disable=SC2086
srun --partition="${partition}" $SRUN_ARGS --quotatype=${SRUNreal} \
  -o "$WORK_DIR/phoenix-slurm-%j-$now.out" \
  --job-name="${JOB_NAME}" -n"$GPUS" --gres=gpu:"${GPUS_PER_NODE}" \
  --ntasks-per-node="${GPUS_PER_NODE}" \
  --kill-on-bad-exit=1 --cpus-per-task 12 \
  python -u main.py --num-gpus "$GPUS" \
  --config-file "${CONFIG}" --init_method slurm --resume \
  ${PY_ARGS} OUTPUT_DIR "$WORK_DIR" DATALOADER.USE_CEPH True \
  DATALOADER.TCS_CONF_PATH "$CEPH_CONFIG" SOLVER.CHECKPOINT_PERIOD 10000 SOLVER.CHECKPOINT_MAX_SAVE 1 \
  ${OTHERARGS} 2>&1
# SOLVER.ACCUM_ITER 2 SOLVER.CHECKPOINT_PERIOD 1000 SOLVER.CHECKPOINT_MAX_SAVE 1 MODEL.BERT.DROP_PATH_PROB 0.1