#!/bin/bash
#
# Launch `main.py` on a Slurm cluster through `srun`.
#
# Usage:
#   <this-script> CONFIG [JOB_NAME] [GPUS] [SRUN] [PY_ARGS...]
#
# Positional arguments:
#   CONFIG    Path to the config file (required). The work directory is derived
#             from it: every "configs" is replaced by "work_dirs" and every
#             ".yaml" by "/JOB_NAME".
#   JOB_NAME  Slurm job name (default: experiments).
#   GPUS      Total number of tasks, also passed as --num-gpus (default: 8).
#             At most 8 GPUs/tasks are requested per node.
#   SRUN      Quota selector (default: reserved). Known aliases are mapped to a
#             --quotatype value, optional extra srun flags and a partition; any
#             other value is passed to --quotatype unchanged.
#   PY_ARGS   All remaining arguments, forwarded to main.py.
#
# Environment:
#   SRUN_ARGS      Extra srun options (whitespace-separated).
#   OTHERARGS      Extra trailing main.py arguments (whitespace-separated).
#   CPUS_PER_TASK  Read and defaulted to 4, but NOTE: the srun call below pins
#                  --cpus-per-task to 12 and does not consume this variable.
#
# The cluster is recognised from characters 12-16 of $HOSTNAME; only the
# "140-0" and "142-4" clusters are supported.

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage line to stderr and abort with status 2.
usage() {
  printf 'Usage: %s CONFIG [JOB_NAME] [GPUS] [SRUN] [PY_ARGS...]\n' "${0##*/}" >&2
  exit 2
}

# Without a config path there is nothing to run and no work dir to derive.
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  usage
fi

CONFIG=$1
JOB_NAME=${2:-"experiments"}
GPUS=${3:-8}
SRUN=${4:-'reserved'}

# Reject non-numeric GPU counts up front instead of failing inside `[ -ge ]`.
if ! [[ "${GPUS}" =~ ^[0-9]+$ ]]; then
  die "GPUS must be a non-negative integer, got: ${GPUS}"
fi

# A node offers at most 8 GPUs; larger jobs span several nodes.
GPUS_PER_NODE=${GPUS}
if [ "${GPUS_PER_NODE}" -ge 8 ]; then
  GPUS_PER_NODE=8
fi

CPUS_PER_TASK=${CPUS_PER_TASK:-4}
SRUN_ARGS=${SRUN_ARGS:-""}

# Keep the forwarded arguments as an array so each one stays a single word.
PY_ARGS=("${@:5}")

# configs/foo/bar.yaml -> work_dirs/foo/bar/<JOB_NAME>
WORK_DIR=${CONFIG//configs/work_dirs}
WORK_DIR=${WORK_DIR//.yaml//${JOB_NAME}}

# Characters 12-16 of the host name identify the cluster.
cluster_id=${HOSTNAME:11:5}

# `quota` may carry extra srun flags after the quota type (e.g. "--async"),
# so it is expanded unquoted in the srun call below.
case "${cluster_id}" in
  140-0)
    partition='INTERN'
    CEPH_CONFIG='slurm_tools/petreloss_1400.config'
    case "${SRUN}" in
      vcspot)
        quota='spot --async'
        partition='VC'
        ;;
      vcauto)
        quota='auto --async'
        partition='VC'
        ;;
      vcreserved)
        quota='reserved'
        partition='VC'
        ;;
      spot)
        quota='spot --async'
        ;;
      auto)
        quota='auto --async'
        ;;
      *)
        quota=${SRUN}
        ;;
    esac
    ;;
  142-4)
    partition='vc_research_5'
    CEPH_CONFIG='slurm_tools/petreloss_1424.config'
    case "${SRUN}" in
      vc4spot)
        quota='spot --async'
        partition='vc_research_4'
        ;;
      vc4auto)
        quota='auto --async -x SH-IDC1-10-142-4-76'
        partition='vc_research_4'
        ;;
      vc4reserved)
        quota='reserved'
        partition='vc_research_4'
        ;;
      spot)
        quota='spot --async'
        ;;
      auto)
        quota='auto --async'
        ;;
      *)
        quota=${SRUN}
        ;;
    esac
    ;;
  *)
    # Submitting with an empty partition and Ceph config would only produce a
    # malformed command line, so stop here.
    die "only SH1424 and SH1400 supported now"
    ;;
esac

# Environment shared by both supported clusters.
export DATA_PATH='/mnt/lustre/share_data/zhujinguo'
export LD_LIBRARY_PATH=/mnt/cache/zhujinguo/anaconda3/envs/py36/lib:$LD_LIBRARY_PATH
export TORCH_EXTENSIONS_DIR='/mnt/lustre/zhujinguo/.cache/torch_extensions'
export NO_NVRTC=0

printf '%s\n' "${WORK_DIR}"
mkdir -p -- "${WORK_DIR}" || die "cannot create work dir: ${WORK_DIR}"
mkdir -p data/temp || die "cannot create data/temp"

now=$(date +"%Y%m%d_%H%M%S")

# SRUN_ARGS, quota and OTHERARGS are intentionally left unquoted: each may
# hold several whitespace-separated options that must become separate words.
# shellcheck disable=SC2086
srun --partition="${partition}" ${SRUN_ARGS} --quotatype=${quota} \
  -o "${WORK_DIR}/phoenix-slurm-%j-${now}.out" \
  --job-name="${JOB_NAME}" -n"${GPUS}" --gres=gpu:"${GPUS_PER_NODE}" \
  --ntasks-per-node="${GPUS_PER_NODE}" \
  --kill-on-bad-exit=1 --cpus-per-task 12 \
  python -u main.py --num-gpus "${GPUS}" \
  --config-file "${CONFIG}" --init_method slurm --resume \
  "${PY_ARGS[@]}" OUTPUT_DIR "${WORK_DIR}" DATALOADER.USE_CEPH True \
  DATALOADER.TCS_CONF_PATH "${CEPH_CONFIG}" SOLVER.CHECKPOINT_PERIOD 10000 SOLVER.CHECKPOINT_MAX_SAVE 1 \
  ${OTHERARGS:-} 2>&1