Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,168 Bytes
246c106 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
#!/bin/bash
# Launch train_multi_diffusion.py under torchrun with the
# mar_n32_h8_d128_action_modulate genie config.
#
# Optional environment variables (defaults in parentheses):
#   NUM_GPU              value for torchrun --nproc_per_node (8)
#   SLURM_JOB_NUM_NODES  value for torchrun --nnodes (1)
#   SLURM_JOB_ID         value for torchrun --rdzv-id (0)
#   MASTER_ADDR          rendezvous host (localhost)
#   MASTER_PORT          rendezvous port (29500)
#   total_memory         GPU memory used to pick the batch size; when unset it
#                        is read from nvidia-smi (MiB of the first GPU listed)
set -x
ulimit -c 0

# Fall back to single-node defaults when not launched by SLURM.
NUM_GPU=${NUM_GPU:-8}
SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES:-1}
# Previously left without a default, which produced an empty --rdzv-id= value.
SLURM_JOB_ID=${SLURM_JOB_ID:-0}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}

script_name=33dataset_mar_waction_d128_gpu_${NUM_GPU}_nodes_${SLURM_JOB_NUM_NODES}

# total_memory was never assigned before being compared, so the numeric test
# always errored out and the small-memory branch was taken unconditionally.
# Honor a caller-provided value, otherwise ask nvidia-smi when it is available.
if [[ -z "${total_memory:-}" ]] && command -v nvidia-smi >/dev/null 2>&1; then
  total_memory=$(nvidia-smi --query-gpu=memory.total \
    --format=csv,noheader,nounits 2>/dev/null | head -n 1)
fi
total_memory=${total_memory//[[:space:]]/}
# Anything that is not a plain integer (unset, "[N/A]", ...) counts as 0, which
# selects the small-memory settings the script effectively used before.
[[ "$total_memory" =~ ^[0-9]+$ ]] || total_memory=0

# NOTE(review): the 32000 threshold is presumably MiB (nvidia-smi's unit for
# memory.total) — confirm against the intended GPU classes.
if (( 10#$total_memory >= 32000 )); then
  batch_size=8
  gradient_accumulation_steps=1
else
  batch_size=1
  gradient_accumulation_steps=1
  script_name="${script_name}_16g"
fi

# Build the command as an array so every expansion stays a single argument.
train_cmd=(
  torchrun
  "--nnodes=${SLURM_JOB_NUM_NODES}"
  "--nproc_per_node=${NUM_GPU}"
  "--rdzv-id=${SLURM_JOB_ID}"
  --rdzv-backend=c10d
  "--rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT}"
  train_multi_diffusion.py
  --genie_config genie/configs/mar_n32_h8_d128_action_modulate.json
  --output_dir "data/$script_name"
  --max_eval_steps 10
  --num_episodes_per_dataset 1000000
  --per_device_train_batch_size "$batch_size"
  # Uses the variable chosen above instead of a hard-coded 1 (same value today).
  --gradient_accumulation_steps "$gradient_accumulation_steps"
  --run_name "$script_name"
  --resume_from_checkpoint "data/$script_name/"
  --train_split experiments/datasplit/dataset30_vae.yaml
)
"${train_cmd[@]}"
|