#!/bin/bash
set -x
ulimit -c 0

# Default to 8 GPUs per node if NUM_GPU is unset
NUM_GPU=${NUM_GPU:-8}
# Default to a single node and a fixed rendezvous ID when not running under SLURM
SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES:-1}
SLURM_JOB_ID=${SLURM_JOB_ID:-0}
# Default rendezvous endpoint for single-node runs
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}
script_name=33dataset_mar_waction_d128_gpu_${NUM_GPU}_nodes_${SLURM_JOB_NUM_NODES}

# Pick the per-device batch size from GPU memory (in MiB). total_memory is not
# set elsewhere in the script, so it is queried here from the first visible GPU.
total_memory=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
if [ "$total_memory" -ge 32000 ]; then
    batch_size=8
    gradient_accumulation_steps=1
else
    # Smaller GPUs (e.g. 16 GB): shrink the batch size and tag the run name
    batch_size=1
    gradient_accumulation_steps=1
    script_name="${script_name}_16g"
fi
torchrun --nnodes=${SLURM_JOB_NUM_NODES} --nproc_per_node=${NUM_GPU} \
    --rdzv-id=${SLURM_JOB_ID} --rdzv-backend=c10d --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT} \
    train_multi_diffusion.py --genie_config genie/configs/mar_n32_h8_d128_action_modulate.json \
    --output_dir data/$script_name \
    --max_eval_steps 10 \
    --num_episodes_per_dataset 1000000 \
    --per_device_train_batch_size $batch_size \
    --gradient_accumulation_steps $gradient_accumulation_steps \
    --run_name $script_name \
    --resume_from_checkpoint data/$script_name/ \
    --train_split experiments/datasplit/dataset30_vae.yaml
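
# Minimal usage sketch. The script path below is an assumption for illustration,
# not taken from the source; the environment-variable defaults above let the same
# script run inside a SLURM job step or on a single local machine, where the unset
# SLURM variables fall back to one node and a localhost rendezvous.
#
# Single-node run with the defaults (8 GPUs, rendezvous on localhost:29500):
#   bash scripts/train_33dataset.sh
#
# Override the GPU count and rendezvous port, e.g. on a 2-GPU workstation:
#   NUM_GPU=2 MASTER_PORT=29600 bash scripts/train_33dataset.sh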