File size: 1,197 Bytes
246c106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
# Trace each command as it executes and turn off core dumps.
set -x
ulimit -c 0


# Launch settings. A value already present in the environment wins; an unset
# or empty variable falls back to the single-node default shown here.
: "${NUM_GPU:=8}"               # worker processes per node
: "${SLURM_JOB_NUM_NODES:=1}"   # number of nodes
: "${MASTER_ADDR:=localhost}"   # rendezvous host
: "${MASTER_PORT:=29500}"       # rendezvous port

# Run identifier; encodes the GPU and node counts.
printf -v script_name '15dataset_mar_waction_d64_gpu_%s_nodes_%s' \
    "$NUM_GPU" "$SLURM_JOB_NUM_NODES"

# Pick the per-device batch configuration from the amount of GPU memory.
#
# total_memory may be supplied through the environment; otherwise it is probed
# with nvidia-smi (first GPU, which nvidia-smi reports in MiB).
# NOTE(review): the 32000 threshold and the "_16g" suffix suggest MiB was the
# intended unit -- confirm.
# Before this change total_memory was never assigned here, so the numeric test
# errored out and the low-memory branch was always taken. Runs on large GPUs
# will now use the un-suffixed output directory.

#######################################
# Print the total memory of the first visible GPU.
# Outputs: the value reported by nvidia-smi on stdout, or nothing when
#          nvidia-smi is not installed.
#######################################
detect_gpu_memory_mib() {
    command -v nvidia-smi >/dev/null 2>&1 || return 0
    nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null \
        | head -n 1 | tr -d '[:space:]'
}

#######################################
# Choose batch settings for a given amount of GPU memory.
# Arguments: $1 - GPU memory size (may be empty or non-numeric)
# Globals:   batch_size, gradient_accumulation_steps, memory_suffix (written)
#######################################
select_batch_config() {
    local mem=${1:-}
    # Validate first so an empty or garbage value cannot break the comparison.
    if [[ "$mem" =~ ^[0-9]+$ ]] && [ "$mem" -ge 32000 ]; then
        batch_size=8
        gradient_accumulation_steps=1
        memory_suffix=""
    else
        batch_size=1
        gradient_accumulation_steps=1
        memory_suffix="_16g"
    fi
}

total_memory=${total_memory:-$(detect_gpu_memory_mib)}
if [[ ! "$total_memory" =~ ^[0-9]+$ ]]; then
    printf 'warning: could not determine GPU memory; using the low-memory configuration\n' >&2
fi
select_batch_config "$total_memory"
script_name="${script_name}${memory_suffix}"


# Launch distributed training with torchrun.
#
# Reads NUM_GPU, SLURM_JOB_NUM_NODES, MASTER_ADDR, MASTER_PORT, script_name,
# batch_size and gradient_accumulation_steps, all set earlier in this script.
# Arguments are built as arrays so every expansion stays a single, quoted word.

# Outside SLURM there is no job id; fall back to "none" (torchrun's documented
# default for --rdzv-id) instead of passing an empty id.
rdzv_id=${SLURM_JOB_ID:-none}

torchrun_args=(
    --nnodes="${SLURM_JOB_NUM_NODES}"
    --nproc_per_node="${NUM_GPU}"
    --rdzv-id="${rdzv_id}"
    --rdzv-backend=c10d
    --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}"
)

train_args=(
    --genie_config genie/configs/mar_n32_h8_d64_action_modulate.json
    --output_dir "data/${script_name}"
    --max_eval_steps 10
    --num_episodes_per_dataset 1000000
    --per_device_train_batch_size "${batch_size}"
    --max_train_steps 68536
    # Use the value chosen by the memory check rather than a hard-coded 1.
    --gradient_accumulation_steps "${gradient_accumulation_steps}"
    --run_name "${script_name}"
    --resume_from_checkpoint "data/${script_name}/"
    --train_split experiments/datasplit/dataset15_vae.yaml
)

torchrun "${torchrun_args[@]}" train_multi_diffusion.py "${train_args[@]}"