#!/bin/bash
#
# Launches train_multi.py through torchrun, resuming from the run's own
# output directory (data/<script_name>/).
#
# Environment variables (each one is optional):
#   NUM_GPU              processes per node passed to torchrun (default: 8)
#   SLURM_JOB_NUM_NODES  number of nodes passed to torchrun    (default: 1)
#   SLURM_JOB_ID         rendezvous id                          (default: none)
#   MASTER_ADDR          rendezvous endpoint host               (default: localhost)
#   MASTER_PORT          rendezvous endpoint port               (default: 29500)
#
# The relative paths below (venv/, genie/, data/, experiments/) are resolved
# against the current working directory.

# Trace every command as it is executed.
set -x
# Set the core-file size limit to 0.
ulimit -c 0

# Fall back to single-node defaults when a variable is unset or empty, so the
# script also runs outside of a SLURM allocation.
NUM_GPU=${NUM_GPU:-8}
SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES:-1}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}
# Without a fallback, --rdzv-id would expand to an empty string outside SLURM.
# NOTE(review): "none" is believed to match torchrun's own default for
# --rdzv-id; confirm against the installed torch version.
SLURM_JOB_ID=${SLURM_JOB_ID:-none}

# Used for the output directory, the run name, and the resume checkpoint path.
script_name="final2_5dataset_waction_gpu_${NUM_GPU}_nodes_${SLURM_JOB_NUM_NODES}"

source venv/bin/activate

torchrun --nnodes="${SLURM_JOB_NUM_NODES}" --nproc_per_node="${NUM_GPU}" \
    --rdzv-id="${SLURM_JOB_ID}" --rdzv-backend=c10d --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}" \
    train_multi.py --genie_config genie/configs/magvit_n32_h8_d256_action_concat.json \
    --output_dir "data/${script_name}" \
    --max_eval_steps 10 \
    --num_episodes_per_dataset 1000000 \
    --max_train_steps 68536 \
    --save_second_epoch \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 1 \
    --run_name "${script_name}" \
    --train_split experiments/datasplit/dataset5.yaml \
    --resume_from_checkpoint "data/${script_name}/"

# Disabled follow-up step (evaluation after training); kept for reference.
#&&
#chmod +x experiments/scripts/eval_action_scripts/run_evaluation_multidataset.sh
#bash experiments/scripts/eval_action_scripts/run_evaluation_multidataset.sh $script_name dataset5