Spaces:

liruiw
/

hma

Running on Zero

hma / experiments /scripts /traj_scaling_ablation /run_40datasets_waction_traj100.sh

draft

246c106 about 1 month ago

1.24 kB

	#!/bin/bash
	# Launches train_multi.py through torchrun using the dataset split in
	# experiments/datasplit/dataset40.yaml, with 100 episodes per dataset.
	#
	# Optional environment overrides (value used when unset or empty):
	#   NUM_GPU              processes started per node   (8)
	#   SLURM_JOB_NUM_NODES  number of nodes              (1)
	#   MASTER_ADDR          rendezvous endpoint host     (localhost)
	#   MASTER_PORT          rendezvous endpoint port     (29500)
	#   SLURM_JOB_ID         rendezvous id                (local)
	#
	# All paths are relative: run this from the directory that contains venv/,
	# train_multi.py, genie/, experiments/ and data/.
	set -x
	ulimit -c 0  # core file size limit 0: never write core dumps


	# ${VAR:-default} substitutes the default when VAR is unset OR empty, so the
	# script works both under SLURM and as a plain single-node local run.
	NUM_GPU=${NUM_GPU:-8}
	SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES:-1}
	MASTER_ADDR=${MASTER_ADDR:-localhost}
	MASTER_PORT=${MASTER_PORT:-29500}
	# Outside SLURM there is no job id; fall back to a fixed string so that
	# --rdzv-id is never passed an empty value. The fallback is a constant (not a
	# PID or random number) so that every node of a manual launch uses the same id.
	rdzv_id=${SLURM_JOB_ID:-local}

	# The run name doubles as the output directory and the resume directory.
	# NOTE(review): the name ends in "68536steps" while --max_train_steps below is
	# 30000 -- confirm which is intended. The name is left untouched here because
	# changing it would point --resume_from_checkpoint at a different directory.
	script_name="final2_40dataset_waction_traj100_gpu_${NUM_GPU}_nodes_${SLURM_JOB_NUM_NODES}_68536steps"

	# Activate the project virtualenv when present. A missing venv is reported but
	# is not fatal, matching the previous behaviour (bash keeps going after a
	# failed `source` in a non-interactive script).
	if [[ -f venv/bin/activate ]]; then
		# shellcheck disable=SC1091 -- venv is created outside this script
		source venv/bin/activate
	else
		echo "warning: venv/bin/activate not found; using the current environment" >&2
	fi

	# torchrun is the last command executed, so its exit status is the script's.
	torchrun --nnodes="${SLURM_JOB_NUM_NODES}" --nproc_per_node="${NUM_GPU}" \
	--rdzv-id="${rdzv_id}" --rdzv-backend=c10d --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}" \
	train_multi.py --genie_config genie/configs/magvit_n32_h8_d256_action_concat.json \
	--output_dir "data/${script_name}" \
	--max_eval_steps 10 \
	--num_episodes_per_dataset 100 \
	--max_train_steps 30000 \
	--save_second_epoch \
	--per_device_train_batch_size 8 \
	--gradient_accumulation_steps 1 \
	--run_name "${script_name}" \
	--resume_from_checkpoint "data/${script_name}/" \
	--train_split experiments/datasplit/dataset40.yaml

	# Optional follow-up evaluation, currently disabled. To chain it after a
	# successful training run, append `&&` to the torchrun command above and
	# uncomment the two lines below.
	#chmod +x experiments/scripts/eval_action_scripts/run_evaluation_multidataset.sh
	#bash experiments/scripts/eval_action_scripts/run_evaluation_multidataset.sh $script_name dataset40