#!/bin/bash
#
# Pretrain a multimodal model.
#
# Activates the "multimodal" environment, installs a few pip packages, and
# launches llava/train/train_mem.py through torchrun with the DeepSpeed config
# ./scripts/zero2.json.
#
# Required environment variables (read by this script, never set by it):
#   METIS_WORKER_0_HOST  passed to torchrun as --master_addr
#   METIS_WORKER_0_PORT  comma-separated list; its first entry is --master_port
#   ARNOLD_WORKER_NUM    passed to torchrun as --nnodes
#   ARNOLD_ID            passed to torchrun as --node_rank
#   ARNOLD_WORKER_GPU    passed to torchrun as --nproc_per_node
#
# The exit status is that of torchrun, unless a precondition fails first.

# Print a message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Runtime settings for OpenMP, NCCL, CUDA and torch.distributed.
# Values are kept exactly as originally configured; see the NCCL and PyTorch
# environment-variable documentation before changing any of them.
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_TIMEOUT=500
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export NCCL_SOCKET_FAMILY=AF_INET6
export NCCL_NET_PLUGIN=libnccl-net-gcp-fastrak.so

# Launch timestamp. Not exported and not referenced anywhere in this script;
# kept in case something that sources this file relies on it.
DATETIME=$(date +'%y-%m-%d-%H-%M-%S')

# Setting for multi nodes training.
# Fail early with a clear message instead of handing torchrun empty flags.
: "${METIS_WORKER_0_HOST:?METIS_WORKER_0_HOST must be set}"
: "${METIS_WORKER_0_PORT:?METIS_WORKER_0_PORT must be set}"
: "${ARNOLD_WORKER_NUM:?ARNOLD_WORKER_NUM must be set}"
: "${ARNOLD_ID:?ARNOLD_ID must be set}"
: "${ARNOLD_WORKER_GPU:?ARNOLD_WORKER_GPU must be set}"

# Split the port list on commas (tolerating blanks) without globbing; the
# first entry is the rendezvous port.
IFS=', ' read -r -a ports <<< "${METIS_WORKER_0_PORT}"
port=${ports[0]:-}
[[ "${port}" =~ ^[0-9]+$ ]] \
  || die "cannot derive a numeric master port from METIS_WORKER_0_PORT='${METIS_WORKER_0_PORT}'"

printf 'total workers: %s\n' "${ARNOLD_WORKER_NUM}"
printf 'cur worker id: %s\n' "${ARNOLD_ID}"
printf 'gpus per worker: %s\n' "${ARNOLD_WORKER_GPU}"
printf 'master ip: %s\n' "${METIS_WORKER_0_HOST}"
printf 'master port: %s\n' "${port}"

# Common prefix of every absolute path used below.
work_root=/mnt/bn/tns-algo-public-my2/wangpeng.an
model_dir="${work_root}/model"
data_dir="${work_root}/data"

# shellcheck disable=SC1091  # activate script lives on the shared mount
source "${work_root}/environment/anaconda3/bin/activate" multimodal \
  || die "failed to activate environment 'multimodal'"

# All relative paths below (training script, DeepSpeed config, checkpoints,
# cache) are resolved against this directory, so a failed cd must be fatal.
cd "${work_root}/train/OmniFusion-main" \
  || die "cannot cd to ${work_root}/train/OmniFusion-main"

# Install necessary packages.
# Installed one at a time and best-effort: a failure is reported but does not
# stop the launch.
for pkg in requests attrs aiofiles pynvml; do
  pip3 install "${pkg}" \
    || printf 'warning: pip3 install %s failed\n' "${pkg}" >&2
done

# Print Python executable path, torchrun, deepspeed and PYTHONPATH
printf 'Python executable: %s\n' "$(command -v python)"
printf 'torchrun executable: %s\n' "$(command -v torchrun)"
printf 'deepspeed executable: %s\n' "$(command -v deepspeed)"
printf 'PYTHONPATH before torchrun: %s\n' "${PYTHONPATH:-}"

# NOTE(review): mode 777 makes the directory tree world-writable; presumably
# required by the fastrak NCCL plugin selected above - confirm. Best-effort.
sudo chmod -R 777 /var/lib/fastrak \
  || printf 'warning: chmod of /var/lib/fastrak failed\n' >&2

# torchrun launcher options (process/node topology and rendezvous endpoint).
launch_args=(
  --nproc_per_node="${ARNOLD_WORKER_GPU}"
  --nnodes="${ARNOLD_WORKER_NUM}"
  --node_rank="${ARNOLD_ID}"
  --master_addr="${METIS_WORKER_0_HOST}"
  --master_port="${port}"
)

# Arguments forwarded verbatim to llava/train/train_mem.py.
train_args=(
  --deepspeed ./scripts/zero2.json
  --model_name_or_path "${model_dir}/Meta-Llama-3-8B-Instruct"
  --version plain
  --data_path "${data_dir}/annotations/video_image_caption_asr_stage1.json"
  --audio_asr_folder "${data_dir}"
  --audio_caption_folder "${data_dir}/caption_data/0818"
  --video_folder "${data_dir}/video/Video-LLaVA"
  --image_folder "${data_dir}/video/Video-LLaVA"
  --X "Audio_asr" "Audio_caption" "Video" "Image"
  --audio_tower "${model_dir}/LanguageBind_Audio_Asr"
  --audio_caption_tower "${model_dir}/pretrained_model/LanguageBind_Audio"
  --video_tower "${model_dir}/pretrained_model/LanguageBind_Video_merge"
  --image_tower "${model_dir}/pretrained_model/LanguageBind_Image"
  --mm_projector_type mlp2x_gelu
  --tune_mm_mlp_adapter True
  --mm_vision_select_layer -2
  --mm_use_x_start_end False
  --mm_use_x_patch_token False
  --bf16 True
  --output_dir ./checkpoints/Omni-Pretrain-8B-llama-0209
  --num_train_epochs 1
  --per_device_train_batch_size 4
  --per_device_eval_batch_size 16
  --gradient_accumulation_steps 2
  --evaluation_strategy "no"
  --save_strategy "steps"
  --save_steps 2000
  --save_total_limit 20
  --learning_rate 1e-3
  --weight_decay 0.
  --warmup_ratio 0.03
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --tf32 True
  --model_max_length 2048
  --tokenizer_model_max_length 3072
  --gradient_checkpointing True
  --dataloader_num_workers 8
  --lazy_preprocess True
  --report_to tensorboard
  --cache_dir "./cache_dir"
)

# ACCELERATE_CPU_AFFINITY is set for the torchrun process only.
# This must remain the last command so the script exits with torchrun's status.
ACCELERATE_CPU_AFFINITY=1 torchrun "${launch_args[@]}" \
  llava/train/train_mem.py "${train_args[@]}"