multitensor committed on
Commit 5e877ac · verified · Parent: 681e79d

Upload folder using huggingface_hub

Files changed (1)
  1. pretrain_all.sh (+61 -14)
pretrain_all.sh CHANGED
@@ -1,28 +1,75 @@
- CUDA_VISIBLE_DEVICES=0,1 deepspeed llava/train/train_mem.py \
+ #!/bin/bash
+
+ # Pretrain a multimodal model.
+ export OMP_NUM_THREADS=8
+ export NCCL_IB_DISABLE=0
+ export NCCL_IB_GID_INDEX=3
+ export NCCL_SOCKET_IFNAME=eth0
+ export NCCL_DEBUG=INFO
+ export NCCL_IB_SL=1
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_BLOCKING_WAIT=1
+ export NCCL_ASYNC_ERROR_HANDLING=1
+ export NCCL_TIMEOUT=500
+ export TORCH_DISTRIBUTED_DEBUG=DETAIL
+
+ DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
+
+ # Settings for multi-node training.
+ ports=(`echo $METIS_WORKER_0_PORT | tr ',' ' '`)
+ port=${ports[0]}
+
+ echo "total workers: ${ARNOLD_WORKER_NUM}"
+ echo "cur worker id: ${ARNOLD_ID}"
+ echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
+ echo "master ip: ${METIS_WORKER_0_HOST}"
+ echo "master port: ${port}"
+
+ source /mnt/bn/tns-algo-video-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
+
+ cd /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main
+
+ # Install necessary packages
+ pip3 install requests
+ pip3 install attrs
+ pip3 install aiofiles
+ pip3 install pynvml
+
+
+ # Print the Python, torchrun, and deepspeed executable paths, plus PYTHONPATH
+ echo "Python executable: $(which python)"
+ echo "torchrun executable: $(which torchrun)"
+ echo "deepspeed executable: $(which deepspeed)"
+ echo "PYTHONPATH before torchrun: $PYTHONPATH"
+
+ sudo chmod 777 /var/lib/fastrak -R
+
+ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
+ llava/train/train_mem.py \
  --deepspeed ./scripts/zero2.json \
- --model_name_or_path ./Meta-Llama-3.1-8B-Instruct \
+ --model_name_or_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/Meta-Llama-3.1-8B-Instruct \
  --version plain \
- --data_path train_json/video_image_asr_caption_pre.json \
- --audio_asr_folder /home/yu/Downloads \
- --audio_caption_folder /media/yu/33da5c9f-d06a-4b18-acc4-b4e1c20292471/0818 \
- --video_folder video_data \
- --image_folder /media/yu/33da5c9f-d06a-4b18-acc4-b4e1c20292471 \
+ --data_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/video_image_asr_caption_pre_1102.json \
+ --audio_asr_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data \
+ --audio_caption_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818 \
+ --video_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/Video-LLaVA \
+ --image_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/Video-LLaVA \
  --X "Audio_asr" "Audio_caption" "Video" "Image" \
- --audio_tower ./LanguageBind_Audio_Asr \
- --audio_caption_tower LanguageBind/LanguageBind_Audio \
- --video_tower LanguageBind/LanguageBind_Video_merge \
- --image_tower LanguageBind/LanguageBind_Image \
+ --audio_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
+ --audio_caption_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
+ --video_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
+ --image_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Image \
  --mm_projector_type mlp2x_gelu \
  --tune_mm_mlp_adapter True \
  --mm_vision_select_layer -2 \
  --mm_use_x_start_end False \
  --mm_use_x_patch_token False \
  --bf16 True \
- --output_dir ./checkpoints/Video-LLaVA-Pretrain-7B \
+ --output_dir ./checkpoints/Video-LLaVA-Pretrain-7B-1102 \
  --num_train_epochs 1 \
- --per_device_train_batch_size 1 \
+ --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 16 \
- --gradient_accumulation_steps 32 \
+ --gradient_accumulation_steps 1 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 2000 \
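
A note on the multi-node block added above: on this cluster, METIS_WORKER_0_PORT can hold several comma-separated ports, and the script keeps only the first one as the torchrun rendezvous port. A minimal sketch of that parsing, runnable outside the cluster (the port list below is a hypothetical example value, not from this commit):

    METIS_WORKER_0_PORT="29500,29501"                  # hypothetical example value
    ports=(`echo $METIS_WORKER_0_PORT | tr ',' ' '`)   # split the comma list into a bash array
    port=${ports[0]}                                   # first entry becomes --master_port
    echo "master port: ${port}"                        # prints: master port: 29500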