multitensor committed on
Commit
c180ed8
·
verified ·
1 Parent(s): 9c37e76

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,11 +1,8 @@
1
  {
2
  "X": [
3
- "Audio_asr",
4
- "Audio_caption",
5
- "Video",
6
- "Image"
7
  ],
8
- "_name_or_path": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018",
9
  "architectures": [
10
  "LlavaLlamaForCausalLM"
11
  ],
@@ -24,6 +21,7 @@
24
  "image_grid_pinpoints": null,
25
  "initializer_range": 0.02,
26
  "intermediate_size": 14336,
 
27
  "max_position_embeddings": 131072,
28
  "mlp_bias": false,
29
  "mm_audio_caption_tower": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio",
@@ -51,7 +49,7 @@
51
  },
52
  "rope_theta": 500000.0,
53
  "tie_word_embeddings": false,
54
- "tokenizer_model_max_length": 3072,
55
  "torch_dtype": "bfloat16",
56
  "transformers_version": "4.43.1",
57
  "tune_mm_mlp_adapter": false,
 
1
  {
2
  "X": [
3
+ "Video"
 
 
 
4
  ],
5
+ "_name_or_path": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/tmp/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018",
6
  "architectures": [
7
  "LlavaLlamaForCausalLM"
8
  ],
 
21
  "image_grid_pinpoints": null,
22
  "initializer_range": 0.02,
23
  "intermediate_size": 14336,
24
+ "is_fusion": true,
25
  "max_position_embeddings": 131072,
26
  "mlp_bias": false,
27
  "mm_audio_caption_tower": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio",
 
49
  },
50
  "rope_theta": 500000.0,
51
  "tie_word_embeddings": false,
52
+ "tokenizer_model_max_length": 6144,
53
  "torch_dtype": "bfloat16",
54
  "transformers_version": "4.43.1",
55
  "tune_mm_mlp_adapter": false,
finetune_all_multinode_stage4.sh CHANGED
@@ -27,7 +27,7 @@ echo "master port: ${port}"
27
 
28
  source /mnt/bn/tns-algo-video-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
29
 
30
- cd /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-stage4
31
 
32
  # Install necessary packages
33
  pip3 install requests
@@ -48,14 +48,11 @@ sudo chmod 777 /var/lib/fastrak -R
48
  ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
49
  llava/train/train_mem.py \
50
  --deepspeed ./scripts/zero2.json \
51
- --model_name_or_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018 \
52
  --version llama_3_1 \
53
- --data_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage4/videochatgpt_tune_stage4.json \
54
- --audio_asr_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data \
55
- --audio_caption_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/audio_caption_data_tune/audio_caption_tune/audio_caption \
56
  --video_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/Video-LLaVA \
57
- --image_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/videos_images_tune/video_images_tune/videos_images_tune \
58
- --X "Audio_asr" "Audio_caption" "Video" "Image" \
59
  --audio_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
60
  --audio_caption_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
61
  --video_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
@@ -64,10 +61,11 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=
64
  --mm_vision_select_layer -2 \
65
  --mm_use_x_start_end False \
66
  --mm_use_x_patch_token False \
 
67
  --image_aspect_ratio pad \
68
  --group_by_modality_length True \
69
  --bf16 True \
70
- --output_dir ./checkpoints/OmniFusion-8B-stage4-1018 \
71
  --num_train_epochs 1 \
72
  --per_device_train_batch_size 8 \
73
  --per_device_eval_batch_size 4 \
@@ -82,8 +80,8 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=
82
  --lr_scheduler_type "cosine" \
83
  --logging_steps 1 \
84
  --tf32 True \
85
- --model_max_length 2048 \
86
- --tokenizer_model_max_length 3072 \
87
  --gradient_checkpointing True \
88
  --dataloader_num_workers 8 \
89
  --lazy_preprocess True \
 
27
 
28
  source /mnt/bn/tns-algo-video-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
29
 
30
+ cd /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main
31
 
32
  # Install necessary packages
33
  pip3 install requests
 
48
  ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
49
  llava/train/train_mem.py \
50
  --deepspeed ./scripts/zero2.json \
51
+ --model_name_or_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/tmp/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018 \
52
  --version llama_3_1 \
53
+ --data_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage4_1031/videochatgpt_tune_stage4.json \
 
 
54
  --video_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/Video-LLaVA \
55
+ --X "Video" \
 
56
  --audio_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
57
  --audio_caption_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
58
  --video_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
 
61
  --mm_vision_select_layer -2 \
62
  --mm_use_x_start_end False \
63
  --mm_use_x_patch_token False \
64
+ --is_fusion True \
65
  --image_aspect_ratio pad \
66
  --group_by_modality_length True \
67
  --bf16 True \
68
+ --output_dir ./checkpoints/OmniFusion-8B-stage4-1031 \
69
  --num_train_epochs 1 \
70
  --per_device_train_batch_size 8 \
71
  --per_device_eval_batch_size 4 \
 
80
  --lr_scheduler_type "cosine" \
81
  --logging_steps 1 \
82
  --tf32 True \
83
+ --model_max_length 4096 \
84
+ --tokenizer_model_max_length 6144 \
85
  --gradient_checkpointing True \
86
  --dataloader_num_workers 8 \
87
  --lazy_preprocess True \
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2089a58707cd47ab2f223e771bb25497981ab2b0a73bd577a227bee58c40472
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:346f2705a5e9ae972033847876dea0bcedd1ee56116c9c23cfa72ded222e8214
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b43ddd7482158e8dd601c333eb6073d611a28e3cb2181011091901d36a9899c1
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bab15611dfc404180dd875368f2e343eef0be202143436f74781821fdb00be9
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cecf1fc903fb3490a9ca8b236f84c935118b828210e67033835675b188a8749
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:134edcbecc2c4602a9f3276d0014ec68dfd9b379af4b1b750f6524cabb9703ab
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c90441004ce65cb1a190ba06afc2f495971903a283e3d91a06c464ea345945c3
3
  size 3851682320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ba906825d1bb6330573ed40c45cfa9dcb3e28a86ddf494f4f498321e6fbb86
3
  size 3851682320
tokenizer_config.json CHANGED
@@ -2057,7 +2057,7 @@
2057
  "input_ids",
2058
  "attention_mask"
2059
  ],
2060
- "model_max_length": 2048,
2061
  "pad_token": "<|finetune_right_pad_id|>",
2062
  "padding_side": "right",
2063
  "tokenizer_class": "PreTrainedTokenizerFast"
 
2057
  "input_ids",
2058
  "attention_mask"
2059
  ],
2060
+ "model_max_length": 4096,
2061
  "pad_token": "<|finetune_right_pad_id|>",
2062
  "padding_side": "right",
2063
  "tokenizer_class": "PreTrainedTokenizerFast"
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:853ddd18b70a28f1283a65a0fd6510ad6b8acb6481280d62e309becc95fbad13
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145ed827bee3a57b7ae1ffa2e2548128c776658cc3e524c09ac1865e2a584bf7
3
  size 6776