lym0302 committed
Commit dcf3642 · 1 Parent(s): bfceb04
Files changed (30)
  1. third_party/MMAudio/mmaudio/ext/rotary_embeddings.py +3 -3
  2. third_party/VideoLLaMA2/README.md +0 -365
  3. third_party/VideoLLaMA2/pyproject.toml +0 -41
  4. third_party/VideoLLaMA2/requirements.txt +0 -42
  5. third_party/VideoLLaMA2/scripts/custom/finetune.sh +0 -73
  6. third_party/VideoLLaMA2/scripts/custom/finetune_audio.sh +0 -72
  7. third_party/VideoLLaMA2/scripts/custom/finetune_lora.sh +0 -74
  8. third_party/VideoLLaMA2/scripts/custom/finetune_qlora.sh +0 -74
  9. third_party/VideoLLaMA2/scripts/custom/pretrain_audio.sh +0 -70
  10. third_party/VideoLLaMA2/scripts/custom/va_joint.sh +0 -80
  11. third_party/VideoLLaMA2/scripts/eval/eval_audio_TUT2017.sh +0 -44
  12. third_party/VideoLLaMA2/scripts/eval/eval_audio_clothoAQA.sh +0 -45
  13. third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVQA.sh +0 -44
  14. third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSD.sh +0 -47
  15. third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSSD.sh +0 -47
  16. third_party/VideoLLaMA2/scripts/eval/eval_audio_vocalsound.sh +0 -44
  17. third_party/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh +0 -67
  18. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh +0 -41
  19. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh +0 -46
  20. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh +0 -45
  21. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh +0 -84
  22. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_activitynet.sh +0 -54
  23. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_msvd.sh +0 -54
  24. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh +0 -58
  25. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh +0 -58
  26. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh +0 -58
  27. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh +0 -54
  28. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh +0 -54
  29. third_party/VideoLLaMA2/scripts/vllava/finetune.sh +0 -73
  30. third_party/VideoLLaMA2/scripts/vllava/pretrain.sh +0 -73
third_party/MMAudio/mmaudio/ext/rotary_embeddings.py CHANGED
@@ -7,7 +7,7 @@ from torch import Tensor
 # Ref: https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
 # Ref: https://github.com/lucidrains/rotary-embedding-torch
 
-
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 def compute_rope_rotations(length: int,
                            dim: int,
                            theta: int,
@@ -16,7 +16,7 @@ def compute_rope_rotations(length: int,
                            device: Union[torch.device, str] = 'cpu') -> Tensor:
     assert dim % 2 == 0
 
-    with torch.amp.autocast(device_type='cuda', enabled=False):
+    with torch.amp.autocast(device_type=DEVICE, enabled=False):
         pos = torch.arange(length, dtype=torch.float32, device=device)
         freqs = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
         freqs *= freq_scaling
@@ -28,7 +28,7 @@ def compute_rope_rotations(length: int,
 
 
 def apply_rope(x: Tensor, rot: Tensor) -> Tuple[Tensor, Tensor]:
-    with torch.amp.autocast(device_type='cuda', enabled=False):
+    with torch.amp.autocast(device_type=DEVICE, enabled=False):
         _x = x.float()
         _x = _x.view(*_x.shape[:-1], -1, 1, 2)
         x_out = rot[..., 0] * _x[..., 0] + rot[..., 1] * _x[..., 1]
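The change above replaces the hard-coded `device_type='cuda'` in both autocast guards with a module-level `DEVICE` constant chosen at import time, so the RoPE helpers no longer assume a GPU is present. Below is a minimal sketch of the same pattern; `rope_freqs` is a hypothetical helper written for illustration and is not part of MMAudio, with only the `DEVICE` selection logic carried over from the diff.

```python
import torch

# Pick the autocast device type once, at import time, so the same code path
# works on CUDA machines and CPU-only machines alike (mirrors the DEVICE
# constant introduced in the diff above).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def rope_freqs(length: int, dim: int, theta: float = 10000.0) -> torch.Tensor:
    """Hypothetical helper: compute RoPE position-frequency products in float32."""
    assert dim % 2 == 0
    # Autocast is explicitly disabled around the frequency math so it always
    # runs in float32, even inside a mixed-precision region; using DEVICE here
    # avoids naming 'cuda' on machines that do not have it.
    with torch.amp.autocast(device_type=DEVICE, enabled=False):
        pos = torch.arange(length, dtype=torch.float32)
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        return torch.outer(pos, freqs)  # shape: (length, dim // 2)


if __name__ == '__main__':
    print(rope_freqs(8, 16).shape)  # torch.Size([8, 8])
```

Keeping the angle computation in float32 is the usual precaution for rotary embeddings, since half-precision products of positions and inverse frequencies lose accuracy at long sequence lengths.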
third_party/VideoLLaMA2/README.md DELETED
@@ -1,365 +0,0 @@
1
- <p align="center">
2
- <img src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/blob/e7bc34e0e9a96d77947a75b54399d9f96ccf209d/assets/logo.png" width="150" style="margin-bottom: 0.2;"/>
3
- <p>
4
-
5
- <h3 align="center"><a href="https://arxiv.org/abs/2406.07476" style="color:#9C276A">
6
- VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</a></h3>
7
- <h5 align="center"> If our project helps you, please give us a star ⭐ on GitHub to support us. 🙏🙏 </h5>
8
-
9
- <h5 align="center">
10
-
11
- [![hf_space](https://img.shields.io/badge/🤗-Demo-9C276A.svg)](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2)
12
- [![hf_checkpoint](https://img.shields.io/badge/🤗-Checkpoints-9C276A.svg)](https://huggingface.co/collections/DAMO-NLP-SG/videollama-2-6669b6b6f0493188305c87ed)
13
- [![hf_data](https://img.shields.io/badge/🤗-MSVC-9C276A.svg)](https://huggingface.co/datasets/DAMO-NLP-SG/Multi-Source-Video-Captioning)
14
- [![arXiv](https://img.shields.io/badge/Arxiv-2406.07476-AD1C18.svg?logo=arXiv)](https://arxiv.org/abs/2406.07476) <br>
15
- [![License](https://img.shields.io/badge/License-Apache%202.0-yellow)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/blob/main/LICENSE)
16
- [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FDAMO-NLP-SG%2FVideoLLaMA2&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visitor&edge_flat=false)](https://hits.seeyoufarm.com)
17
- [![GitHub issues](https://img.shields.io/github/issues/DAMO-NLP-SG/VideoLLaMA2?color=critical&label=Issues)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues?q=is%3Aopen+is%3Aissue)
18
- [![GitHub closed issues](https://img.shields.io/github/issues-closed/DAMO-NLP-SG/VideoLLaMA2?color=success&label=Issues)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues?q=is%3Aissue+is%3Aclosed) <br>
19
-
20
- </h5>
21
-
22
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-egoschema-1)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-egoschema-1?p=videollama-2-advancing-spatial-temporal) <br>
23
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/video-question-answering-on-perception-test)](https://paperswithcode.com/sota/video-question-answering-on-perception-test?p=videollama-2-advancing-spatial-temporal) <br>
24
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/video-question-answering-on-mvbench)](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=videollama-2-advancing-spatial-temporal) <br>
25
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-video-mme-1)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-video-mme-1?p=videollama-2-advancing-spatial-temporal) <br>
26
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-video-mme)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-video-mme?p=videollama-2-advancing-spatial-temporal) <br>
27
-
28
- <details open><summary>💡 Some other multimodal-LLM projects from our team may interest you ✨. </summary><p>
29
- <!-- may -->
30
-
31
- > [**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**](https://github.com/DAMO-NLP-SG/Video-LLaMA) <br>
32
- > Hang Zhang, Xin Li, Lidong Bing <br>
33
- [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/Video-LLaMA) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/Video-LLaMA.svg?style=social)](https://github.com/DAMO-NLP-SG/Video-LLaMA) [![arXiv](https://img.shields.io/badge/Arxiv-2306.02858-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2306.02858) <br>
34
-
35
- > [**VCD: Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding**](https://arxiv.org/abs/2311.16922) <br>
36
- > Sicong Leng, Hang Zhang, Guanzheng Chen, Xin Li, Shijian Lu, Chunyan Miao, Lidong Bing <br>
37
- [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/VCD) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/VCD.svg?style=social)](https://github.com/DAMO-NLP-SG/VCD) [![arXiv](https://img.shields.io/badge/Arxiv-2311.16922-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.16922) <br>
38
-
39
- > [**The Curse of Multi-Modalities: Evaluating Hallucinations of Large Multimodal Models across Language, Visual, and Audio**](https://arxiv.org/abs/2410.12787) <br>
40
- > Sicong Leng, Yun Xing, Zesen Cheng, Yang Zhou, Hang Zhang, Xin Li, Deli Zhao, Shijian Lu, Chunyan Miao, Lidong Bing <br>
41
- [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/CMM) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/CMM.svg?style=social)](https://github.com/DAMO-NLP-SG/CMM) [![arXiv](https://img.shields.io/badge/Arxiv-2410.12787-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2410.12787) <br>
42
-
43
- </p></details>
44
-
45
- <div align="center"><video src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/assets/18526640/e0e7951c-f392-42ed-afad-b2c7984d3e38" width="800"></div>
46
-
47
-
48
- ## 📰 News
49
- * **[2024.10.22]** Release checkpoints of [VideoLLaMA2.1-7B-AV](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-AV).
50
- * **[2024.10.15]** Release checkpoints of [VideoLLaMA2.1-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base) and [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F).
51
- * **[2024.08.14]** Release checkpoints of [VideoLLaMA2-72B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B-Base) and [VideoLLaMA2-72B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B).
52
- * **[2024.07.30]** Release checkpoints of [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) and [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B).
53
- * **[2024.06.25]** 🔥🔥 As of Jun 25, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [MLVU Leaderboard](https://github.com/JUNJIE99/MLVU?tab=readme-ov-file#trophy-mini-leaderboard).
54
- * **[2024.06.18]** 🔥🔥 As of Jun 18, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [VideoMME Leaderboard](https://video-mme.github.io/home_page.html#leaderboard).
55
- * **[2024.06.17]** 👋👋 Update technical report with the latest results and the missing references. If you have works closely related to VideoLLaMA 2 but not mentioned in the paper, feel free to let us know.
56
- * **[2024.06.14]** 🔥🔥 [Online Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2) is available.
57
- * **[2024.06.03]** Release training, evaluation, and serving codes of VideoLLaMA 2.
58
-
59
-
60
- <img src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/assets/18526640/b9faf24f-bdd2-4728-9385-acea17ea086d" width="800" />
61
-
62
- ## 🛠️ Requirements and Installation
63
- Basic Dependencies:
64
- * Python >= 3.8
65
- * Pytorch >= 2.2.0
66
- * CUDA Version >= 11.8
67
- * transformers == 4.40.0 (for reproducing paper results)
68
- * tokenizers == 0.19.1
69
-
70
- **[Online Mode]** Install required packages (better for development):
71
- ```bash
72
- git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
73
- cd VideoLLaMA2
74
- git checkout audio_visual
75
- pip install -r requirements.txt
76
- pip install flash-attn==2.5.8 --no-build-isolation
77
- pip install opencv-python==4.5.5.64
78
- apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
79
- ```
80
-
81
- **[Offline Mode]** Install VideoLLaMA2 as a Python package (better for direct use):
82
- ```bash
83
- git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
84
- cd VideoLLaMA2
85
- git checkout audio_visual
86
- pip install --upgrade pip # enable PEP 660 support
87
- pip install -e .
88
- pip install flash-attn==2.5.8 --no-build-isolation
89
- pip install opencv-python==4.5.5.64
90
- apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
91
- ```
92
-
93
- ## 🚀 Main Results
94
-
95
- ### Multi-Choice Video QA & Video Captioning
96
- <p><img src="https://github.com/user-attachments/assets/e87fe4cf-07ea-4fde-998b-a0c63671c3b4" width="800"/></p>
97
-
98
- ### Open-Ended Video QA
99
- <p><img src="https://github.com/user-attachments/assets/80b16c04-75ac-43b8-bc22-6952fdf994bb" width="800"/></p>
100
-
101
- ### Audio QA
102
- <p><img src="https://github.com/user-attachments/assets/46e55952-5a54-4564-bcd4-cfa4edd7f36a" width="800"/></p>
103
-
104
- ### Audio-Visual QA
105
- <p><img src="https://github.com/user-attachments/assets/8114c1e3-7f93-401b-9ea6-9ce7c96d7b05" width="800"/></p>
106
-
107
-
108
- ## :earth_americas: Model Zoo
109
- ### Vision-only Checkpoints
110
- | Model Name | Model Type | Visual Encoder | Language Decoder | # Training Frames |
111
- |:----------------|:------------:|:----------------|:------------------|:----------------:|
112
- | [VideoLLaMA2-7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 8 |
113
- | [VideoLLaMA2-7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 8 |
114
- | [VideoLLaMA2-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
115
- | [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
116
- | [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
117
- | [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
118
- | [VideoLLaMA2-72B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 |
119
- | [VideoLLaMA2-72B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 |
120
- | [VideoLLaMA2.1-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base) | Base | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 16 |
121
- | [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F) | Chat | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 16 |
122
-
123
- ### Audio-Visual Checkpoints
124
- | Model Name | Type | Audio Encoder | Language Decoder |
125
- |:-------------------|:----------------|:----------------|:------------------|
126
- | [VideoLLaMA2.1-7B-AV](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-AV) | Chat | [Fine-tuned BEATs_iter3+(AS2M)(cpt2)](https://1drv.ms/u/s!AqeByhGUtINrgcpj8ujXH1YUtxooEg?e=E9Ncea) | [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F) |
127
-
128
-
129
- ## [🤗 Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2-AV)
130
-
131
- It is highly recommended to try our [online demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2-AV) first.
132
-
133
- To run the video-LLM web demo on your own device, first prepare the required model checkpoints, then follow the steps below to launch the demo.
134
-
135
- ### Single-model Version
136
-
137
- * Launch a gradio app directly ([VideoLLaMA2.1-7B-AV](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-AV) is adopted by default):
138
- ```bash
139
- python videollama2/serve/gradio_web_server_adhoc_av.py
140
- ```
141
-
142
- ## 🗝️ Training & Evaluation
143
-
144
- ### Quick Start
145
-
146
- To facilitate further development on top of our codebase, we provide a quick-start guide on how to train a customized [VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2) on the [VideoLLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) dataset and evaluate the trained model on mainstream video-LLM benchmarks.
147
-
148
- 1. Training Data Structure:
149
- Follow the [main branch](https://github.com/DAMO-NLP-SG/VideoLLaMA2/tree/main) of this VideoLLaMA2 codebase.
150
- 2. Command:
151
- ```bash
152
- # VideoLLaMA2.1-audio pretraining
153
- bash scripts/custom/pretrain_audio.sh
154
- # VideoLLaMA2.1-audio finetuning
155
- bash scripts/custom/finetune_audio.sh
156
- # VideoLLaMA2.1-audio_visual finetuning
157
- bash scripts/custom/va_joint.sh
158
- ```
159
- 3. Evaluation Data Structure:
160
- Follow the [main branch](https://github.com/DAMO-NLP-SG/VideoLLaMA2/tree/main) of this VideoLLaMA2 codebase.
161
-
162
- 4. Command:
163
- ```bash
164
- # ClothoAQA.sh evaluation
165
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_clothoAQA.sh
166
- # TUT2017 evaluation
167
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_TUT2017.sh
168
- # VocalSound evaluation
169
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_vocalsound.sh
170
- # AVQA_music evaluation
171
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_video_AVQA.sh
172
- # AVSD evaluation (need to set azure openai key/endpoint/deployname)
173
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_video_AVSD.sh
174
- # AVSSD evaluation (need to set azure openai key/endpoint/deployname)
175
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_video_AVSSD.sh
176
- ```
177
-
178
- ### Data Format
179
-
180
- To train a video-LLM on your own data, follow the procedure below to prepare the audio/video/image SFT data:
181
-
182
- 1. Suppose your data structure is like:
183
- ```bash
184
- VideoLLaMA2
185
- ├── datasets
186
- │ ├── custom_sft
187
- │ | ├── audio
188
- │ | ├── video
189
- │ | ├── image
190
- | | └── custom.json
191
- ```
192
- 2. Then you should re-organize the annotated audio/video/image sft data according to the following format:
193
- ```json
194
- [
195
- {
196
- "id": 0,
197
- "audio": "audio/xxx.wav",
198
- "conversations": [
199
- {
200
- "from": "human",
201
- "value": "<audio>\nPlease describe the sound event within the audio."
202
- },
203
- {
204
- "from": "gpt",
205
- "value": "Loud television static dips in and out of focus."
206
- },
207
- ...
208
- ],
209
- }
210
- {
211
- "id": 1,
212
- "image": "images/xxx.jpg",
213
- "conversations": [
214
- {
215
- "from": "human",
216
- "value": "<image>\nWhat are the colors of the bus in the image?"
217
- },
218
- {
219
- "from": "gpt",
220
- "value": "The bus in the image is white and red."
221
- },
222
- ...
223
- ],
224
- }
225
- {
226
- "id": 2,
227
- "video": "videos/xxx.mp4",
228
- "conversations": [
229
- {
230
- "from": "human",
231
- "value": "<video>\nWhat are the main activities that take place in the video?"
232
- },
233
- {
234
- "from": "gpt",
235
- "value": "The main activities that take place in the video are the preparation of camera equipment by a man, a group of men riding a helicopter, and a man sailing a boat through the water."
236
- },
237
- ...
238
- ],
239
- },
240
- ...
241
- ]
242
- ```
243
- 3. Modify the `scripts/custom/finetune_audio.sh`:
244
- ```bash
245
- ...
246
- --data_path datasets/custom_sft/custom.json
247
- --data_folder datasets/custom_sft/
248
- --pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2.1-7B-16F)
249
- ...
250
- ```
251
- 4. Modify the `scripts/custom/va_joint.sh`:
252
- ```bash
253
- ...
254
- --data_path datasets/custom_sft/custom.json
255
- --data_folder datasets/custom_sft/
256
- --pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2.1-7B-16F)
257
- ...
258
- ```
259
-
260
- ## 🤖 Inference
261
-
262
- Audio/Video-Audio Inference:
263
- ```python
264
- import sys
265
- sys.path.append('./')
266
- from videollama2 import model_init, mm_infer
267
- from videollama2.utils import disable_torch_init
268
- import argparse
269
-
270
- def inference(args):
271
-
272
- model_path = args.model_path
273
- model, processor, tokenizer = model_init(model_path)
274
-
275
- if args.modal_type == "a":
276
- model.model.vision_tower = None
277
- elif args.modal_type == "v":
278
- model.model.audio_tower = None
279
- elif args.modal_type == "av":
280
- pass
281
- else:
282
- raise NotImplementedError
283
- # Audio-visual Inference
284
- audio_video_path = "assets/00000368.mp4"
285
- preprocess = processor['audio' if args.modal_type == "a" else "video"]
286
- if args.modal_type == "a":
287
- audio_video_tensor = preprocess(audio_video_path)
288
- else:
289
- audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
290
- question = f"Who plays the instrument louder?"
291
-
292
- # Audio Inference
293
- audio_video_path = "assets/bird-twitter-car.wav"
294
- preprocess = processor['audio' if args.modal_type == "a" else "video"]
295
- if args.modal_type == "a":
296
- audio_video_tensor = preprocess(audio_video_path)
297
- else:
298
- audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
299
- question = f"Please describe the audio:"
300
-
301
- # Video Inference
302
- audio_video_path = "assets/output_v_1jgsRbGzCls.mp4"
303
- preprocess = processor['audio' if args.modal_type == "a" else "video"]
304
- if args.modal_type == "a":
305
- audio_video_tensor = preprocess(audio_video_path)
306
- else:
307
- audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
308
- question = f"What activity are the people practicing in the video?"
309
-
310
- output = mm_infer(
311
- audio_video_tensor,
312
- question,
313
- model=model,
314
- tokenizer=tokenizer,
315
- modal='audio' if args.modal_type == "a" else "video",
316
- do_sample=False,
317
- )
318
-
319
- print(output)
320
-
321
-
322
- if __name__ == "__main__":
323
- parser = argparse.ArgumentParser()
324
-
325
- parser.add_argument('--model-path', help='', required=True)
326
- parser.add_argument('--modal-type', choices=["a", "v", "av"], help='', required=True)
327
- args = parser.parse_args()
328
-
329
- inference(args)
330
-
331
- ```
332
-
333
- ## 📑 Citation
334
-
335
- If you find VideoLLaMA useful for your research and applications, please cite using this BibTeX:
336
- ```bibtex
337
- @article{damonlpsg2024videollama2,
338
- title={VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs},
339
- author={Cheng, Zesen and Leng, Sicong and Zhang, Hang and Xin, Yifei and Li, Xin and Chen, Guanzheng and Zhu, Yongxin and Zhang, Wenqi and Luo, Ziyang and Zhao, Deli and Bing, Lidong},
340
- journal={arXiv preprint arXiv:2406.07476},
341
- year={2024},
342
- url = {https://arxiv.org/abs/2406.07476}
343
- }
344
-
345
- @article{damonlpsg2023videollama,
346
- title = {Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding},
347
- author = {Zhang, Hang and Li, Xin and Bing, Lidong},
348
- journal = {arXiv preprint arXiv:2306.02858},
349
- year = {2023},
350
- url = {https://arxiv.org/abs/2306.02858}
351
- }
352
- ```
353
-
354
- ## 👍 Acknowledgement
355
- The codebase of VideoLLaMA 2 is adapted from [**LLaVA 1.5**](https://github.com/haotian-liu/LLaVA) and [**FastChat**](https://github.com/lm-sys/FastChat). We are also grateful to the following projects that VideoLLaMA 2 builds on:
356
- * [**LLaMA 2**](https://github.com/meta-llama/llama), [**Mistral-7B**](https://mistral.ai/news/announcing-mistral-7b/), [**OpenAI CLIP**](https://openai.com/index/clip/), [**Honeybee**](https://github.com/kakaobrain/honeybee).
357
- * [**Video-ChatGPT**](https://github.com/mbzuai-oryx/Video-ChatGPT), [**Video-LLaVA**](https://github.com/PKU-YuanGroup/Video-LLaVA).
358
- * [**WebVid**](https://github.com/m-bain/webvid), [**Panda-70M**](https://github.com/snap-research/Panda-70M), [**LanguageBind**](https://github.com/PKU-YuanGroup/LanguageBind), [**InternVid**](https://github.com/OpenGVLab/InternVideo/tree/main/Data/InternVid).
359
- * [**VideoChat2**](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2), [**Valley**](https://github.com/RupertLuo/Valley), [**VTimeLLM**](https://github.com/huangb23/VTimeLLM), [**ShareGPT4V**](https://sharegpt4v.github.io/).
360
-
361
-
362
- ## 🔒 License
363
-
364
- This project is released under the Apache 2.0 license as found in the LICENSE file.
365
- The service is a research preview intended for **non-commercial use ONLY**, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
third_party/VideoLLaMA2/pyproject.toml DELETED
@@ -1,41 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=61.0"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "videollama2"
7
- version = "1.0"
8
- description = "Release of VideoLLaMA2"
9
- readme = "README.md"
10
- requires-python = ">=3.8"
11
- classifiers = [
12
- "Programming Language :: Python :: 3",
13
- "License :: OSI Approved :: Apache Software License",
14
- ]
15
- dependencies = [
16
- "torch==2.2.0", "torchvision==0.17.0", "torchaudio==2.2.0", "librosa",
17
- "transformers==4.42.3", "tokenizers==0.19.1",
18
- "deepspeed==0.13.1", "accelerate==0.26.1",
19
- "peft==0.4.0", "timm==1.0.3", "numpy==1.24.4",
20
- "decord==0.6.0", "imageio==2.34.0", "imageio-ffmpeg==0.4.9",
21
- "moviepy==1.0.3", "scenedetect==0.6.3",
22
- "opencv-python==4.6.0.66", "pysubs2",
23
- "scikit-learn==1.2.2", "huggingface_hub==0.23.4", "sentencepiece==0.1.99",
24
- "shortuuid", "einops==0.6.1", "einops-exts==0.0.4",
25
- "bitsandbytes==0.43.0", "pydantic>=2.0", "markdown2[all]",
26
- "gradio==3.50.0", "gradio_client==0.6.1", "httpx==0.24.1",
27
- "requests", "openai", "uvicorn", "fastapi", "tensorboard", "wandb", "tabulate"
28
- ]
29
-
30
- [project.optional-dependencies]
31
- train = ["ninja"]
32
-
33
- [project.urls]
34
- "Homepage" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2"
35
- "Bug Tracker" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues"
36
-
37
- [tool.setuptools.packages.find]
38
- exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
39
-
40
- [tool.wheel]
41
- exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
third_party/VideoLLaMA2/requirements.txt DELETED
@@ -1,42 +0,0 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu118
2
- # basic dependencies
3
- torch==2.2.0
4
- torchaudio==2.2.0
5
- torchvision==0.17.0
6
- transformers==4.42.3
7
- tokenizers==0.19.1
8
- deepspeed==0.13.1
9
- accelerate==0.26.1
10
- peft==0.4.0
11
- timm==1.0.3
12
- numpy==1.24.4
13
- # data processing
14
- decord==0.6.0
15
- imageio==2.34.0
16
- imageio-ffmpeg==0.4.9
17
- moviepy==1.0.3
18
- scenedetect==0.6.3
19
- opencv-python==4.6.0.66
20
- pysubs2
21
- librosa
22
- pytorchvideo
23
- # misc
24
- scikit-learn==1.2.2
25
- huggingface_hub==0.23.4
26
- sentencepiece==0.1.99
27
- shortuuid
28
- einops==0.6.1
29
- einops-exts==0.0.4
30
- bitsandbytes==0.43.0
31
- pydantic>=2.0
32
- markdown2[all]
33
- gradio==3.50.0
34
- gradio_client==0.6.1
35
- httpx==0.24.1
36
- openai==1.33.0
37
- requests
38
- uvicorn
39
- fastapi
40
- tensorboard
41
- wandb
42
- tabulate
third_party/VideoLLaMA2/scripts/custom/finetune.sh DELETED
@@ -1,73 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=${3:-0}
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
- RUN_NAME=siglip_tcv35_7b_16f
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train.py \
42
- --deepspeed scripts/zero3.json \
43
- --model_type videollama2_qwen2 \
44
- --model_path Qwen/Qwen2-7B-Instruct \
45
- --vision_tower google/siglip-so400m-patch14-384 \
46
- --mm_projector_type stc_connector_v35 \
47
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
48
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
- --data_folder ${DATA_DIR}/videollava_sft/ \
50
- --mm_vision_select_layer -2 \
51
- --image_aspect_ratio pad \
52
- --num_frames 16 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --save_strategy "steps" \
62
- --save_steps 500 \
63
- --save_total_limit 99 \
64
- --learning_rate 2e-5 \
65
- --weight_decay 0. \
66
- --warmup_ratio 0.03 \
67
- --lr_scheduler_type "cosine" \
68
- --logging_steps 1 \
69
- --model_max_length 2048 \
70
- --gradient_checkpointing True \
71
- --dataloader_num_workers 4 \
72
- --report_to tensorboard \
73
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/finetune_audio.sh DELETED
@@ -1,72 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=audio_stage2_qwen2
32
- RUN_NAME=audio_stage2_qwen2
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
- torchrun --nnodes $WORLD_SIZE \
36
- --nproc_per_node $NPROC_PER_NODE \
37
- --master_addr=$MASTER_ADDR \
38
- --master_port=$MASTER_PORT \
39
- --node_rank $RANK \
40
- videollama2/train.py \
41
- --deepspeed scripts/zero2.json \
42
- --model_type videollama2_qwen2 \
43
- --model_path DAMO-NLP-SG/VideoLLaMA2.1-7B-16F \
44
- --data_path_a ${DATA_DIR}/stage2_audio_text.json \
45
- --audio_tower ./BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt \
46
- --pretrain_mm_mlp_adapter_a $OUTP_DIR/mm_projector_a.bin \
47
- --mm_projector_a_type mlp2x_gelu \
48
- --tune_mm_mlp_adapter_a True \
49
- --tune_audio_tower True \
50
- --bf16 True \
51
- --tf32 True \
52
- --fp16 False \
53
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
54
- --num_train_epochs 2 \
55
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
56
- --per_device_eval_batch_size 4 \
57
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
58
- --evaluation_strategy "no" \
59
- --save_strategy "steps" \
60
- --save_steps 2000 \
61
- --save_total_limit 2 \
62
- --learning_rate 2e-5 \
63
- --weight_decay 0. \
64
- --warmup_ratio 0.03 \
65
- --lr_scheduler_type "cosine" \
66
- --logging_steps 1 \
67
- --model_max_length 2048 \
68
- --gradient_checkpointing True \
69
- --dataloader_num_workers 4 \
70
- --lazy_preprocess True \
71
- --report_to tensorboard \
72
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/finetune_lora.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=${3:-0}
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
- RUN_NAME=siglip_tcv35_7b_16f_lora
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train.py \
42
- --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_qwen2 \
45
- --model_path Qwen/Qwen2-7B-Instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 16 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 1 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 2e-5 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --report_to tensorboard \
74
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/finetune_qlora.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=${3:-0}
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
- RUN_NAME=siglip_tcv35_7b_16f_qlora
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train.py \
42
- --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --bits 4 \
43
- --deepspeed scripts/zero2.json \
44
- --model_type videollama2_qwen2 \
45
- --model_path Qwen/Qwen2-7B-Instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 16 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 1 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 2e-5 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --report_to tensorboard \
74
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/pretrain_audio.sh DELETED
@@ -1,70 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
- # Multiple conditions
10
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
11
- WORLD_SIZE=$ARG_WORLD_SIZE
12
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
13
- fi
14
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
15
- MASTER_ADDR=$ARG_MASTER_ADDR
16
- MASTER_PORT=$ARG_MASTER_PORT
17
- RANK=$ARG_RANK
18
- fi
19
-
20
- echo "WORLD_SIZE: $WORLD_SIZE"
21
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
22
-
23
- # Training Arguments
24
- GLOBAL_BATCH_SIZE=1024
25
- LOCAL_BATCH_SIZE=32
26
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
27
-
28
- # Log Arguments
29
- export TRANSFORMERS_OFFLINE=1
30
- export WANDB_PROJECT=videollama2qwen2_audio_stage1
31
- RUN_NAME=videollama2qwen2_audio_stage1
32
- DATA_DIR=datasets
33
- OUTP_DIR=work_dirs
34
- torchrun --nnodes $WORLD_SIZE \
35
- --nproc_per_node $NPROC_PER_NODE \
36
- --master_addr=$MASTER_ADDR \
37
- --master_port=$MASTER_PORT \
38
- --node_rank $RANK \
39
- videollama2/train.py \
40
- --deepspeed scripts/zero2.json \
41
- --model_type videollama2_qwen2 \
42
- --model_path DAMO-NLP-SG/VideoLLaMA2.1-7B-16F \
43
- --data_path_a ${DATA_DIR}/stage1_pretrain.json \
44
- --audio_tower ./BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt \
45
- --mm_projector_a_type mlp2x_gelu \
46
- --tune_mm_mlp_adapter_a True \
47
- --mm_vision_select_layer -1 \
48
- --bf16 True \
49
- --tf32 True \
50
- --fp16 False \
51
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
52
- --num_train_epochs 1 \
53
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
54
- --per_device_eval_batch_size 4 \
55
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
56
- --evaluation_strategy "no" \
57
- --save_strategy "steps" \
58
- --save_steps 1000 \
59
- --save_total_limit 1 \
60
- --learning_rate 1e-3 \
61
- --weight_decay 0. \
62
- --warmup_ratio 0.03 \
63
- --lr_scheduler_type "cosine" \
64
- --logging_steps 1 \
65
- --model_max_length 2048 \
66
- --gradient_checkpointing True \
67
- --dataloader_num_workers 4 \
68
- --lazy_preprocess True \
69
- --report_to tensorboard \
70
- --run_name pretrain_$RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/va_joint.sh DELETED
@@ -1,80 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=audio_visual_stage3_qwen2
32
- RUN_NAME=audio_visual_stage3_qwen2
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
- torchrun --nnodes $WORLD_SIZE \
36
- --nproc_per_node $NPROC_PER_NODE \
37
- --master_addr=$MASTER_ADDR \
38
- --master_port=$MASTER_PORT \
39
- --node_rank $RANK \
40
- videollama2/train.py \
41
- --deepspeed scripts/zero2.json \
42
- --model_type videollama2_qwen2 \
43
- --model_path DAMO-NLP-SG/VideoLLaMA2.1-7B-16F \
44
- --data_folder ${DATA_DIR} \
45
- --data_path ${DATA_DIR}/stage3_video_audio.json,${DATA_DIR}/stage2_audio_subset_new.json,${DATA_DIR}/stage2_video_subset.json \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --audio_tower $OUTP_DIR/audio_tower.bin \
48
- --pretrain_mm_mlp_adapter_a $OUTP_DIR/mm_projector_a.bin \
49
- --mm_projector_type stc_connector_v35 \
50
- --mm_projector_a_type mlp2x_gelu \
51
- --va True \
52
- --tune_audio_tower True \
53
- --tune_adapter_llm True \
54
- --tune_mm_mlp_adapter_a True \
55
- --mm_vision_select_layer -2 \
56
- --image_aspect_ratio pad \
57
- --num_frames 16 \
58
- --bf16 True \
59
- --tf32 True \
60
- --fp16 False \
61
- --output_dir $OUTP_DIR/${WANDB_PROJECT}/VideoLLaMA2.1-7B-AV \
62
- --num_train_epochs 2 \
63
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
64
- --per_device_eval_batch_size 4 \
65
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
66
- --evaluation_strategy "no" \
67
- --save_strategy "steps" \
68
- --save_steps 2000 \
69
- --save_total_limit 2 \
70
- --learning_rate 2e-5 \
71
- --weight_decay 0. \
72
- --warmup_ratio 0.03 \
73
- --lr_scheduler_type "cosine" \
74
- --logging_steps 1 \
75
- --model_max_length 2048 \
76
- --gradient_checkpointing True \
77
- --dataloader_num_workers 4 \
78
- --lazy_preprocess True \
79
- --report_to tensorboard \
80
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/eval/eval_audio_TUT2017.sh DELETED
@@ -1,44 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/TUT2017/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio.py \
22
- --model-path ${CKPT} \
23
- --dataset TUT2017 \
24
- --video-folder ${EVAL_DATA_DIR}/TUT2017 \
25
- --question-file ${EVAL_DATA_DIR}/TUT2017/tut2017_eval.jsonl \
26
- --answer-file ${EVAL_DATA_DIR}/TUT2017/tut2017_eval.jsonl \
27
- --output-file ${OUTPUT_DIR}/TUT2017/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX &
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/TUT2017/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python videollama2/eval/eval_audio_TUT2017.py \
44
- --pred-path ${output_file}
third_party/VideoLLaMA2/scripts/eval/eval_audio_clothoAQA.sh DELETED
@@ -1,45 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
-
7
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
8
-
9
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
10
- IFS=',' read -ra GPULIST <<< "$gpu_list"
11
-
12
- # divide data via the number of GPUs per task
13
- GPUS_PER_TASK=1
14
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
15
-
16
- output_file=${OUTPUT_DIR}/clothoAQA/answers/${CKPT_NAME}/merge.json
17
-
18
- if [ ! -f "$output_file" ]; then
19
- for IDX in $(seq 0 $((CHUNKS-1))); do
20
- # select the GPUs for the task
21
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
22
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio.py \
23
- --model-path ${CKPT} \
24
- --dataset clothoAQA \
25
- --video-folder ${EVAL_DATA_DIR}/ClothoAQA/audio_files \
26
- --question-file ${EVAL_DATA_DIR}/clothoAQA_eval.json \
27
- --answer-file ${EVAL_DATA_DIR}/clothoAQA_eval.json \
28
- --output-file ${OUTPUT_DIR}/clothoAQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
29
- --num-chunks $CHUNKS \
30
- --chunk-idx $IDX &
31
- done
32
-
33
- wait
34
-
35
- # Clear out the output file if it exists.
36
- > "$output_file"
37
-
38
- #Loop through the indices and concatenate each file.
39
- for IDX in $(seq 0 $((CHUNKS-1))); do
40
- cat ${OUTPUT_DIR}/clothoAQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
41
- done
42
- fi
43
-
44
- python videollama2/eval/eval_audio_clothoAQA.py \
45
- --pred-path ${output_file}
third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVQA.sh DELETED
@@ -1,44 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/AVQA/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio_video.py \
22
- --model-path ${CKPT} \
23
- --dataset AVQA \
24
- --video-folder ${EVAL_DATA_DIR}/AVQA_music/MUSIC-AVQA-videos \
25
- --question-file ${EVAL_DATA_DIR}/AVQA_music/AVQA_music_test.json \
26
- --answer-file ${EVAL_DATA_DIR}/AVQA_music/AVQA_music_test.json \
27
- --output-file ${OUTPUT_DIR}/AVQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/AVQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python3 videollama2/eval/eval_audio_video_AVQA.py \
44
- --pred-path ${output_file}
third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSD.sh DELETED
@@ -1,47 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/AVSD/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio_video.py \
22
- --model-path ${CKPT} \
23
- --dataset AVSD \
24
- --video-folder ${EVAL_DATA_DIR}/AVSD/Charades_v1_480 \
25
- --question-file ${EVAL_DATA_DIR}/AVSD/instruction_val.json \
26
- --answer-file ${EVAL_DATA_DIR}/AVSD/instruction_val.json \
27
- --output-file ${OUTPUT_DIR}/AVSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/AVSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python videollama2/eval/eval_audio_video_AVSD.py \
44
- --pred-path /mnt/data/xyf/VideoLLaMA2_backup/eval_output/AVSD/answers/vlb_audio_visual_stage3_tuning_projector_beats_qwen2_videollm_ep2/merge.json \
45
- --api-key $AZURE_API_KEY \
46
- --api-endpoint $AZURE_API_ENDPOINT \
47
- --api-deployname $AZURE_API_DEPLOYNAME
third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSSD.sh DELETED
@@ -1,47 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/AVSSD/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio_video.py \
22
- --model-path ${CKPT} \
23
- --dataset AVSSD \
24
- --video-folder ${EVAL_DATA_DIR}/VGGSound_final/video \
25
- --question-file ${EVAL_DATA_DIR}/avssd_test.json \
26
- --answer-file ${EVAL_DATA_DIR}/avssd_test.json \
27
- --output-file ${OUTPUT_DIR}/AVSSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/AVSSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python videollama2/eval/eval_audio_video_AVSSD.py \
44
- --pred-path ${output_file} \
45
- --api-key f68a11a54a064caa851e290258d52cce \
46
- --api-endpoint https://vl-australiaeast.openai.azure.com/ \
47
- --api-deployname gpt35-turbo-0613
third_party/VideoLLaMA2/scripts/eval/eval_audio_vocalsound.sh DELETED
@@ -1,44 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/vocalsound/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio.py \
- --model-path ${CKPT} \
- --dataset vocalsound \
- --video-folder ${EVAL_DATA_DIR}/vocal/audio_16k \
- --question-file ${EVAL_DATA_DIR}/vocal/vocalsound_eval.jsonl \
- --answer-file ${EVAL_DATA_DIR}/vocal/vocalsound_eval.jsonl \
- --output-file ${OUTPUT_DIR}/vocalsound/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/vocalsound/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
- python videollama2/eval/eval_audio_vocalsound.py \
- --pred-path ${output_file}
 
third_party/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh DELETED
@@ -1,67 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/merge.json
-
- # judge if the number of json lines is 0
- if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
- rm -f ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/*.json
- fi
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_cap_msvc.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/msvc \
- --question-file ${EVAL_DATA_DIR}/msvc/msvc.json \
- --output-file ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_cap_msvc_correctness.py \
- --pred-path $output_file \
- --output-dir ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/correctness_gpt \
- --output-json ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/correctness_results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4 \
-
- python3 videollama2/eval/eval_video_cap_msvc_detailedness.py \
- --pred-path $output_file \
- --output-dir ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/detailedness_gpt \
- --output-json ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/detailedness_results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4 \
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh DELETED
@@ -1,41 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/merge.csv
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_egoschema.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/egoschema/good_clips_git \
- --question-file ${EVAL_DATA_DIR}/egoschema/questions.json \
- --answer-file ${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.csv \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- echo 'q_uid, answer' >> "$output_file"
-
- # Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.csv >> "$output_file"
- done
- fi
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh DELETED
@@ -1,46 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/merge.json
-
- # judge if the number of json lines is 0
- if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
- rm -f ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/*.json
- fi
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_mvbench.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/mvbench/video \
- --question-file ${EVAL_DATA_DIR}/mvbench/json \
- --answer-file ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- # Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
- python3 videollama2/eval/eval_video_mcqa_mvbench.py \
- --pred_path ${output_file} \
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh DELETED
@@ -1,45 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_perception_test_mcqa.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/perception_test_mcqa/videos \
- --question-file ${EVAL_DATA_DIR}/perception_test_mcqa/mc_question_test.json \
- --answer-file ${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- echo "{" >> "$output_file"
-
- # Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- sed -i '$s/.$//' $output_file
-
- echo "}" >> "$output_file"
- fi
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh DELETED
@@ -1,84 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/merge.json
- output_sub_file=${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/merge_sub.json
-
- # judge if the number of json lines is 0
- if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
- rm -f ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/*.json
- fi
-
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_videomme.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videomme/videos \
- --subtitle-folder ${EVAL_DATA_DIR}/videomme/subtitles \
- --question-file ${EVAL_DATA_DIR}/videomme/test-00000-of-00001.parquet \
- --answer-file ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- echo "[" >> "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- sed -i '$s/.$//' $output_file
-
- echo "]" >> "$output_file"
-
- # Clear out the output file if it exists.
- > "$output_sub_file"
-
- echo "[" >> "$output_sub_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}_sub.json >> "$output_sub_file"
- done
-
- sed -i '$s/.$//' $output_sub_file
-
- echo "]" >> "$output_sub_file"
- fi
-
-
- python videollama2/eval/eval_video_mcqa_videomme.py \
- --results_file $output_file \
- --video_duration_type "short,medium,long" \
- --return_categories_accuracy \
- --return_sub_categories_accuracy \
- --return_task_types_accuracy \
- --skip_missing \
-
- python videollama2/eval/eval_video_mcqa_videomme.py \
- --results_file $output_sub_file \
- --video_duration_type "short,medium,long" \
- --return_categories_accuracy \
- --return_sub_categories_accuracy \
- --return_task_types_accuracy \
- --skip_missing \
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_activitynet.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_activitynet.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/all_test \
- --question-file ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/test_q.json \
- --answer-file ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/test_a.json \
- --output-file ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_activitynet.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_msvd.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_activitynet.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/videos \
- --question-file ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/test_q.json \
- --answer-file ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/test_a.json \
- --output-file ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_activitynet.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh DELETED
@@ -1,58 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh DELETED
@@ -1,58 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/run_inference_video_qa_gpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh DELETED
@@ -1,58 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/run_inference_video_qa_gpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_3_context.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/merge.json
-
- # if output_file not exists then inference
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/temporal_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/merge.json
-
- # if output_file not exists then inference
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_consistency.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/consistency_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/vllava/finetune.sh DELETED
@@ -1,73 +0,0 @@
- #!/bin/bash
-
- # Environment Variables
- ARG_WORLD_SIZE=${1:-1}
- ARG_NPROC_PER_NODE=${2:-8}
- ARG_MASTER_ADDR="127.0.0.1"
- ARG_MASTER_PORT=16666
- ARG_RANK=${3:-0}
-
- # Multiple conditions
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
- WORLD_SIZE=$ARG_WORLD_SIZE
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
- fi
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
- MASTER_ADDR=$ARG_MASTER_ADDR
- MASTER_PORT=$ARG_MASTER_PORT
- RANK=$ARG_RANK
- fi
-
- echo "WORLD_SIZE: $WORLD_SIZE"
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
-
- # Training Arguments
- GLOBAL_BATCH_SIZE=128
- LOCAL_BATCH_SIZE=4
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
-
- # Log Arguments
- export TRANSFORMERS_OFFLINE=1
- export WANDB_PROJECT=videollama2qwen2_vllava
- RUN_NAME=siglip_tcv35_7b_16f
- DATA_DIR=datasets
- OUTP_DIR=work_dirs
-
- torchrun --nnodes $WORLD_SIZE \
- --nproc_per_node $NPROC_PER_NODE \
- --master_addr=$MASTER_ADDR \
- --master_port=$MASTER_PORT \
- --node_rank $RANK \
- videollama2/train.py \
- --deepspeed scripts/zero3.json \
- --model_type videollama2_qwen2 \
- --model_path Qwen/Qwen2-7B-Instruct \
- --vision_tower google/siglip-so400m-patch14-384 \
- --mm_projector_type stc_connector_v35 \
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
- --data_folder ${DATA_DIR}/videollava_sft/ \
- --mm_vision_select_layer -2 \
- --image_aspect_ratio pad \
- --num_frames 16 \
- --bf16 True \
- --tf32 True \
- --fp16 False \
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
- --num_train_epochs 1 \
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
- --per_device_eval_batch_size 4 \
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
- --save_strategy "steps" \
- --save_steps 500 \
- --save_total_limit 99 \
- --learning_rate 2e-5 \
- --weight_decay 0. \
- --warmup_ratio 0.03 \
- --lr_scheduler_type "cosine" \
- --logging_steps 1 \
- --model_max_length 2048 \
- --gradient_checkpointing True \
- --dataloader_num_workers 4 \
- --report_to tensorboard \
- --run_name $RUN_NAME \
 
third_party/VideoLLaMA2/scripts/vllava/pretrain.sh DELETED
@@ -1,73 +0,0 @@
- #!/bin/bash
-
- # Environment Variables
- ARG_WORLD_SIZE=${1:-1}
- ARG_NPROC_PER_NODE=${2:-8}
- ARG_MASTER_ADDR="127.0.0.1"
- ARG_MASTER_PORT=16666
- ARG_RANK=${3:-0}
-
- # Multiple conditions
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
- WORLD_SIZE=$ARG_WORLD_SIZE
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
- fi
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
- MASTER_ADDR=$ARG_MASTER_ADDR
- MASTER_PORT=$ARG_MASTER_PORT
- RANK=$ARG_RANK
- fi
-
- echo "WORLD_SIZE: $WORLD_SIZE"
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
-
- # Training Arguments
- GLOBAL_BATCH_SIZE=256
- LOCAL_BATCH_SIZE=8
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
-
- # Log Arguments
- export TRANSFORMERS_OFFLINE=1
- export WANDB_PROJECT=videollama2qwen2_vllava
- RUN_NAME=siglip_tcv35_7b_16f
- DATA_DIR=datasets
- OUTP_DIR=work_dirs
-
- torchrun --nnodes $WORLD_SIZE \
- --nproc_per_node $NPROC_PER_NODE \
- --master_addr=$MASTER_ADDR \
- --master_port=$MASTER_PORT \
- --node_rank $RANK \
- videollama2/train.py \
- --deepspeed scripts/zero3.json \
- --model_type videollama2_qwen2 \
- --model_path Qwen/Qwen2-7B-Instruct \
- --vision_tower google/siglip-so400m-patch14-384 \
- --mm_projector_type stc_connector_v35 \
- --tune_mm_mlp_adapter True \
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
- --data_folder ${DATA_DIR}/videollava_pt/ \
- --mm_vision_select_layer -2 \
- --num_frames 16 \
- --bf16 True \
- --tf32 True \
- --fp16 False \
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
- --num_train_epochs 1 \
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
- --per_device_eval_batch_size 4 \
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
- --evaluation_strategy "no" \
- --save_strategy "steps" \
- --save_steps 500 \
- --save_total_limit 99 \
- --learning_rate 1e-3 \
- --weight_decay 0. \
- --warmup_ratio 0.03 \
- --lr_scheduler_type "cosine" \
- --logging_steps 1 \
- --model_max_length 2048 \
- --gradient_checkpointing True \
- --dataloader_num_workers 4 \
- --report_to tensorboard \
- --run_name $RUN_NAME \