lym0302 committed
Commit dcf3642 · 1 Parent(s): bfceb04
Files changed (30)
  1. third_party/MMAudio/mmaudio/ext/rotary_embeddings.py +3 -3
  2. third_party/VideoLLaMA2/README.md +0 -365
  3. third_party/VideoLLaMA2/pyproject.toml +0 -41
  4. third_party/VideoLLaMA2/requirements.txt +0 -42
  5. third_party/VideoLLaMA2/scripts/custom/finetune.sh +0 -73
  6. third_party/VideoLLaMA2/scripts/custom/finetune_audio.sh +0 -72
  7. third_party/VideoLLaMA2/scripts/custom/finetune_lora.sh +0 -74
  8. third_party/VideoLLaMA2/scripts/custom/finetune_qlora.sh +0 -74
  9. third_party/VideoLLaMA2/scripts/custom/pretrain_audio.sh +0 -70
  10. third_party/VideoLLaMA2/scripts/custom/va_joint.sh +0 -80
  11. third_party/VideoLLaMA2/scripts/eval/eval_audio_TUT2017.sh +0 -44
  12. third_party/VideoLLaMA2/scripts/eval/eval_audio_clothoAQA.sh +0 -45
  13. third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVQA.sh +0 -44
  14. third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSD.sh +0 -47
  15. third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSSD.sh +0 -47
  16. third_party/VideoLLaMA2/scripts/eval/eval_audio_vocalsound.sh +0 -44
  17. third_party/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh +0 -67
  18. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh +0 -41
  19. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh +0 -46
  20. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh +0 -45
  21. third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh +0 -84
  22. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_activitynet.sh +0 -54
  23. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_msvd.sh +0 -54
  24. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh +0 -58
  25. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh +0 -58
  26. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh +0 -58
  27. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh +0 -54
  28. third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh +0 -54
  29. third_party/VideoLLaMA2/scripts/vllava/finetune.sh +0 -73
  30. third_party/VideoLLaMA2/scripts/vllava/pretrain.sh +0 -73
third_party/MMAudio/mmaudio/ext/rotary_embeddings.py CHANGED
@@ -7,7 +7,7 @@ from torch import Tensor
 # Ref: https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
 # Ref: https://github.com/lucidrains/rotary-embedding-torch
 
-
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 def compute_rope_rotations(length: int,
                            dim: int,
                            theta: int,
@@ -16,7 +16,7 @@ def compute_rope_rotations(length: int,
                            device: Union[torch.device, str] = 'cpu') -> Tensor:
     assert dim % 2 == 0
 
-    with torch.amp.autocast(device_type='cuda', enabled=False):
+    with torch.amp.autocast(device_type=DEVICE, enabled=False):
         pos = torch.arange(length, dtype=torch.float32, device=device)
         freqs = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
         freqs *= freq_scaling
@@ -28,7 +28,7 @@ def compute_rope_rotations(length: int,
 
 
 def apply_rope(x: Tensor, rot: Tensor) -> Tuple[Tensor, Tensor]:
-    with torch.amp.autocast(device_type='cuda', enabled=False):
+    with torch.amp.autocast(device_type=DEVICE, enabled=False):
         _x = x.float()
         _x = _x.view(*_x.shape[:-1], -1, 1, 2)
         x_out = rot[..., 0] * _x[..., 0] + rot[..., 1] * _x[..., 1]
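The change above replaces the hard-coded `device_type='cuda'` in both autocast guards with a module-level `DEVICE` constant chosen at import time, so the RoPE helpers no longer assume a GPU is present. Below is a minimal sketch of the same pattern; `rope_freqs` is a hypothetical helper written for illustration and is not part of MMAudio, with only the `DEVICE` selection logic carried over from the diff.

```python
import torch

# Pick the autocast device type once, at import time, so the same code path
# works on CUDA machines and CPU-only machines alike (mirrors the DEVICE
# constant introduced in the diff above).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def rope_freqs(length: int, dim: int, theta: float = 10000.0) -> torch.Tensor:
    """Hypothetical helper: compute RoPE position-frequency products in float32."""
    assert dim % 2 == 0
    # Autocast is explicitly disabled around the frequency math so it always
    # runs in float32, even inside a mixed-precision region; using DEVICE here
    # avoids naming 'cuda' on machines that do not have it.
    with torch.amp.autocast(device_type=DEVICE, enabled=False):
        pos = torch.arange(length, dtype=torch.float32)
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        return torch.outer(pos, freqs)  # shape: (length, dim // 2)


if __name__ == '__main__':
    print(rope_freqs(8, 16).shape)  # torch.Size([8, 8])
```

Keeping the angle computation in float32 is the usual precaution for rotary embeddings, since half-precision products of positions and inverse frequencies lose accuracy at long sequence lengths.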
third_party/VideoLLaMA2/README.md DELETED
@@ -1,365 +0,0 @@
1
- <p align="center">
2
- <img src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/blob/e7bc34e0e9a96d77947a75b54399d9f96ccf209d/assets/logo.png" width="150" style="margin-bottom: 0.2;"/>
3
- <p>
4
-
5
- <h3 align="center"><a href="https://arxiv.org/abs/2406.07476" style="color:#9C276A">
6
- VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</a></h3>
7
- <h5 align="center"> If our project helps you, please give us a star ⭐ on GitHub to support us. 🙏🙏 </h5>
8
-
9
- <h5 align="center">
10
-
11
- [![hf_space](https://img.shields.io/badge/🤗-Demo-9C276A.svg)](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2)
12
- [![hf_checkpoint](https://img.shields.io/badge/🤗-Checkpoints-9C276A.svg)](https://huggingface.co/collections/DAMO-NLP-SG/videollama-2-6669b6b6f0493188305c87ed)
13
- [![hf_data](https://img.shields.io/badge/🤗-MSVC-9C276A.svg)](https://huggingface.co/datasets/DAMO-NLP-SG/Multi-Source-Video-Captioning)
14
- [![arXiv](https://img.shields.io/badge/Arxiv-2406.07476-AD1C18.svg?logo=arXiv)](https://arxiv.org/abs/2406.07476) <br>
15
- [![License](https://img.shields.io/badge/License-Apache%202.0-yellow)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/blob/main/LICENSE)
16
- [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FDAMO-NLP-SG%2FVideoLLaMA2&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visitor&edge_flat=false)](https://hits.seeyoufarm.com)
17
- [![GitHub issues](https://img.shields.io/github/issues/DAMO-NLP-SG/VideoLLaMA2?color=critical&label=Issues)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues?q=is%3Aopen+is%3Aissue)
18
- [![GitHub closed issues](https://img.shields.io/github/issues-closed/DAMO-NLP-SG/VideoLLaMA2?color=success&label=Issues)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues?q=is%3Aissue+is%3Aclosed) <br>
19
-
20
- </h5>
21
-
22
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-egoschema-1)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-egoschema-1?p=videollama-2-advancing-spatial-temporal) <br>
23
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/video-question-answering-on-perception-test)](https://paperswithcode.com/sota/video-question-answering-on-perception-test?p=videollama-2-advancing-spatial-temporal) <br>
24
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/video-question-answering-on-mvbench)](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=videollama-2-advancing-spatial-temporal) <br>
25
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-video-mme-1)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-video-mme-1?p=videollama-2-advancing-spatial-temporal) <br>
26
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-video-mme)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-video-mme?p=videollama-2-advancing-spatial-temporal) <br>
27
-
28
- <details open><summary>💡 Some other multimodal-LLM projects from our team may interest you ✨. </summary><p>
29
- <!-- may -->
30
-
31
- > [**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**](https://github.com/DAMO-NLP-SG/Video-LLaMA) <br>
32
- > Hang Zhang, Xin Li, Lidong Bing <br>
33
- [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/Video-LLaMA) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/Video-LLaMA.svg?style=social)](https://github.com/DAMO-NLP-SG/Video-LLaMA) [![arXiv](https://img.shields.io/badge/Arxiv-2306.02858-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2306.02858) <br>
34
-
35
- > [**VCD: Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding**](https://arxiv.org/abs/2311.16922) <br>
36
- > Sicong Leng, Hang Zhang, Guanzheng Chen, Xin Li, Shijian Lu, Chunyan Miao, Lidong Bing <br>
37
- [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/VCD) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/VCD.svg?style=social)](https://github.com/DAMO-NLP-SG/VCD) [![arXiv](https://img.shields.io/badge/Arxiv-2311.16922-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.16922) <br>
38
-
39
- > [**The Curse of Multi-Modalities: Evaluating Hallucinations of Large Multimodal Models across Language, Visual, and Audio**](https://arxiv.org/abs/2410.12787) <br>
40
- > Sicong Leng, Yun Xing, Zesen Cheng, Yang Zhou, Hang Zhang, Xin Li, Deli Zhao, Shijian Lu, Chunyan Miao, Lidong Bing <br>
41
- [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/CMM) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/CMM.svg?style=social)](https://github.com/DAMO-NLP-SG/CMM) [![arXiv](https://img.shields.io/badge/Arxiv-2410.12787-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2410.12787) <br>
42
-
43
- </p></details>
44
-
45
- <div align="center"><video src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/assets/18526640/e0e7951c-f392-42ed-afad-b2c7984d3e38" width="800"></div>
46
-
47
-
48
- ## 📰 News
49
- * **[2024.10.22]** Release checkpoints of [VideoLLaMA2.1-7B-AV](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-AV).
50
- * **[2024.10.15]** Release checkpoints of [VideoLLaMA2.1-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base) and [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F).
51
- * **[2024.08.14]** Release checkpoints of [VideoLLaMA2-72B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B-Base) and [VideoLLaMA2-72B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B).
52
- * **[2024.07.30]** Release checkpoints of [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) and [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B).
53
- * **[2024.06.25]** 🔥🔥 As of Jun 25, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [MLVU Leaderboard](https://github.com/JUNJIE99/MLVU?tab=readme-ov-file#trophy-mini-leaderboard).
54
- * **[2024.06.18]** 🔥🔥 As of Jun 18, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [VideoMME Leaderboard](https://video-mme.github.io/home_page.html#leaderboard).
55
- * **[2024.06.17]** 👋👋 Update technical report with the latest results and the missing references. If you have works closely related to VideoLLaMA 2 but not mentioned in the paper, feel free to let us know.
56
- * **[2024.06.14]** 🔥🔥 [Online Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2) is available.
57
- * **[2024.06.03]** Release training, evaluation, and serving codes of VideoLLaMA 2.
58
-
59
-
60
- <img src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/assets/18526640/b9faf24f-bdd2-4728-9385-acea17ea086d" width="800" />
61
-
62
- ## 🛠️ Requirements and Installation
63
- Basic Dependencies:
64
- * Python >= 3.8
65
- * Pytorch >= 2.2.0
66
- * CUDA Version >= 11.8
67
- * transformers == 4.40.0 (for reproducing paper results)
68
- * tokenizers == 0.19.1
69
-
70
- **[Online Mode]** Install required packages (better for development):
71
- ```bash
72
- git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
73
- cd VideoLLaMA2
74
- git checkout audio_visual
75
- pip install -r requirements.txt
76
- pip install flash-attn==2.5.8 --no-build-isolation
77
- pip install opencv-python==4.5.5.64
78
- apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
79
- ```
80
-
81
- **[Offline Mode]** Install VideoLLaMA2 as a Python package (better for direct use):
82
- ```bash
83
- git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
84
- cd VideoLLaMA2
85
- git checkout audio_visual
86
- pip install --upgrade pip # enable PEP 660 support
87
- pip install -e .
88
- pip install flash-attn==2.5.8 --no-build-isolation
89
- pip install opencv-python==4.5.5.64
90
- apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
91
- ```
92
-
93
- ## 🚀 Main Results
94
-
95
- ### Multi-Choice Video QA & Video Captioning
96
- <p><img src="https://github.com/user-attachments/assets/e87fe4cf-07ea-4fde-998b-a0c63671c3b4" width="800"/></p>
97
-
98
- ### Open-Ended Video QA
99
- <p><img src="https://github.com/user-attachments/assets/80b16c04-75ac-43b8-bc22-6952fdf994bb" width="800"/></p>
100
-
101
- ### Audio QA
102
- <p><img src="https://github.com/user-attachments/assets/46e55952-5a54-4564-bcd4-cfa4edd7f36a" width="800"/></p>
103
-
104
- ### Audio-Visual QA
105
- <p><img src="https://github.com/user-attachments/assets/8114c1e3-7f93-401b-9ea6-9ce7c96d7b05" width="800"/></p>
106
-
107
-
108
- ## :earth_americas: Model Zoo
109
- ### Vision-only Checkpoints
110
- | Model Name | Model Type | Visual Encoder | Language Decoder | # Training Frames |
111
- |:----------------|:------------:|:----------------|:------------------|:----------------:|
112
- | [VideoLLaMA2-7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 8 |
113
- | [VideoLLaMA2-7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 8 |
114
- | [VideoLLaMA2-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
115
- | [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
116
- | [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
117
- | [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
118
- | [VideoLLaMA2-72B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 |
119
- | [VideoLLaMA2-72B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 |
120
- | [VideoLLaMA2.1-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base) | Base | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 16 |
121
- | [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F) | Chat | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 16 |
122
-
123
- ### Audio-Visual Checkpoints
124
- | Model Name | Type | Audio Encoder | Language Decoder |
125
- |:-------------------|:----------------|:----------------|:------------------|
126
- | [VideoLLaMA2.1-7B-AV](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-AV) | Chat | [Fine-tuned BEATs_iter3+(AS2M)(cpt2)](https://1drv.ms/u/s!AqeByhGUtINrgcpj8ujXH1YUtxooEg?e=E9Ncea) | [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F) |
127
-
128
-
129
- ## [🤗 Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2-AV)
130
-
131
- It is highly recommended to try our [online demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2-AV) first.
132
-
133
- To run the video-LLM web demo on your own device, first prepare the required model checkpoints, then follow the steps below to launch the demo.
134
-
135
- ### Single-model Version
136
-
137
- * Launch a gradio app directly ([VideoLLaMA2.1-7B-AV](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-AV) is adopted by default):
138
- ```bash
139
- python videollama2/serve/gradio_web_server_adhoc_av.py
140
- ```
141
-
142
- ## 🗝️ Training & Evaluation
143
-
144
- ### Quick Start
145
-
146
- To facilitate further development on top of our codebase, we provide a quick-start guide on how to train a customized [VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2) on the [VideoLLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) dataset and evaluate the trained model on mainstream video-LLM benchmarks.
147
-
148
- 1. Training Data Structure:
149
- Follow the [main branch](https://github.com/DAMO-NLP-SG/VideoLLaMA2/tree/main) of this VideoLLaMA2 codebase.
150
- 2. Command:
151
- ```bash
152
- # VideoLLaMA2.1-audio pretraining
153
- bash scripts/custom/pretrain_audio.sh
154
- # VideoLLaMA2.1-audio finetuning
155
- bash scripts/custom/finetune_audio.sh
156
- # VideoLLaMA2.1-audio_visual finetuning
157
- bash scripts/custom/va_joint.sh
158
- ```
159
- 3. Evaluation Data Structure:
160
- Follow the [main branch](https://github.com/DAMO-NLP-SG/VideoLLaMA2/tree/main) of this VideoLLaMA2 codebase.
161
-
162
- 4. Command:
163
- ```bash
164
- # ClothoAQA.sh evaluation
165
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_clothoAQA.sh
166
- # TUT2017 evaluation
167
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_TUT2017.sh
168
- # VocalSound evaluation
169
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_vocalsound.sh
170
- # AVQA_music evaluation
171
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_video_AVQA.sh
172
- # AVSD evaluation (need to set azure openai key/endpoint/deployname)
173
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_video_AVSD.sh
174
- # AVSSD evaluation (need to set azure openai key/endpoint/deployname)
175
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_audio_video_AVSSD.sh
176
- ```
177
-
178
- ### Data Format
179
-
180
- To train a video-LLM on your own data, follow the procedure below to prepare the audio/video/image SFT data:
181
-
182
- 1. Suppose your data structure is like:
183
- ```bash
184
- VideoLLaMA2
185
- ├── datasets
186
- │ ├── custom_sft
187
- │ | ├── audio
188
- │ | ├── video
189
- │ | ├── image
190
- | | └── custom.json
191
- ```
192
- 2. Then you should re-organize the annotated audio/video/image sft data according to the following format:
193
- ```json
194
- [
195
- {
196
- "id": 0,
197
- "audio": "audio/xxx.wav",
198
- "conversations": [
199
- {
200
- "from": "human",
201
- "value": "<audio>\nPlease describe the sound event within the audio."
202
- },
203
- {
204
- "from": "gpt",
205
- "value": "Loud television static dips in and out of focus."
206
- },
207
- ...
208
- ],
209
- }
210
- {
211
- "id": 1,
212
- "image": "images/xxx.jpg",
213
- "conversations": [
214
- {
215
- "from": "human",
216
- "value": "<image>\nWhat are the colors of the bus in the image?"
217
- },
218
- {
219
- "from": "gpt",
220
- "value": "The bus in the image is white and red."
221
- },
222
- ...
223
- ],
224
- }
225
- {
226
- "id": 2,
227
- "video": "videos/xxx.mp4",
228
- "conversations": [
229
- {
230
- "from": "human",
231
- "value": "<video>\nWhat are the main activities that take place in the video?"
232
- },
233
- {
234
- "from": "gpt",
235
- "value": "The main activities that take place in the video are the preparation of camera equipment by a man, a group of men riding a helicopter, and a man sailing a boat through the water."
236
- },
237
- ...
238
- ],
239
- },
240
- ...
241
- ]
242
- ```
243
- 3. Modify the `scripts/custom/finetune_audio.sh`:
244
- ```bash
245
- ...
246
- --data_path datasets/custom_sft/custom.json
247
- --data_folder datasets/custom_sft/
248
- --pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2.1-7B-16F)
249
- ...
250
- ```
251
- 4. Modify the `scripts/custom/va_joint.sh`:
252
- ```bash
253
- ...
254
- --data_path datasets/custom_sft/custom.json
255
- --data_folder datasets/custom_sft/
256
- --pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2.1-7B-16F)
257
- ...
258
- ```
259
-
260
- ## 🤖 Inference
261
-
262
- Audio/Video-Audio Inference:
263
- ```python
264
- import sys
265
- sys.path.append('./')
266
- from videollama2 import model_init, mm_infer
267
- from videollama2.utils import disable_torch_init
268
- import argparse
269
-
270
- def inference(args):
271
-
272
- model_path = args.model_path
273
- model, processor, tokenizer = model_init(model_path)
274
-
275
- if args.modal_type == "a":
276
- model.model.vision_tower = None
277
- elif args.modal_type == "v":
278
- model.model.audio_tower = None
279
- elif args.modal_type == "av":
280
- pass
281
- else:
282
- raise NotImplementedError
283
- # Audio-visual Inference
284
- audio_video_path = "assets/00000368.mp4"
285
- preprocess = processor['audio' if args.modal_type == "a" else "video"]
286
- if args.modal_type == "a":
287
- audio_video_tensor = preprocess(audio_video_path)
288
- else:
289
- audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
290
- question = f"Who plays the instrument louder?"
291
-
292
- # Audio Inference
293
- audio_video_path = "assets/bird-twitter-car.wav"
294
- preprocess = processor['audio' if args.modal_type == "a" else "video"]
295
- if args.modal_type == "a":
296
- audio_video_tensor = preprocess(audio_video_path)
297
- else:
298
- audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
299
- question = f"Please describe the audio:"
300
-
301
- # Video Inference
302
- audio_video_path = "assets/output_v_1jgsRbGzCls.mp4"
303
- preprocess = processor['audio' if args.modal_type == "a" else "video"]
304
- if args.modal_type == "a":
305
- audio_video_tensor = preprocess(audio_video_path)
306
- else:
307
- audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
308
- question = f"What activity are the people practicing in the video?"
309
-
310
- output = mm_infer(
311
- audio_video_tensor,
312
- question,
313
- model=model,
314
- tokenizer=tokenizer,
315
- modal='audio' if args.modal_type == "a" else "video",
316
- do_sample=False,
317
- )
318
-
319
- print(output)
320
-
321
-
322
- if __name__ == "__main__":
323
- parser = argparse.ArgumentParser()
324
-
325
- parser.add_argument('--model-path', help='', required=True)
326
- parser.add_argument('--modal-type', choices=["a", "v", "av"], help='', required=True)
327
- args = parser.parse_args()
328
-
329
- inference(args)
330
-
331
- ```
332
-
333
- ## 📑 Citation
334
-
335
- If you find VideoLLaMA useful for your research and applications, please cite using this BibTeX:
336
- ```bibtex
337
- @article{damonlpsg2024videollama2,
338
- title={VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs},
339
- author={Cheng, Zesen and Leng, Sicong and Zhang, Hang and Xin, Yifei and Li, Xin and Chen, Guanzheng and Zhu, Yongxin and Zhang, Wenqi and Luo, Ziyang and Zhao, Deli and Bing, Lidong},
340
- journal={arXiv preprint arXiv:2406.07476},
341
- year={2024},
342
- url = {https://arxiv.org/abs/2406.07476}
343
- }
344
-
345
- @article{damonlpsg2023videollama,
346
- title = {Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding},
347
- author = {Zhang, Hang and Li, Xin and Bing, Lidong},
348
- journal = {arXiv preprint arXiv:2306.02858},
349
- year = {2023},
350
- url = {https://arxiv.org/abs/2306.02858}
351
- }
352
- ```
353
-
354
- ## 👍 Acknowledgement
355
- The codebase of VideoLLaMA 2 is adapted from [**LLaVA 1.5**](https://github.com/haotian-liu/LLaVA) and [**FastChat**](https://github.com/lm-sys/FastChat). We are also grateful to the following projects that VideoLLaMA 2 builds on:
356
- * [**LLaMA 2**](https://github.com/meta-llama/llama), [**Mistral-7B**](https://mistral.ai/news/announcing-mistral-7b/), [**OpenAI CLIP**](https://openai.com/index/clip/), [**Honeybee**](https://github.com/kakaobrain/honeybee).
357
- * [**Video-ChatGPT**](https://github.com/mbzuai-oryx/Video-ChatGPT), [**Video-LLaVA**](https://github.com/PKU-YuanGroup/Video-LLaVA).
358
- * [**WebVid**](https://github.com/m-bain/webvid), [**Panda-70M**](https://github.com/snap-research/Panda-70M), [**LanguageBind**](https://github.com/PKU-YuanGroup/LanguageBind), [**InternVid**](https://github.com/OpenGVLab/InternVideo/tree/main/Data/InternVid).
359
- * [**VideoChat2**](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2), [**Valley**](https://github.com/RupertLuo/Valley), [**VTimeLLM**](https://github.com/huangb23/VTimeLLM), [**ShareGPT4V**](https://sharegpt4v.github.io/).
360
-
361
-
362
- ## 🔒 License
363
-
364
- This project is released under the Apache 2.0 license as found in the LICENSE file.
365
- The service is a research preview intended for **non-commercial use ONLY**, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
third_party/VideoLLaMA2/pyproject.toml DELETED
@@ -1,41 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=61.0"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "videollama2"
7
- version = "1.0"
8
- description = "Release of VideoLLaMA2"
9
- readme = "README.md"
10
- requires-python = ">=3.8"
11
- classifiers = [
12
- "Programming Language :: Python :: 3",
13
- "License :: OSI Approved :: Apache Software License",
14
- ]
15
- dependencies = [
16
- "torch==2.2.0", "torchvision==0.17.0", "torchaudio==2.2.0", "librosa",
17
- "transformers==4.42.3", "tokenizers==0.19.1",
18
- "deepspeed==0.13.1", "accelerate==0.26.1",
19
- "peft==0.4.0", "timm==1.0.3", "numpy==1.24.4",
20
- "decord==0.6.0", "imageio==2.34.0", "imageio-ffmpeg==0.4.9",
21
- "moviepy==1.0.3", "scenedetect==0.6.3",
22
- "opencv-python==4.6.0.66", "pysubs2",
23
- "scikit-learn==1.2.2", "huggingface_hub==0.23.4", "sentencepiece==0.1.99",
24
- "shortuuid", "einops==0.6.1", "einops-exts==0.0.4",
25
- "bitsandbytes==0.43.0", "pydantic>=2.0", "markdown2[all]",
26
- "gradio==3.50.0", "gradio_client==0.6.1", "httpx==0.24.1",
27
- "requests", "openai", "uvicorn", "fastapi", "tensorboard", "wandb", "tabulate"
28
- ]
29
-
30
- [project.optional-dependencies]
31
- train = ["ninja"]
32
-
33
- [project.urls]
34
- "Homepage" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2"
35
- "Bug Tracker" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues"
36
-
37
- [tool.setuptools.packages.find]
38
- exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
39
-
40
- [tool.wheel]
41
- exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
third_party/VideoLLaMA2/requirements.txt DELETED
@@ -1,42 +0,0 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu118
2
- # basic dependencies
3
- torch==2.2.0
4
- torchaudio==2.2.0
5
- torchvision==0.17.0
6
- transformers==4.42.3
7
- tokenizers==0.19.1
8
- deepspeed==0.13.1
9
- accelerate==0.26.1
10
- peft==0.4.0
11
- timm==1.0.3
12
- numpy==1.24.4
13
- # data processing
14
- decord==0.6.0
15
- imageio==2.34.0
16
- imageio-ffmpeg==0.4.9
17
- moviepy==1.0.3
18
- scenedetect==0.6.3
19
- opencv-python==4.6.0.66
20
- pysubs2
21
- librosa
22
- pytorchvideo
23
- # misc
24
- scikit-learn==1.2.2
25
- huggingface_hub==0.23.4
26
- sentencepiece==0.1.99
27
- shortuuid
28
- einops==0.6.1
29
- einops-exts==0.0.4
30
- bitsandbytes==0.43.0
31
- pydantic>=2.0
32
- markdown2[all]
33
- gradio==3.50.0
34
- gradio_client==0.6.1
35
- httpx==0.24.1
36
- openai==1.33.0
37
- requests
38
- uvicorn
39
- fastapi
40
- tensorboard
41
- wandb
42
- tabulate
third_party/VideoLLaMA2/scripts/custom/finetune.sh DELETED
@@ -1,73 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=${3:-0}
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
- RUN_NAME=siglip_tcv35_7b_16f
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train.py \
42
- --deepspeed scripts/zero3.json \
43
- --model_type videollama2_qwen2 \
44
- --model_path Qwen/Qwen2-7B-Instruct \
45
- --vision_tower google/siglip-so400m-patch14-384 \
46
- --mm_projector_type stc_connector_v35 \
47
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
48
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
- --data_folder ${DATA_DIR}/videollava_sft/ \
50
- --mm_vision_select_layer -2 \
51
- --image_aspect_ratio pad \
52
- --num_frames 16 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --save_strategy "steps" \
62
- --save_steps 500 \
63
- --save_total_limit 99 \
64
- --learning_rate 2e-5 \
65
- --weight_decay 0. \
66
- --warmup_ratio 0.03 \
67
- --lr_scheduler_type "cosine" \
68
- --logging_steps 1 \
69
- --model_max_length 2048 \
70
- --gradient_checkpointing True \
71
- --dataloader_num_workers 4 \
72
- --report_to tensorboard \
73
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/finetune_audio.sh DELETED
@@ -1,72 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=audio_stage2_qwen2
32
- RUN_NAME=audio_stage2_qwen2
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
- torchrun --nnodes $WORLD_SIZE \
36
- --nproc_per_node $NPROC_PER_NODE \
37
- --master_addr=$MASTER_ADDR \
38
- --master_port=$MASTER_PORT \
39
- --node_rank $RANK \
40
- videollama2/train.py \
41
- --deepspeed scripts/zero2.json \
42
- --model_type videollama2_qwen2 \
43
- --model_path DAMO-NLP-SG/VideoLLaMA2.1-7B-16F \
44
- --data_path_a ${DATA_DIR}/stage2_audio_text.json \
45
- --audio_tower ./BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt \
46
- --pretrain_mm_mlp_adapter_a $OUTP_DIR/mm_projector_a.bin \
47
- --mm_projector_a_type mlp2x_gelu \
48
- --tune_mm_mlp_adapter_a True \
49
- --tune_audio_tower True \
50
- --bf16 True \
51
- --tf32 True \
52
- --fp16 False \
53
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
54
- --num_train_epochs 2 \
55
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
56
- --per_device_eval_batch_size 4 \
57
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
58
- --evaluation_strategy "no" \
59
- --save_strategy "steps" \
60
- --save_steps 2000 \
61
- --save_total_limit 2 \
62
- --learning_rate 2e-5 \
63
- --weight_decay 0. \
64
- --warmup_ratio 0.03 \
65
- --lr_scheduler_type "cosine" \
66
- --logging_steps 1 \
67
- --model_max_length 2048 \
68
- --gradient_checkpointing True \
69
- --dataloader_num_workers 4 \
70
- --lazy_preprocess True \
71
- --report_to tensorboard \
72
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/finetune_lora.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=${3:-0}
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
- RUN_NAME=siglip_tcv35_7b_16f_lora
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train.py \
42
- --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_qwen2 \
45
- --model_path Qwen/Qwen2-7B-Instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 16 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 1 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 2e-5 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --report_to tensorboard \
74
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/finetune_qlora.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=${3:-0}
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
- RUN_NAME=siglip_tcv35_7b_16f_qlora
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train.py \
42
- --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --bits 4 \
43
- --deepspeed scripts/zero2.json \
44
- --model_type videollama2_qwen2 \
45
- --model_path Qwen/Qwen2-7B-Instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 16 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 1 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 2e-5 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --report_to tensorboard \
74
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/pretrain_audio.sh DELETED
@@ -1,70 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
- # Multiple conditions
10
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
11
- WORLD_SIZE=$ARG_WORLD_SIZE
12
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
13
- fi
14
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
15
- MASTER_ADDR=$ARG_MASTER_ADDR
16
- MASTER_PORT=$ARG_MASTER_PORT
17
- RANK=$ARG_RANK
18
- fi
19
-
20
- echo "WORLD_SIZE: $WORLD_SIZE"
21
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
22
-
23
- # Training Arguments
24
- GLOBAL_BATCH_SIZE=1024
25
- LOCAL_BATCH_SIZE=32
26
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
27
-
28
- # Log Arguments
29
- export TRANSFORMERS_OFFLINE=1
30
- export WANDB_PROJECT=videollama2qwen2_audio_stage1
31
- RUN_NAME=videollama2qwen2_audio_stage1
32
- DATA_DIR=datasets
33
- OUTP_DIR=work_dirs
34
- torchrun --nnodes $WORLD_SIZE \
35
- --nproc_per_node $NPROC_PER_NODE \
36
- --master_addr=$MASTER_ADDR \
37
- --master_port=$MASTER_PORT \
38
- --node_rank $RANK \
39
- videollama2/train.py \
40
- --deepspeed scripts/zero2.json \
41
- --model_type videollama2_qwen2 \
42
- --model_path DAMO-NLP-SG/VideoLLaMA2.1-7B-16F \
43
- --data_path_a ${DATA_DIR}/stage1_pretrain.json \
44
- --audio_tower ./BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt \
45
- --mm_projector_a_type mlp2x_gelu \
46
- --tune_mm_mlp_adapter_a True \
47
- --mm_vision_select_layer -1 \
48
- --bf16 True \
49
- --tf32 True \
50
- --fp16 False \
51
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
52
- --num_train_epochs 1 \
53
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
54
- --per_device_eval_batch_size 4 \
55
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
56
- --evaluation_strategy "no" \
57
- --save_strategy "steps" \
58
- --save_steps 1000 \
59
- --save_total_limit 1 \
60
- --learning_rate 1e-3 \
61
- --weight_decay 0. \
62
- --warmup_ratio 0.03 \
63
- --lr_scheduler_type "cosine" \
64
- --logging_steps 1 \
65
- --model_max_length 2048 \
66
- --gradient_checkpointing True \
67
- --dataloader_num_workers 4 \
68
- --lazy_preprocess True \
69
- --report_to tensorboard \
70
- --run_name pretrain_$RUN_NAME \
third_party/VideoLLaMA2/scripts/custom/va_joint.sh DELETED
@@ -1,80 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=audio_visual_stage3_qwen2
32
- RUN_NAME=audio_visual_stage3_qwen2
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
- torchrun --nnodes $WORLD_SIZE \
36
- --nproc_per_node $NPROC_PER_NODE \
37
- --master_addr=$MASTER_ADDR \
38
- --master_port=$MASTER_PORT \
39
- --node_rank $RANK \
40
- videollama2/train.py \
41
- --deepspeed scripts/zero2.json \
42
- --model_type videollama2_qwen2 \
43
- --model_path DAMO-NLP-SG/VideoLLaMA2.1-7B-16F \
44
- --data_folder ${DATA_DIR} \
45
- --data_path ${DATA_DIR}/stage3_video_audio.json,${DATA_DIR}/stage2_audio_subset_new.json,${DATA_DIR}/stage2_video_subset.json \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --audio_tower $OUTP_DIR/audio_tower.bin \
48
- --pretrain_mm_mlp_adapter_a $OUTP_DIR/mm_projector_a.bin \
49
- --mm_projector_type stc_connector_v35 \
50
- --mm_projector_a_type mlp2x_gelu \
51
- --va True \
52
- --tune_audio_tower True \
53
- --tune_adapter_llm True \
54
- --tune_mm_mlp_adapter_a True \
55
- --mm_vision_select_layer -2 \
56
- --image_aspect_ratio pad \
57
- --num_frames 16 \
58
- --bf16 True \
59
- --tf32 True \
60
- --fp16 False \
61
- --output_dir $OUTP_DIR/${WANDB_PROJECT}/VideoLLaMA2.1-7B-AV \
62
- --num_train_epochs 2 \
63
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
64
- --per_device_eval_batch_size 4 \
65
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
66
- --evaluation_strategy "no" \
67
- --save_strategy "steps" \
68
- --save_steps 2000 \
69
- --save_total_limit 2 \
70
- --learning_rate 2e-5 \
71
- --weight_decay 0. \
72
- --warmup_ratio 0.03 \
73
- --lr_scheduler_type "cosine" \
74
- --logging_steps 1 \
75
- --model_max_length 2048 \
76
- --gradient_checkpointing True \
77
- --dataloader_num_workers 4 \
78
- --lazy_preprocess True \
79
- --report_to tensorboard \
80
- --run_name $RUN_NAME \
third_party/VideoLLaMA2/scripts/eval/eval_audio_TUT2017.sh DELETED
@@ -1,44 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/TUT2017/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio.py \
22
- --model-path ${CKPT} \
23
- --dataset TUT2017 \
24
- --video-folder ${EVAL_DATA_DIR}/TUT2017 \
25
- --question-file ${EVAL_DATA_DIR}/TUT2017/tut2017_eval.jsonl \
26
- --answer-file ${EVAL_DATA_DIR}/TUT2017/tut2017_eval.jsonl \
27
- --output-file ${OUTPUT_DIR}/TUT2017/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX &
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/TUT2017/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python videollama2/eval/eval_audio_TUT2017.py \
44
- --pred-path ${output_file}
third_party/VideoLLaMA2/scripts/eval/eval_audio_clothoAQA.sh DELETED
@@ -1,45 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
-
7
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
8
-
9
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
10
- IFS=',' read -ra GPULIST <<< "$gpu_list"
11
-
12
- # divide data via the number of GPUs per task
13
- GPUS_PER_TASK=1
14
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
15
-
16
- output_file=${OUTPUT_DIR}/clothoAQA/answers/${CKPT_NAME}/merge.json
17
-
18
- if [ ! -f "$output_file" ]; then
19
- for IDX in $(seq 0 $((CHUNKS-1))); do
20
- # select the GPUs for the task
21
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
22
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio.py \
23
- --model-path ${CKPT} \
24
- --dataset clothoAQA \
25
- --video-folder ${EVAL_DATA_DIR}/ClothoAQA/audio_files \
26
- --question-file ${EVAL_DATA_DIR}/clothoAQA_eval.json \
27
- --answer-file ${EVAL_DATA_DIR}/clothoAQA_eval.json \
28
- --output-file ${OUTPUT_DIR}/clothoAQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
29
- --num-chunks $CHUNKS \
30
- --chunk-idx $IDX &
31
- done
32
-
33
- wait
34
-
35
- # Clear out the output file if it exists.
36
- > "$output_file"
37
-
38
- #Loop through the indices and concatenate each file.
39
- for IDX in $(seq 0 $((CHUNKS-1))); do
40
- cat ${OUTPUT_DIR}/clothoAQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
41
- done
42
- fi
43
-
44
- python videollama2/eval/eval_audio_clothoAQA.py \
45
- --pred-path ${output_file}
third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVQA.sh DELETED
@@ -1,44 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/AVQA/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio_video.py \
22
- --model-path ${CKPT} \
23
- --dataset AVQA \
24
- --video-folder ${EVAL_DATA_DIR}/AVQA_music/MUSIC-AVQA-videos \
25
- --question-file ${EVAL_DATA_DIR}/AVQA_music/AVQA_music_test.json \
26
- --answer-file ${EVAL_DATA_DIR}/AVQA_music/AVQA_music_test.json \
27
- --output-file ${OUTPUT_DIR}/AVQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/AVQA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python3 videollama2/eval/eval_audio_video_AVQA.py \
44
- --pred-path ${output_file}
third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSD.sh DELETED
@@ -1,47 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/AVSD/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio_video.py \
22
- --model-path ${CKPT} \
23
- --dataset AVSD \
24
- --video-folder ${EVAL_DATA_DIR}/AVSD/Charades_v1_480 \
25
- --question-file ${EVAL_DATA_DIR}/AVSD/instruction_val.json \
26
- --answer-file ${EVAL_DATA_DIR}/AVSD/instruction_val.json \
27
- --output-file ${OUTPUT_DIR}/AVSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/AVSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python videollama2/eval/eval_audio_video_AVSD.py \
44
- --pred-path /mnt/data/xyf/VideoLLaMA2_backup/eval_output/AVSD/answers/vlb_audio_visual_stage3_tuning_projector_beats_qwen2_videollm_ep2/merge.json \
45
- --api-key $AZURE_API_KEY \
46
- --api-endpoint $AZURE_API_ENDPOINT \
47
- --api-deployname $AZURE_API_DEPLOYNAME
third_party/VideoLLaMA2/scripts/eval/eval_audio_video_AVSSD.sh DELETED
@@ -1,47 +0,0 @@
1
- set -x
2
-
3
- EVAL_DATA_DIR=eval
4
- OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
6
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
-
8
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
9
- IFS=',' read -ra GPULIST <<< "$gpu_list"
10
-
11
- # divide data via the number of GPUs per task
12
- GPUS_PER_TASK=1
13
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
-
15
- output_file=${OUTPUT_DIR}/AVSSD/answers/${CKPT_NAME}/merge.json
16
-
17
- if [ ! -f "$output_file" ]; then
18
- for IDX in $(seq 0 $((CHUNKS-1))); do
19
- # select the GPUs for the task
20
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio_video.py \
22
- --model-path ${CKPT} \
23
- --dataset AVSSD \
24
- --video-folder ${EVAL_DATA_DIR}/VGGSound_final/video \
25
- --question-file ${EVAL_DATA_DIR}/avssd_test.json \
26
- --answer-file ${EVAL_DATA_DIR}/avssd_test.json \
27
- --output-file ${OUTPUT_DIR}/AVSSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
28
- --num-chunks $CHUNKS \
29
- --chunk-idx $IDX
30
- done
31
-
32
- wait
33
-
34
- # Clear out the output file if it exists.
35
- > "$output_file"
36
-
37
- #Loop through the indices and concatenate each file.
38
- for IDX in $(seq 0 $((CHUNKS-1))); do
39
- cat ${OUTPUT_DIR}/AVSSD/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
40
- done
41
- fi
42
-
43
- python videollama2/eval/eval_audio_video_AVSSD.py \
44
- --pred-path ${output_file} \
45
- --api-key f68a11a54a064caa851e290258d52cce \
46
- --api-endpoint https://vl-australiaeast.openai.azure.com/ \
47
- --api-deployname gpt35-turbo-0613
third_party/VideoLLaMA2/scripts/eval/eval_audio_vocalsound.sh DELETED
@@ -1,44 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-AV
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/vocalsound/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_audio.py \
- --model-path ${CKPT} \
- --dataset vocalsound \
- --video-folder ${EVAL_DATA_DIR}/vocal/audio_16k \
- --question-file ${EVAL_DATA_DIR}/vocal/vocalsound_eval.jsonl \
- --answer-file ${EVAL_DATA_DIR}/vocal/vocalsound_eval.jsonl \
- --output-file ${OUTPUT_DIR}/vocalsound/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/vocalsound/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
- python videollama2/eval/eval_audio_vocalsound.py \
- --pred-path ${output_file}
 
third_party/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh DELETED
@@ -1,67 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/merge.json
-
- # judge if the number of json lines is 0
- if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
- rm -f ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/*.json
- fi
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_cap_msvc.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/msvc \
- --question-file ${EVAL_DATA_DIR}/msvc/msvc.json \
- --output-file ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_cap_msvc_correctness.py \
- --pred-path $output_file \
- --output-dir ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/correctness_gpt \
- --output-json ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/correctness_results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4 \
-
- python3 videollama2/eval/eval_video_cap_msvc_detailedness.py \
- --pred-path $output_file \
- --output-dir ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/detailedness_gpt \
- --output-json ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/detailedness_results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4 \
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh DELETED
@@ -1,41 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/merge.csv
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_egoschema.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/egoschema/good_clips_git \
- --question-file ${EVAL_DATA_DIR}/egoschema/questions.json \
- --answer-file ${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.csv \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- echo 'q_uid, answer' >> "$output_file"
-
- # Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.csv >> "$output_file"
- done
- fi
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh DELETED
@@ -1,46 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/merge.json
-
- # judge if the number of json lines is 0
- if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
- rm -f ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/*.json
- fi
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_mvbench.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/mvbench/video \
- --question-file ${EVAL_DATA_DIR}/mvbench/json \
- --answer-file ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- # Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
- python3 videollama2/eval/eval_video_mcqa_mvbench.py \
- --pred_path ${output_file} \
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh DELETED
@@ -1,45 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_perception_test_mcqa.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/perception_test_mcqa/videos \
- --question-file ${EVAL_DATA_DIR}/perception_test_mcqa/mc_question_test.json \
- --answer-file ${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- echo "{" >> "$output_file"
-
- # Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- sed -i '$s/.$//' $output_file
-
- echo "}" >> "$output_file"
- fi
 
third_party/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh DELETED
@@ -1,84 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/merge.json
- output_sub_file=${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/merge_sub.json
-
- # judge if the number of json lines is 0
- if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
- rm -f ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/*.json
- fi
-
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_videomme.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videomme/videos \
- --subtitle-folder ${EVAL_DATA_DIR}/videomme/subtitles \
- --question-file ${EVAL_DATA_DIR}/videomme/test-00000-of-00001.parquet \
- --answer-file ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- echo "[" >> "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- sed -i '$s/.$//' $output_file
-
- echo "]" >> "$output_file"
-
- # Clear out the output file if it exists.
- > "$output_sub_file"
-
- echo "[" >> "$output_sub_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}_sub.json >> "$output_sub_file"
- done
-
- sed -i '$s/.$//' $output_sub_file
-
- echo "]" >> "$output_sub_file"
- fi
-
-
- python videollama2/eval/eval_video_mcqa_videomme.py \
- --results_file $output_file \
- --video_duration_type "short,medium,long" \
- --return_categories_accuracy \
- --return_sub_categories_accuracy \
- --return_task_types_accuracy \
- --skip_missing \
-
- python videollama2/eval/eval_video_mcqa_videomme.py \
- --results_file $output_sub_file \
- --video_duration_type "short,medium,long" \
- --return_categories_accuracy \
- --return_sub_categories_accuracy \
- --return_task_types_accuracy \
- --skip_missing \
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_activitynet.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_activitynet.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/all_test \
- --question-file ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/test_q.json \
- --answer-file ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/test_a.json \
- --output-file ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_activitynet.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_msvd.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_activitynet.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/videos \
- --question-file ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/test_q.json \
- --answer-file ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/test_a.json \
- --output-file ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_activitynet.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh DELETED
@@ -1,58 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh DELETED
@@ -1,58 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/run_inference_video_qa_gpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh DELETED
@@ -1,58 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
-
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/run_inference_video_qa_gpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
-
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}
- mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
- cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_3_context.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/merge.json
-
- # if output_file not exists then inference
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/temporal_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh DELETED
@@ -1,54 +0,0 @@
- set -x
-
- EVAL_DATA_DIR=eval
- OUTPUT_DIR=eval_output
- CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
- CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
-
- gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
- IFS=',' read -ra GPULIST <<< "$gpu_list"
-
- # divide data via the number of GPUs per task
- GPUS_PER_TASK=1
- CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
-
- output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/merge.json
-
- # if output_file not exists then inference
- if [ ! -f "$output_file" ]; then
- for IDX in $(seq 0 $((CHUNKS-1))); do
- # select the GPUs for the task
- gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_consistency.py \
- --model-path ${CKPT} \
- --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
- --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/consistency_qa.json \
- --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
- --num-chunks $CHUNKS \
- --chunk-idx $IDX &
- done
-
- wait
-
- # Clear out the output file if it exists.
- > "$output_file"
-
- #Loop through the indices and concatenate each file.
- for IDX in $(seq 0 $((CHUNKS-1))); do
- cat ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
- done
- fi
-
-
- AZURE_API_KEY=your_key
- AZURE_API_ENDPOINT=your_endpoint
- AZURE_API_DEPLOYNAME=your_deployname
-
- python3 videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py \
- --pred-path ${output_file} \
- --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/gpt \
- --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/results.json \
- --api-key $AZURE_API_KEY \
- --api-endpoint $AZURE_API_ENDPOINT \
- --api-deployname $AZURE_API_DEPLOYNAME \
- --num-tasks 4
 
third_party/VideoLLaMA2/scripts/vllava/finetune.sh DELETED
@@ -1,73 +0,0 @@
- #!/bin/bash
-
- # Environment Variables
- ARG_WORLD_SIZE=${1:-1}
- ARG_NPROC_PER_NODE=${2:-8}
- ARG_MASTER_ADDR="127.0.0.1"
- ARG_MASTER_PORT=16666
- ARG_RANK=${3:-0}
-
- # Multiple conditions
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
- WORLD_SIZE=$ARG_WORLD_SIZE
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
- fi
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
- MASTER_ADDR=$ARG_MASTER_ADDR
- MASTER_PORT=$ARG_MASTER_PORT
- RANK=$ARG_RANK
- fi
-
- echo "WORLD_SIZE: $WORLD_SIZE"
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
-
- # Training Arguments
- GLOBAL_BATCH_SIZE=128
- LOCAL_BATCH_SIZE=4
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
-
- # Log Arguments
- export TRANSFORMERS_OFFLINE=1
- export WANDB_PROJECT=videollama2qwen2_vllava
- RUN_NAME=siglip_tcv35_7b_16f
- DATA_DIR=datasets
- OUTP_DIR=work_dirs
-
- torchrun --nnodes $WORLD_SIZE \
- --nproc_per_node $NPROC_PER_NODE \
- --master_addr=$MASTER_ADDR \
- --master_port=$MASTER_PORT \
- --node_rank $RANK \
- videollama2/train.py \
- --deepspeed scripts/zero3.json \
- --model_type videollama2_qwen2 \
- --model_path Qwen/Qwen2-7B-Instruct \
- --vision_tower google/siglip-so400m-patch14-384 \
- --mm_projector_type stc_connector_v35 \
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
- --data_folder ${DATA_DIR}/videollava_sft/ \
- --mm_vision_select_layer -2 \
- --image_aspect_ratio pad \
- --num_frames 16 \
- --bf16 True \
- --tf32 True \
- --fp16 False \
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
- --num_train_epochs 1 \
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
- --per_device_eval_batch_size 4 \
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
- --save_strategy "steps" \
- --save_steps 500 \
- --save_total_limit 99 \
- --learning_rate 2e-5 \
- --weight_decay 0. \
- --warmup_ratio 0.03 \
- --lr_scheduler_type "cosine" \
- --logging_steps 1 \
- --model_max_length 2048 \
- --gradient_checkpointing True \
- --dataloader_num_workers 4 \
- --report_to tensorboard \
- --run_name $RUN_NAME \
 
third_party/VideoLLaMA2/scripts/vllava/pretrain.sh DELETED
@@ -1,73 +0,0 @@
- #!/bin/bash
-
- # Environment Variables
- ARG_WORLD_SIZE=${1:-1}
- ARG_NPROC_PER_NODE=${2:-8}
- ARG_MASTER_ADDR="127.0.0.1"
- ARG_MASTER_PORT=16666
- ARG_RANK=${3:-0}
-
- # Multiple conditions
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
- WORLD_SIZE=$ARG_WORLD_SIZE
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
- fi
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
- MASTER_ADDR=$ARG_MASTER_ADDR
- MASTER_PORT=$ARG_MASTER_PORT
- RANK=$ARG_RANK
- fi
-
- echo "WORLD_SIZE: $WORLD_SIZE"
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
-
- # Training Arguments
- GLOBAL_BATCH_SIZE=256
- LOCAL_BATCH_SIZE=8
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
-
- # Log Arguments
- export TRANSFORMERS_OFFLINE=1
- export WANDB_PROJECT=videollama2qwen2_vllava
- RUN_NAME=siglip_tcv35_7b_16f
- DATA_DIR=datasets
- OUTP_DIR=work_dirs
-
- torchrun --nnodes $WORLD_SIZE \
- --nproc_per_node $NPROC_PER_NODE \
- --master_addr=$MASTER_ADDR \
- --master_port=$MASTER_PORT \
- --node_rank $RANK \
- videollama2/train.py \
- --deepspeed scripts/zero3.json \
- --model_type videollama2_qwen2 \
- --model_path Qwen/Qwen2-7B-Instruct \
- --vision_tower google/siglip-so400m-patch14-384 \
- --mm_projector_type stc_connector_v35 \
- --tune_mm_mlp_adapter True \
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
- --data_folder ${DATA_DIR}/videollava_pt/ \
- --mm_vision_select_layer -2 \
- --num_frames 16 \
- --bf16 True \
- --tf32 True \
- --fp16 False \
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
- --num_train_epochs 1 \
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
- --per_device_eval_batch_size 4 \
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
- --evaluation_strategy "no" \
- --save_strategy "steps" \
- --save_steps 500 \
- --save_total_limit 99 \
- --learning_rate 1e-3 \
- --weight_decay 0. \
- --warmup_ratio 0.03 \
- --lr_scheduler_type "cosine" \
- --logging_steps 1 \
- --model_max_length 2048 \
- --gradient_checkpointing True \
- --dataloader_num_workers 4 \
- --report_to tensorboard \
- --run_name $RUN_NAME \