# 🌋📹LLaVA-Video: Hugging Face Space (running on ZeroGPU)
import os
import spaces
import gradio as gr
import subprocess  # 🥲

# Install flash-attn at startup. The env flag skips compiling the CUDA kernels
# from source (a prebuilt wheel is used instead); merging os.environ keeps
# PATH and the rest of the environment intact for the subprocess.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
# subprocess.run(
# "pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git",
# shell=True,
# )
import torch
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
import copy
import warnings
from decord import VideoReader, cpu
import numpy as np

# warnings.filterwarnings("ignore")
title = "# 🙋🏻♂️Welcome to 🌟Tonic's 🌋📹LLaVA-Video!"
description1 ="""The **🌋📹LLaVA-Video-7B-Qwen2** is a 7B parameter model trained on the 🌋📹LLaVA-Video-178K dataset and the LLaVA-OneVision dataset. It is [based on the **Qwen2 language model**](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f), supporting a context window of up to 32K tokens. The model can process and interact with images, multi-images, and videos, with specific optimizations for video analysis.
This model leverages the **SO400M vision backbone** for visual input and Qwen2 for language processing, making it highly efficient in multi-modal reasoning, including visual and video-based tasks.
🌋📹LLaVA-Video has larger variants of [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) and [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) and with a [variant](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) only trained on the new synthetic data
For further details, please visit the [Project Page](https://github.com/LLaVA-VL/LLaVA-NeXT) or check out the corresponding [research paper](https://arxiv.org/abs/2410.02713).
- **Architecture**: `LlavaQwenForCausalLM`
- **Attention Heads**: 28
- **Hidden Layers**: 28
- **Hidden Size**: 3584
"""
description2 ="""
- **Intermediate Size**: 18944
- **Max Frames Supported**: 64
- **Languages Supported**: English, Chinese
- **Image Aspect Ratio**: `anyres_max_9`
- **Image Resolution**: Various grid resolutions
- **Max Position Embeddings**: 32,768
- **Vocab Size**: 152,064
- **Model Precision**: bfloat16
- **Hardware Used for Training**: 256 × NVIDIA A100 GPUs
"""
join_us = """
## Join us :
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    """Decode a video and sample up to `max_frames_num` frames from it."""
    if max_frames_num == 0:
        # Match the normal (frames, frame_time, video_time) return contract.
        return np.zeros((1, 336, 336, 3)), "", 0
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    video_time = total_frame_num / vr.get_avg_fps()
    # Take one frame every `stride` source frames to hit the target fps.
    stride = round(vr.get_avg_fps() / fps)
    frame_idx = list(range(0, total_frame_num, stride))
    frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    if len(frame_idx) > max_frames_num or force_sample:
        # Too many frames (or sampling forced): take `max_frames_num`
        # uniformly spaced frames across the whole video instead.
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    frame_time = ",".join([f"{t:.2f}s" for t in frame_time])
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, video_time
# Load the model
pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
model_name = "llava_qwen"
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = "auto"
print("Loading model...")
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)
model.eval()
print("Model loaded successfully!")
@spaces.GPU
def process_video(video_path, question):
    max_frames_num = 64
    video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
    video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
    video = [video]

    conv_template = "qwen_1_5"
    # Tell the model how the frames were sampled so it can reason about timing.
    time_instruction = (
        f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames "
        f"are uniformly sampled from it. These frames are located at {frame_time}. "
        f"Please answer the following questions related to this video."
    )
    full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question}"

    # Build the chat prompt from the Qwen conversation template.
    conv = copy.deepcopy(conv_templates[conv_template])
    conv.append_message(conv.roles[0], full_question)
    conv.append_message(conv.roles[1], None)
    prompt_question = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            images=video,
            modalities=["video"],
            do_sample=False,  # greedy decoding; temperature is ignored
            temperature=0,
            max_new_tokens=4096,
        )
    response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
    return response
def gradio_interface(video_file, question):
    if video_file is None:
        return "Please upload a video file."
    response = process_video(video_file, question)
    return response
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Group():
            gr.Markdown(description1)
        with gr.Group():
            gr.Markdown(description2)
    with gr.Accordion("Join Us", open=False):
        gr.Markdown(join_us)
    with gr.Row():
        with gr.Column():
            video_input = gr.Video()
            question_input = gr.Textbox(label="🙋🏻‍♂️User Question", placeholder="Ask a question about the video...")
            submit_button = gr.Button("Ask🌋📹LLaVA-Video")
        output = gr.Textbox(label="🌋📹LLaVA-Video")

    submit_button.click(
        fn=gradio_interface,
        inputs=[video_input, question_input],
        outputs=output,
    )
if __name__ == "__main__":
    demo.launch(show_error=True, ssr_mode=False)