File size: 2,768 Bytes
e95a3a8
 
5bf65b0
 
e95a3a8
5bf65b0
 
b04464a
e274fdd
e95a3a8
5bf65b0
 
 
 
 
e95a3a8
 
 
 
5bf65b0
 
 
 
 
 
 
 
 
 
 
 
 
 
e95a3a8
5bf65b0
e95a3a8
5bf65b0
 
 
 
 
 
 
 
e95a3a8
b04464a
5bf65b0
e274fdd
 
8edb003
 
 
 
 
 
e95a3a8
5bf65b0
 
 
e95a3a8
5bf65b0
 
 
 
 
 
 
 
 
 
 
e95a3a8
5bf65b0
e95a3a8
e274fdd
 
 
 
e95a3a8
5bf65b0
 
e95a3a8
5bf65b0
 
 
 
e274fdd
e95a3a8
 
5bf65b0
e95a3a8
5bf65b0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from decord import VideoReader, cpu
import base64
import io
import spaces
import time

# Load model
# NOTE(review): trust_remote_code executes code shipped with the model repo at
# load time — acceptable here only because the checkpoint is the pinned
# 'openbmb/MiniCPM-V-2_6' repository.
model_path = 'openbmb/MiniCPM-V-2_6'
# bfloat16 halves memory vs fp32; weights are moved to the GPU immediately,
# so this script requires a CUDA device at import time.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
model = model.to(device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Inference only: disable dropout/batch-norm training behavior.
model.eval()

# Upper bound on the number of frames fed to the model for a single video.
MAX_NUM_FRAMES = 64

def encode_image(image):
    """Coerce *image* to a PIL RGB image and cap its longest side.

    Anything that is not already a ``PIL.Image.Image`` is treated as a path /
    file-like object and opened as RGB. If the longest side exceeds
    448 * 16 pixels the image is downscaled with bicubic resampling,
    preserving the aspect ratio.
    """
    if not isinstance(image, Image.Image):
        image = Image.open(image).convert("RGB")

    limit = 448 * 16
    width, height = image.size
    if max(width, height) > limit:
        # Scale the longest side to exactly `limit`; derive the other side
        # from the aspect ratio (integer-truncated, as the model expects).
        if width > height:
            target = (limit, int(height * limit / width))
        else:
            target = (int(width * limit / height), limit)
        image = image.resize(target, resample=Image.BICUBIC)

    return image

def encode_video(video_path):
    """Decode *video_path* into a list of RGB PIL frames for the model.

    Frames are sampled at roughly 1 fps. If that still yields more than
    MAX_NUM_FRAMES frames, the samples are thinned *uniformly* across the
    whole clip instead of truncated — truncation (the previous behavior)
    silently dropped everything after the first MAX_NUM_FRAMES seconds of
    long videos.

    Returns:
        list[PIL.Image.Image]: frames resized via encode_image.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    # Step between sampled frames for ~1 fps. Clamp to >= 1: for clips with
    # avg fps < 0.5 the rounded step would be 0 and range() would raise.
    step = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), step))
    if len(frame_idx) > MAX_NUM_FRAMES:
        # Thin uniformly so the selection spans the entire video.
        stride = len(frame_idx) / MAX_NUM_FRAMES
        frame_idx = [frame_idx[int(i * stride)] for i in range(MAX_NUM_FRAMES)]
    frames = vr.get_batch(frame_idx).asnumpy()
    return [encode_image(Image.fromarray(f.astype('uint8'))) for f in frames]

@spaces.GPU
def analyze_video(prompt, video):
    """Run the MiniCPM-V model over an uploaded video.

    Args:
        prompt: free-text question/instruction for the model.
        video: either a filesystem path (str) or a Gradio upload object
            exposing the path via ``.name``.

    Returns:
        str: the model's answer followed by the wall-clock processing time.
    """
    started = time.time()

    # Gradio may hand us a plain path or an upload wrapper.
    video_path = video if isinstance(video, str) else video.name

    frames = encode_video(video_path)

    # Single-turn conversation: the prompt text followed by every frame.
    messages = [{"role": "user", "content": [prompt] + frames}]

    generation_kwargs = {
        'sampling': True,
        'top_p': 0.8,
        'top_k': 100,
        'temperature': 0.7,
        'repetition_penalty': 1.05,
        "max_new_tokens": 2048,
        "max_inp_length": 4352,
        "use_image_id": False,
        # Long videos get fewer slices per frame to stay within context.
        "max_slice_nums": 1 if len(frames) > 16 else 2,
    }

    answer = model.chat(image=None, msgs=messages, tokenizer=tokenizer, **generation_kwargs)

    elapsed = time.time() - started
    return f"Analysis Result:\n{answer}\n\nProcessing Time: {elapsed:.2f} seconds"

# --- Gradio UI: prompt + video on the left, model output on the right. ---
with gr.Blocks() as demo:
    gr.Markdown("# Video Analyzer")

    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(label="Prompt")
            video_upload = gr.Video(label="Upload Video")
        with gr.Column():
            result_box = gr.Textbox(label="Analysis Result and Processing Time")

    run_button = gr.Button("Analyze Video")
    run_button.click(
        fn=analyze_video,
        inputs=[prompt_box, video_upload],
        outputs=result_box,
    )

demo.launch()