Spaces:
Runtime error
Runtime error
File size: 2,768 Bytes
e95a3a8 5bf65b0 e95a3a8 5bf65b0 b04464a e274fdd e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 b04464a 5bf65b0 e274fdd 8edb003 e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 e274fdd e95a3a8 5bf65b0 e95a3a8 5bf65b0 e274fdd e95a3a8 5bf65b0 e95a3a8 5bf65b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from decord import VideoReader, cpu
import base64
import io
import spaces
import time
# Load the checkpoint once at import time; analyze_video() reuses these
# module-level objects on every request.
model_path = 'openbmb/MiniCPM-V-2_6'
# Weights are requested in bfloat16; trust_remote_code=True is passed to both loaders.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
# Move the model onto the CUDA device before any request is served.
model = model.to(device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Put the module in evaluation mode; the visible code only runs inference.
model.eval()
# Upper bound on how many sampled frames encode_video() keeps per video.
MAX_NUM_FRAMES = 64
def encode_image(image):
    """Return a PIL image whose longest side is at most 448 * 16 pixels.

    Args:
        image: A ``PIL.Image.Image``, or anything ``Image.open`` accepts. Inputs
            that are not already PIL images are opened and converted to RGB;
            PIL inputs are used as given (no mode conversion).

    Returns:
        The input image unchanged when it already fits, otherwise a bicubic
        resize whose longer side equals the limit and whose shorter side is
        scaled by the same ratio (truncated to an integer).
    """
    if not isinstance(image, Image.Image):
        image = Image.open(image).convert("RGB")

    side_limit = 448 * 16
    width, height = image.size
    # Guard clause: images already within the limit are returned untouched.
    if max(width, height) <= side_limit:
        return image

    # Pin the longer side to the limit and scale the other side to match.
    if width > height:
        target_size = (side_limit, int(height * side_limit / width))
    else:
        target_size = (int(width * side_limit / height), side_limit)
    return image.resize(target_size, resample=Image.BICUBIC)
def encode_video(video_path):
    """Sample about one frame per second from a video and return PIL images.

    Args:
        video_path: Path of the video file, handed directly to decord's
            ``VideoReader``.

    Returns:
        A list of at most ``MAX_NUM_FRAMES`` PIL images, each passed through
        ``encode_image``. Only the FIRST ``MAX_NUM_FRAMES`` sampled frames are
        kept, so footage beyond that point is not included.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    # Step by roughly one second's worth of frames. Clamp to 1 because a
    # reported average FPS below 0.5 rounds to 0 and range() rejects a zero step.
    frame_step = max(1, round(vr.get_avg_fps()))
    # Slicing is a no-op when there are fewer than MAX_NUM_FRAMES samples.
    frame_idx = list(range(0, len(vr), frame_step))[:MAX_NUM_FRAMES]
    frames = vr.get_batch(frame_idx).asnumpy()
    return [encode_image(Image.fromarray(frame.astype('uint8'))) for frame in frames]
@spaces.GPU
def analyze_video(prompt, video):
    """Run the model on a video and report its answer with the elapsed time.

    Args:
        prompt: Text instruction sent to the model together with the frames.
        video: Either a file path string, or an object exposing the path as
            its ``.name`` attribute.

    Returns:
        A string containing the model response followed by the processing
        time in seconds (two decimals).

    Raises:
        gr.Error: If no video was supplied (``video`` is ``None``).
    """
    # Without this guard a missing upload fails on ``video.name`` with an
    # opaque AttributeError; gr.Error gives the user an actionable message.
    if video is None:
        raise gr.Error("Please upload a video before running the analysis.")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    video_path = video if isinstance(video, str) else video.name
    encoded_video = encode_video(video_path)

    # Single user turn: the prompt followed by every sampled frame.
    context = [
        {"role": "user", "content": [prompt] + encoded_video}
    ]
    params = {
        'sampling': True,
        'top_p': 0.8,
        'top_k': 100,
        'temperature': 0.7,
        'repetition_penalty': 1.05,
        "max_new_tokens": 2048,
        "max_inp_length": 4352,
        "use_image_id": False,
        # Fewer slices once more than 16 frames are sent
        # (presumably to bound input size; confirm against the model docs).
        "max_slice_nums": 1 if len(encoded_video) > 16 else 2,
    }
    response = model.chat(image=None, msgs=context, tokenizer=tokenizer, **params)

    processing_time = time.perf_counter() - start_time
    return f"Analysis Result:\n{response}\n\nProcessing Time: {processing_time:.2f} seconds"
# Build the UI: one row with two columns (prompt + video inputs, result text),
# then a button that sends both inputs to analyze_video.
# NOTE(review): the source's indentation was lost; the button is assumed to sit
# below the row, directly inside the Blocks context. Confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# Video Analyzer")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt")
            video_input = gr.Video(label="Upload Video")
        with gr.Column():
            output = gr.Textbox(label="Analysis Result and Processing Time")
    analyze_button = gr.Button("Analyze Video")
    analyze_button.click(
        fn=analyze_video,
        inputs=[prompt_input, video_input],
        outputs=output,
    )

demo.launch()
|