Spaces: Running on Zero
File size: 5,613 Bytes
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)
import spaces
import time
# Load Model & Processor
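# Note (assumption): a CUDA device is expected to be reachable when the module loads.
# On ZeroGPU Spaces the `spaces` package provisions the GPU around calls to
# functions decorated with @spaces.GPU (see video_inference below).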
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
).to("cuda")
model.eval()
# Helper Function: Downsample Video
def downsample_video(video_path, max_duration=10, num_frames=10):
    """
    Downsamples the video to `num_frames` evenly spaced frames within the first `max_duration` seconds.
    Returns a list of (PIL Image, timestamp) tuples.
    """
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    if fps <= 0 or total_frames <= 0:
        vidcap.release()
        return []
    # Limit to first `max_duration` seconds
    max_frames = min(int(fps * max_duration), total_frames)
    frame_indices = np.linspace(0, max_frames - 1, num_frames, dtype=int)
    frames = []
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
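# A quick standalone check (hypothetical, not used by the app): the helper can be
# exercised on a local file to verify the sampled timestamps, e.g.
#   frames = downsample_video("sample.mp4", max_duration=10, num_frames=10)
#   for img, ts in frames:
#       print(ts, img.size)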
# Inference Function
@spaces.GPU
def video_inference(video_file):
    """
    Processes the video file and generates a text description based on the first 10 seconds.
    Returns the generated text.
    """
    if video_file is None:
        return "No video provided."
    frames = downsample_video(video_file, max_duration=10, num_frames=10)
    if not frames:
        return "Could not read frames from video."
    # Construct prompt: a single user turn with the instruction, followed by an
    # interleaved "Frame at <t> seconds:" label and image for each sampled frame
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": "Please describe what's happening in this video."}]
        }
    ]
    for (image, ts) in frames:
        messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
        messages[0]["content"].append({"type": "image", "image": image})
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    frame_images = [img for (img, _) in frames]
    inputs = processor(
        text=[prompt],
        images=frame_images,
        return_tensors="pt",
        padding=True
    ).to("cuda")
    # Generate text with streaming; the processor forwards decode() to its tokenizer
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        time.sleep(0.01)
    thread.join()  # make sure generation has finished before returning
    return generated_text
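# Note: the streamer is consumed inside video_inference, so the UI only receives the
# final string once generation completes. To stream partial text into the Textbox,
# one possible sketch (not what this app does) is to turn the function into a
# generator and `yield generated_text` inside the loop above.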
# Button Toggle Function
def toggle_button(has_result):
    """
    Returns visibility states for start_again_btn and start_btn based on has_result.
    """
    if has_result:
        return gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=False), gr.update(visible=True)
# Build the Gradio App
def build_app():
    with gr.Blocks() as demo:
        gr.Markdown("""
# **Qwen2-VL Live Video Analysis**
Press **Start** to record a short video clip (up to 10 seconds). Stop recording to see the analysis.
After the result, press **Start Again** to analyze another clip.
""")
        # State to track if a result has been generated
        has_result = gr.State(value=False)

        with gr.Row():
            with gr.Column():
                video = gr.Video(
                    sources=["webcam"],
                    label="Webcam Recording",
                    format="mp4"
                )
                # Two buttons: one for Start, one for Start Again
                start_btn = gr.Button("Start", visible=True)
                start_again_btn = gr.Button("Start Again", visible=False)
            with gr.Column():
                output_text = gr.Textbox(label="Model Output")

        # When a recording is stopped, process it. The change event also fires when
        # the video is cleared programmatically, which the None check handles.
        def process_video(video_file, has_result_state):
            if video_file is None:
                return "Please record a video.", has_result_state
            result = video_inference(video_file)
            return result, True

        video.change(
            fn=process_video,
            inputs=[video, has_result],
            outputs=[output_text, has_result]
        )

        # Update button visibility based on has_result
        has_result.change(
            fn=toggle_button,
            inputs=has_result,
            outputs=[start_again_btn, start_btn]
        )

        # Clicking either button resets the video and output
        def reset_state():
            return None, "", False

        start_btn.click(
            fn=reset_state,
            inputs=None,
            outputs=[video, output_text, has_result]
        )
        start_again_btn.click(
            fn=reset_state,
            inputs=None,
            outputs=[video, output_text, has_result]
        )
    return demo
if __name__ == "__main__":
    app = build_app()
    app.launch(debug=True)