import re
import time
from threading import Thread

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

#####################################
# 1. Load Model & Processor
#####################################
MODEL_ID = "google/gemma-3-12b-it"  # Adjust to your needs

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Gemma3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
).to("cuda")
model.eval()


#####################################
# 2. Helper Function: Capture Live Frames
#####################################
def capture_live_frames(duration=5, num_frames=10):
    """
    Capture evenly spaced frames from the default webcam.

    The camera is read for at most ``duration`` seconds (or
    ``duration * fps`` frames, whichever comes first) and up to
    ``num_frames`` of the frames read are kept.

    Args:
        duration: Capture length in seconds. ``None`` or a non-positive
            value yields an empty result.
        num_frames: Maximum number of frames to keep. A non-positive
            value yields an empty result.

    Returns:
        A list of ``(PIL.Image, timestamp_seconds)`` tuples in capture
        order. The list is empty when the webcam cannot be opened or no
        frame could be read; it may be shorter than ``num_frames`` when the
        camera delivers frames more slowly than its reported FPS.
    """
    if duration is None or duration <= 0 or num_frames <= 0:
        return []

    cap = cv2.VideoCapture(0)  # Use default webcam
    try:
        if not cap.isOpened():
            return []

        # Try to get FPS, default to 30 if not available.
        # ``not fps > 0`` also rejects NaN, which some backends report.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30

        total_frames_to_capture = int(duration * fps)
        if total_frames_to_capture <= 0:
            return []

        # A set gives O(1) membership tests inside the capture loop.
        wanted_indices = set(
            np.linspace(0, total_frames_to_capture - 1, num_frames, dtype=int).tolist()
        )

        captured_frames = []
        start_time = time.monotonic()
        for frame_count in range(total_frames_to_capture):
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count in wanted_indices:
                # Convert BGR (OpenCV) to RGB (PIL)
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(frame_rgb)
                timestamp = round(frame_count / fps, 2)
                captured_frames.append((pil_image, timestamp))
            # Break if the elapsed time exceeds the duration.
            if time.monotonic() - start_time > duration:
                break
        return captured_frames
    finally:
        # Always free the camera, even if reading or conversion fails.
        cap.release()


#####################################
# 3. Live Inference Function
#####################################
def live_inference(duration=5):
    """
    Capture live webcam frames, prompt the model with them, and return its reply.

    Args:
        duration: Capture length in seconds, forwarded to
            ``capture_live_frames``.

    Returns:
        The generated description, or an explanatory message when no
        frames could be captured.

    Raises:
        RuntimeError: If text generation fails in the worker thread.
    """
    frames = capture_live_frames(duration=duration, num_frames=10)
    if not frames:
        return "Could not capture live frames from the webcam."

    # Build prompt using the captured frames.
    content = [{"type": "text", "text": "Please describe what's happening in this live video."}]
    for image, ts in frames:
        content.append({"type": "text", "text": f"Frame at {ts} seconds:"})
        content.append({"type": "image", "image": image})
    messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    frame_images = [img for img, _ in frames]

    # Move inputs to the model's device and cast floating-point tensors
    # (pixel values) to the model dtype so they match the bfloat16 weights.
    inputs = processor(
        text=[prompt],
        images=frame_images,
        return_tensors="pt",
        padding=True
    ).to(model.device, dtype=model.dtype)

    # Generate text using streaming.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
    errors = []

    def _generate():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # surfaced to the caller after join()
            errors.append(exc)
            # Signal end-of-stream so the consumer loop below cannot hang.
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()
    generated_text = "".join(streamer)
    thread.join()

    if errors:
        raise RuntimeError("Text generation failed.") from errors[0]
    return generated_text


#####################################
# 4. Build Gradio Live App
#####################################
def build_live_app():
    """
    Build the Gradio Blocks UI for live video analysis.

    Returns:
        The ``gr.Blocks`` app, ready for ``launch()``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# **Live Video Analysis**\n\nPress **Start** to capture a few seconds of live video from your webcam and analyze the content.")
        with gr.Column():
            duration_input = gr.Number(label="Capture Duration (seconds)", value=5, precision=0)
            start_btn = gr.Button("Start")
            output_text = gr.Textbox(label="Model Output")
            restart_btn = gr.Button("Start Again", visible=False)

        # This function triggers the live inference and also makes the restart button visible.
        def start_inference(duration):
            text = live_inference(duration)
            return text, gr.update(visible=True)

        start_btn.click(fn=start_inference, inputs=duration_input, outputs=[output_text, restart_btn])
        restart_btn.click(fn=start_inference, inputs=duration_input, outputs=[output_text, restart_btn])
    return demo


if __name__ == "__main__":
    app = build_live_app()
    app.launch(debug=True)