Spaces: Running on Zero
File size: 5,613 Bytes
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)
import spaces
import time
# Load Model & Processor
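# Note (assumption): a CUDA device is expected to be reachable when the module loads.
# On ZeroGPU Spaces the `spaces` package provisions the GPU around calls to
# functions decorated with @spaces.GPU (see video_inference below).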
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
).to("cuda")
model.eval()
# Helper Function: Downsample Video
def downsample_video(video_path, max_duration=10, num_frames=10):
    """
    Downsamples the video to `num_frames` evenly spaced frames within the first `max_duration` seconds.
    Returns a list of (PIL Image, timestamp) tuples.
    """
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    if fps <= 0 or total_frames <= 0:
        vidcap.release()
        return []
    # Limit to first `max_duration` seconds
    max_frames = min(int(fps * max_duration), total_frames)
    frame_indices = np.linspace(0, max_frames - 1, num_frames, dtype=int)
    frames = []
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
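# A quick standalone check (hypothetical, not used by the app): the helper can be
# exercised on a local file to verify the sampled timestamps, e.g.
#   frames = downsample_video("sample.mp4", max_duration=10, num_frames=10)
#   for img, ts in frames:
#       print(ts, img.size)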
# Inference Function
@spaces.GPU
def video_inference(video_file):
    """
    Processes the video file and generates a text description based on the first 10 seconds.
    Returns the generated text.
    """
    if video_file is None:
        return "No video provided."
    frames = downsample_video(video_file, max_duration=10, num_frames=10)
    if not frames:
        return "Could not read frames from video."
    # Construct prompt: a single user turn with the instruction, followed by an
    # interleaved "Frame at <t> seconds:" label and image for each sampled frame
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": "Please describe what's happening in this video."}]
        }
    ]
    for (image, ts) in frames:
        messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
        messages[0]["content"].append({"type": "image", "image": image})
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    frame_images = [img for (img, _) in frames]
    inputs = processor(
        text=[prompt],
        images=frame_images,
        return_tensors="pt",
        padding=True
    ).to("cuda")
    # Generate text with streaming; the processor forwards decode() to its tokenizer
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        time.sleep(0.01)
    thread.join()  # make sure generation has finished before returning
    return generated_text
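# Note: the streamer is consumed inside video_inference, so the UI only receives the
# final string once generation completes. To stream partial text into the Textbox,
# one possible sketch (not what this app does) is to turn the function into a
# generator and `yield generated_text` inside the loop above.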
# Button Toggle Function
def toggle_button(has_result):
    """
    Returns visibility states for start_again_btn and start_btn based on has_result.
    """
    if has_result:
        return gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=False), gr.update(visible=True)
# Build the Gradio App
def build_app():
    with gr.Blocks() as demo:
        gr.Markdown("""
# **Qwen2-VL Live Video Analysis**
Press **Start** to record a short video clip (up to 10 seconds). Stop recording to see the analysis.
After the result, press **Start Again** to analyze another clip.
""")
        # State to track if a result has been generated
        has_result = gr.State(value=False)

        with gr.Row():
            with gr.Column():
                video = gr.Video(
                    sources=["webcam"],
                    label="Webcam Recording",
                    format="mp4"
                )
                # Two buttons: one for Start, one for Start Again
                start_btn = gr.Button("Start", visible=True)
                start_again_btn = gr.Button("Start Again", visible=False)
            with gr.Column():
                output_text = gr.Textbox(label="Model Output")

        # When a recording is stopped, process it. The change event also fires when
        # the video is cleared programmatically, which the None check handles.
        def process_video(video_file, has_result_state):
            if video_file is None:
                return "Please record a video.", has_result_state
            result = video_inference(video_file)
            return result, True

        video.change(
            fn=process_video,
            inputs=[video, has_result],
            outputs=[output_text, has_result]
        )

        # Update button visibility based on has_result
        has_result.change(
            fn=toggle_button,
            inputs=has_result,
            outputs=[start_again_btn, start_btn]
        )

        # Clicking either button resets the video and output
        def reset_state():
            return None, "", False

        start_btn.click(
            fn=reset_state,
            inputs=None,
            outputs=[video, output_text, has_result]
        )
        start_again_btn.click(
            fn=reset_state,
            inputs=None,
            outputs=[video, output_text, has_result]
        )
    return demo
if __name__ == "__main__":
    app = build_app()
    app.launch(debug=True)