Spaces:

prithivMLmods
/

Doc-VLMs

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 15

Commit

8081540

verified ·

1 Parent(s): f022e05

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -98

app.py CHANGED Viewed

@@ -2,18 +2,14 @@ import gradio as gr
 import torch
 import numpy as np
 import cv2
-import spaces
-import time
-import re
 from PIL import Image
 from threading import Thread
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
-#####################################
-# 1. Load Model & Processor
-#####################################
-MODEL_ID = "google/gemma-3-12b-it"  # Adjust model ID as needed
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Gemma3ForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -22,125 +18,155 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
 ).to("cuda")
 model.eval()
-#####################################
-# 2. Helper Function: Get a Working Camera
-#####################################
-def get_working_camera():
-    """
-    Tries camera indices 0, 1, and 2 until a working camera is found.
-    Returns the VideoCapture object or None if no camera can be opened.
     """
-    for i in range(3):
-        cap = cv2.VideoCapture(i)
-        if cap.isOpened():
-            return cap
-    return None
-#####################################
-# 3. Helper Function: Capture Live Frames
-#####################################
-def capture_live_frames(duration=5, num_frames=10):
-    """
-    Captures live frames from a working webcam for a specified duration.
     Returns a list of (PIL Image, timestamp) tuples.
     """
-    cap = get_working_camera()
-    if cap is None:
-        return []  # No working camera found
-    # Try to get FPS; default to 30 if not available.
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    if fps <= 0:
-        fps = 30
-    total_frames_to_capture = int(duration * fps)
-    frame_indices = np.linspace(0, total_frames_to_capture - 1, num_frames, dtype=int)
-    captured_frames = []
-    frame_count = 0
-    start_time = time.time()
-    while frame_count < total_frames_to_capture:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        if frame_count in frame_indices:
-            # Convert from BGR to RGB for PIL
-            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(frame_rgb)
-            timestamp = round(frame_count / fps, 2)
-            captured_frames.append((pil_image, timestamp))
-        frame_count += 1
-        if time.time() - start_time > duration:
-            break
-    cap.release()
-    return captured_frames
-#####################################
-# 4. Live Inference Function
-#####################################
 @spaces.GPU
-def live_inference(duration=5):
     """
-    Captures live frames from the webcam, builds a prompt, and returns the generated text.
     """
-    frames = capture_live_frames(duration=duration, num_frames=10)
     if not frames:
-        return "Could not capture live frames from the webcam."
-    # Build prompt using captured frames and timestamps.
-    messages = [{
-        "role": "user",
-        "content": [{"type": "text", "text": "Please describe what's happening in this live video."}]
-    }]
     for (image, ts) in frames:
         messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
         messages[0]["content"].append({"type": "image", "image": image})
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     frame_images = [img for (img, _) in frames]
     inputs = processor(
         text=[prompt],
         images=frame_images,
         return_tensors="pt",
         padding=True
     ).to("cuda")
-    # Generate text output using a streaming approach.
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     generated_text = ""
     for new_text in streamer:
         generated_text += new_text
         time.sleep(0.01)
     return generated_text
-#####################################
-# 5. Build Gradio Live App
-#####################################
-def build_live_app():
     with gr.Blocks() as demo:
-        gr.Markdown("# **Live Video Analysis**\n\nPress **Start** to capture a few seconds of live video from your webcam and analyze the content.")
-        with gr.Column():
-            duration_input = gr.Number(label="Capture Duration (seconds)", value=5, precision=0)
-            start_btn = gr.Button("Start")
-            output_text = gr.Textbox(label="Model Output")
-            restart_btn = gr.Button("Start Again", visible=False)
-        # Function to trigger live inference and reveal the restart button
-        def start_inference(duration):
-            text = live_inference(duration)
-            return text, gr.update(visible=True)
-        start_btn.click(fn=start_inference, inputs=duration_input, outputs=[output_text, restart_btn])
-        restart_btn.click(fn=start_inference, inputs=duration_input, outputs=[output_text, restart_btn])
     return demo
 if __name__ == "__main__":
-    app = build_live_app()
-    app.launch(debug=True, share=True)

 import torch
 import numpy as np
 import cv2
 from PIL import Image
 from threading import Thread
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
+import spaces
+import time
+# Load Model & Processor
+MODEL_ID = "google/gemma-3-12b-it"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Gemma3ForConditionalGeneration.from_pretrained(
     MODEL_ID,
 ).to("cuda")
 model.eval()
+# Helper Function: Downsample Video
def downsample_video(video_path, max_duration=10, num_frames=10):
    """
    Sample evenly spaced frames from the beginning of a video.

    At most `num_frames` frames are taken from the first `max_duration`
    seconds of the video. Clips that contain fewer frames than requested
    yield each available frame once instead of repeating frames.

    Args:
        video_path: Path of the video file, passed to `cv2.VideoCapture`.
        max_duration: Length in seconds of the leading window to sample from.
        num_frames: Maximum number of frames to return.

    Returns:
        A list of (PIL Image, timestamp) tuples, where the timestamp is the
        frame position in seconds rounded to two decimals. The list is empty
        when the arguments are non-positive, the video cannot be opened, or
        it reports no usable FPS / frame count.
    """
    # Nothing to sample; bail out before touching the video backend.
    if num_frames <= 0 or max_duration <= 0:
        return []

    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            return []

        fps = vidcap.get(cv2.CAP_PROP_FPS)
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        if fps <= 0 or total_frames <= 0:
            return []

        # Limit to the first `max_duration` seconds, but keep at least one
        # frame so the index range below never becomes negative.
        max_frames = max(1, min(int(fps * max_duration), total_frames))

        # Integer truncation of linspace repeats indices when the window holds
        # fewer frames than requested; np.unique drops the repeats (and keeps
        # the indices sorted).
        frame_indices = np.unique(
            np.linspace(0, max_frames - 1, num_frames, dtype=int)
        )

        frames = []
        for index in frame_indices:
            position = int(index)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, position)
            success, image = vidcap.read()
            if not success:
                # Skip frames the decoder cannot deliver rather than failing.
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            timestamp = round(position / fps, 2)
            frames.append((Image.fromarray(rgb_image), timestamp))
        return frames
    finally:
        # Always release the capture handle, even if decoding raised.
        vidcap.release()
+# Inference Function
 @spaces.GPU
+def video_inference(video_file):
     """
+    Processes the video file and generates a text description based on the first 10 seconds.
+    Returns the generated text.
     """
+    if video_file is None:
+        return "No video provided."
+    frames = downsample_video(video_file, max_duration=10, num_frames=10)
     if not frames:
+        return "Could not read frames from video."
+    # Construct prompt
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": "Please describe what's happening in this video."}]
+        }
+    ]
     for (image, ts) in frames:
         messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
         messages[0]["content"].append({"type": "image", "image": image})
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     frame_images = [img for (img, _) in frames]
     inputs = processor(
         text=[prompt],
         images=frame_images,
         return_tensors="pt",
         padding=True
     ).to("cuda")
+    # Generate text with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     generated_text = ""
     for new_text in streamer:
         generated_text += new_text
         time.sleep(0.01)
     return generated_text
+# Button Toggle Function
def toggle_button(has_result):
    """
    Compute the button label and the visibility updates for both buttons.

    Args:
        has_result: Truthy once a result has been generated.

    Returns:
        A 3-tuple of (label, first button update, second button update).
        With a result the label is "Start Again", the first button is
        visible and the second is hidden; without one the label is "Start"
        and the visibilities are swapped.
    """
    result_shown = bool(has_result)
    label = "Start Again" if result_shown else "Start"
    first_button = gr.Button(visible=result_shown)
    second_button = gr.Button(visible=not result_shown)
    return label, first_button, second_button
+# Build the Gradio App
def build_app():
    """
    Build the Gradio Blocks UI for webcam video analysis.

    The layout has a webcam video recorder with "Start" / "Start Again"
    buttons in one column and the model output textbox in the other. A
    `gr.State` flag records whether a result has been produced and drives
    which of the two buttons is visible.

    Returns:
        The `gr.Blocks` instance, not yet launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown("""
        # **Gemma-3 Live Video Analysis**
        Press **Start** to record a short video clip (up to 10 seconds). Stop recording to see the analysis.
        After the result, press **Start Again** to analyze another clip.
        """)

        # State to track if a result has been generated
        has_result = gr.State(value=False)

        with gr.Row():
            with gr.Column():
                # This function relies on `gr.State.change`, which requires
                # Gradio 4+; there the input device is chosen with the
                # `sources` list (the old `source=` keyword is rejected).
                video = gr.Video(
                    sources=["webcam"],
                    label="Webcam Recording",
                    format="mp4"
                )
                # Two buttons: one for Start, one for Start Again
                start_again_btn = gr.Button("Start Again", visible=False)
                start_btn = gr.Button("Start", visible=True)
            with gr.Column():
                output_text = gr.Textbox(label="Model Output")

        # When video is recorded and stopped, process it
        def process_video(video_file, has_result_state):
            """Run inference on the recorded clip and mark a result as available."""
            if video_file is None:
                # Nothing recorded (or the video was just cleared): keep the flag.
                return "Please record a video.", has_result_state
            result = video_inference(video_file)
            return result, True

        video.change(
            fn=process_video,
            inputs=[video, has_result],
            outputs=[output_text, has_result]
        )

        # Update button visibility based on has_result.
        # toggle_button returns (label, update, update): the first two values
        # both target start_again_btn, the third targets start_btn.
        has_result.change(
            fn=toggle_button,
            inputs=has_result,
            outputs=[start_again_btn, start_again_btn, start_btn]
        )

        # Clicking either button resets the video and output
        def reset_state():
            """Clear the video, empty the output text and reset the result flag."""
            return None, "", False

        start_btn.click(
            fn=reset_state,
            inputs=None,
            outputs=[video, output_text, has_result]
        )
        start_again_btn.click(
            fn=reset_state,
            inputs=None,
            outputs=[video, output_text, has_result]
        )

    return demo
 if __name__ == "__main__":
+    app = build_app()
+    app.launch(debug=True)