Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
 import torch
 import numpy as np
 import cv2
+import spaces
 import time
 import re
-import spaces
 from PIL import Image
 from threading import Thread
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
@@ -12,7 +12,7 @@ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 #####################################
 # 1. Load Model & Processor
 #####################################
-MODEL_ID = "google/gemma-3-12b-it"  # Adjust
+MODEL_ID = "google/gemma-3-12b-it"  # Adjust model ID as needed
 
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Gemma3ForConditionalGeneration.from_pretrained(
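(Lines 19–22 of the from_pretrained call are collapsed in this view. For reference, a typical invocation for this model on a ZeroGPU Space looks roughly like the sketch below; the torch_dtype and device_map values are assumptions, not taken from the diff.)

# Sketch of the collapsed call; the kwargs are assumed, not from the diff.
model = Gemma3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # assumed: bf16 is the usual choice for Gemma 3
    device_map="auto",           # assumed: let accelerate place the weights
)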
@@ -23,18 +23,32 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
 model.eval()
 
 #####################################
-# 2. Helper Function: Capture Live Frames
+# 2. Helper Function: Get a Working Camera
+#####################################
+def get_working_camera():
+    """
+    Tries camera indices 0, 1, and 2 until a working camera is found.
+    Returns the VideoCapture object or None if no camera can be opened.
+    """
+    for i in range(3):
+        cap = cv2.VideoCapture(i)
+        if cap.isOpened():
+            return cap
+    return None
+
+#####################################
+# 3. Helper Function: Capture Live Frames
 #####################################
 def capture_live_frames(duration=5, num_frames=10):
     """
-    Captures live frames from the webcam for a specified duration.
-    Returns a list of (PIL image, timestamp) tuples.
+    Captures live frames from a working webcam for a specified duration.
+    Returns a list of (PIL Image, timestamp) tuples.
     """
-    cap = cv2.VideoCapture(0)
-    if not cap.isOpened():
-        return []
+    cap = get_working_camera()
+    if cap is None:
+        return []  # No working camera found
 
-    # Try to get FPS
+    # Try to get FPS; default to 30 if not available.
     fps = cap.get(cv2.CAP_PROP_FPS)
     if fps <= 0:
         fps = 30
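(The hunk skips from the FPS fallback to the read loop, so the frame-sampling setup in old lines 41–49 is not shown. Judging from the names used below (frame_indices, start_time, frame_count, captured_frames), it plausibly looks like this sketch; total_frames is a hypothetical name.)

# Assumed shape of the elided setup between the hunks (a sketch, not the diff):
total_frames = int(duration * fps)  # hypothetical name
frame_indices = set(np.linspace(0, total_frames - 1, num_frames, dtype=int))
captured_frames = []
frame_count = 0
start_time = time.time()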
@@ -50,20 +64,19 @@ def capture_live_frames(duration=5, num_frames=10):
         if not ret:
             break
         if frame_count in frame_indices:
-            # Convert BGR
+            # Convert from BGR to RGB for PIL
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(frame_rgb)
             timestamp = round(frame_count / fps, 2)
             captured_frames.append((pil_image, timestamp))
         frame_count += 1
-        # Break if the elapsed time exceeds the duration.
         if time.time() - start_time > duration:
             break
     cap.release()
     return captured_frames
 
 #####################################
-# 3. Live Inference Function
+# 4. Live Inference Function
 #####################################
 @spaces.GPU
 def live_inference(duration=5):
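(To sanity-check the two capture helpers locally, outside the Space, something like this works; a quick test sketch, not part of the diff.)

# Quick local test of the capture path (not part of the diff):
if __name__ == "__main__":
    test_frames = capture_live_frames(duration=3, num_frames=5)
    for img, ts in test_frames:
        print(f"frame at {ts:.2f}s: size={img.size}")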
@@ -74,7 +87,7 @@ def live_inference(duration=5):
     if not frames:
         return "Could not capture live frames from the webcam."
 
-    # Build prompt using
+    # Build prompt using captured frames and timestamps.
     messages = [{
         "role": "user",
         "content": [{"type": "text", "text": "Please describe what's happening in this live video."}]
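(The lines that append the captured frames to this message list, old lines 81–92, are collapsed. Given the (PIL image, timestamp) tuples produced above, they presumably do something along these lines; the exact content-dict keys are an assumption about the processor's chat format.)

# Assumed continuation: attach each frame and its timestamp to the prompt.
for image, timestamp in frames:
    messages[0]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
    messages[0]["content"].append({"type": "image", "image": image})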
@@ -93,7 +106,7 @@ def live_inference(duration=5):
         padding=True
     ).to("cuda")
 
-    # Generate text using streaming.
+    # Generate text output using a streaming approach.
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
 
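(The collapsed lines that follow, old lines 99–107, presumably run generate on a background thread and drain the streamer, the standard TextIteratorStreamer pattern, which would explain the Thread import and the generated_text returned below.)

# Standard TextIteratorStreamer pattern (sketch of the collapsed lines):
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
    generated_text += new_text  # accumulate partial output as it streams
thread.join()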
@@ -108,7 +121,7 @@ def live_inference(duration=5):
     return generated_text
 
 #####################################
-# 4. Build Gradio Live App
+# 5. Build Gradio Live App
 #####################################
 def build_live_app():
     with gr.Blocks() as demo:
@@ -119,7 +132,7 @@ def build_live_app():
         output_text = gr.Textbox(label="Model Output")
         restart_btn = gr.Button("Start Again", visible=False)
 
-        #
+        # Function to trigger live inference and reveal the restart button
        def start_inference(duration):
            text = live_inference(duration)
            return text, gr.update(visible=True)
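(The button wiring, old lines 126–129, is collapsed. Given the components above, it is presumably close to this sketch; start_btn and duration_slider are assumed names for widgets defined in the collapsed part of the Blocks context.)

# Assumed wiring for the collapsed lines:
start_btn.click(start_inference, inputs=duration_slider,
                outputs=[output_text, restart_btn])
restart_btn.click(start_inference, inputs=duration_slider,
                  outputs=[output_text, restart_btn])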
@@ -130,4 +143,4 @@ def build_live_app():
 
 if __name__ == "__main__":
     app = build_live_app()
-    app.launch(debug=True)
+    app.launch(debug=True, share=True)