Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -6,15 +6,16 @@ import matplotlib.pyplot as plt
 import random
 import spaces
 import time
+import re
 from PIL import Image
 from threading import Thread
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 from transformers.image_utils import load_image
 
 #####################################
-# 1. Load
+# 1. Load Model & Processor
 #####################################
-MODEL_ID = "google/gemma-3-12b-it"  # Example
+MODEL_ID = "google/gemma-3-12b-it"  # Example model ID (adjust to your needs)
 
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Gemma3ForConditionalGeneration.from_pretrained(
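The `from_pretrained(` call is cut off at the hunk boundary, so its arguments are not visible in this diff. For orientation only, a common way such a call is finished on a GPU Space is sketched below; the dtype and device settings are assumptions, not what this commit ships.

```python
import torch

# Assumed continuation for illustration -- the commit's actual arguments are not shown.
model = Gemma3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # assumption: half precision to fit a 12B model
    device_map="auto",           # assumption: let accelerate place the weights
).eval()
```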
@@ -52,29 +53,6 @@ def downsample_video(video_path, num_frames=10):
     vidcap.release()
     return frames
 
-#####################################
-# 2.5: Parse Categories from Model Output
-#####################################
-def parse_inferred_categories(generated_text):
-    """
-    A naive parser that looks for lines starting with 'Category:'
-    and collects the text after that as the category name.
-    Example lines in model output:
-    Category: Nutrition
-    Category: Outdoor Scenes
-    Returns a list of category strings.
-    """
-    categories = []
-    for line in generated_text.split("\n"):
-        line = line.strip()
-        # Check if the line starts with 'Category:' (case-insensitive)
-        if line.lower().startswith("category:"):
-            # Extract everything after 'Category:'
-            cat = line.split(":", 1)[1].strip()
-            if cat:
-                categories.append(cat)
-    return categories
-
 #####################################
 # 3. The Inference Function
 #####################################
@@ -82,8 +60,8 @@ def parse_inferred_categories(generated_text):
 def video_inference(video_file, duration):
     """
     - Takes a recorded video file and a chosen duration (string).
-    - Downsamples the video, passes frames to the
-    - Returns model-generated text + a bar chart
+    - Downsamples the video, passes frames to the model for inference.
+    - Returns model-generated text + a bar chart based on the text.
     """
     if video_file is None:
         return "No video provided.", None
@@ -100,6 +78,7 @@ def video_inference(video_file, duration):
             "content": [{"type": "text", "text": "Please describe what's happening in this video."}]
         }
     ]
+
     # Add frames (with timestamp) to the messages
     for (image, ts) in frames:
         messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
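The loop interleaves a timestamp caption with each frame. The image entry itself is appended outside this hunk; assuming it follows the usual `{"type": "image"}` convention for Gemma-3 chat templates, the finished structure for two frames would look roughly like this (timestamps illustrative):

```python
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe what's happening in this video."},
            {"type": "text", "text": "Frame at 0.0 seconds:"},
            {"type": "image"},  # assumption: placeholder paired with frame_images[0]
            {"type": "text", "text": "Frame at 2.5 seconds:"},
            {"type": "image"},  # assumption: placeholder paired with frame_images[1]
        ],
    }
]
```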
@@ -108,7 +87,7 @@ def video_inference(video_file, duration):
     # Prepare final prompt
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-    #
+    # Gather images for the model
     frame_images = [img for (img, _) in frames]
 
     inputs = processor(
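Downstream of this hunk the code accumulates `new_text` from a streamer (visible as context in the next hunk). The standard `TextIteratorStreamer` pattern that produces that loop, sketched with an assumed `max_new_tokens`:

```python
from threading import Thread

from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(
    processor.tokenizer, skip_prompt=True, skip_special_tokens=True
)
# generate() runs in a background thread so tokens can be consumed as they arrive
Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512)).start()

generated_text = ""
for new_text in streamer:
    generated_text += new_text  # same accumulation as in the diff
```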
@@ -130,23 +109,37 @@ def video_inference(video_file, duration):
         generated_text += new_text
         time.sleep(0.01)
 
-    # 3.4:
-
-
-
-
+    # 3.4: Build a bar chart based on top keywords from the generated text
+    # (Naive approach: frequency of top 5 words)
+    words = re.findall(r'\w+', generated_text.lower())
+    freq = {}
+    for w in words:
+        freq[w] = freq.get(w, 0) + 1
+
+    # Sort words by frequency (descending)
+    sorted_items = sorted(freq.items(), key=lambda x: x[1], reverse=True)
+    # Pick top 5 words (if fewer than 5, pick all)
+    top5 = sorted_items[:5]
 
-
-
+    if not top5:
+        # If there's no text or no valid words, return no chart
+        return generated_text, None
 
-
+    categories = [item[0] for item in top5]
+    values = [item[1] for item in top5]
+
+    # Create the figure
     fig, ax = plt.subplots()
-
-
-
-
-
+    colors = ["#4B0082", "#9370DB", "#8A2BE2", "#DA70D6", "#BA55D3"]  # Purple-ish palette
+    # Make sure we have enough colors for the number of bars
+    color_list = colors[: len(categories)]
+
+    ax.bar(categories, values, color=color_list)
+    ax.set_title("Top Keywords in Generated Description")
+    ax.set_ylabel("Frequency")
+    ax.set_xlabel("Keyword")
 
+    # Return the final text and the figure
     return generated_text, fig
 
 #####################################
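The chart logic added above is self-contained and easy to sanity-check without running the model. A minimal standalone sketch of the same naive approach, using `collections.Counter` in place of the commit's hand-rolled dict purely as a convenience:

```python
import re
from collections import Counter

import matplotlib.pyplot as plt


def keyword_chart(text, top_n=5):
    """Bar chart of the most frequent words in text (naive: no stopword filtering)."""
    words = re.findall(r"\w+", text.lower())
    if not words:
        return None
    top = Counter(words).most_common(top_n)
    labels = [w for w, _ in top]
    counts = [c for _, c in top]
    fig, ax = plt.subplots()
    ax.bar(labels, counts, color=["#4B0082", "#9370DB", "#8A2BE2", "#DA70D6", "#BA55D3"][: len(labels)])
    ax.set_title("Top Keywords in Generated Description")
    ax.set_xlabel("Keyword")
    ax.set_ylabel("Frequency")
    return fig


fig = keyword_chart("The dog chases the ball and the dog barks.")
# Without a stopword list, filler words like "the" dominate the chart.
```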
@@ -155,7 +148,7 @@ def video_inference(video_file, duration):
 def build_app():
     with gr.Blocks() as demo:
         gr.Markdown("""
-        # **
+        # **Gemma-3 (Example) Live Video Analysis**
         Record a video (from webcam or file), then click **Stop**.
         Next, click **Analyze** to run the model and see textual + chart outputs.
         """)
@@ -168,8 +161,9 @@ def build_app():
             label="Suggested Recording Duration (seconds)",
             info="Select how long you plan to record before pressing Stop."
         )
+        # For older Gradio versions, avoid `source="webcam"`.
         video = gr.Video(
-            label="Webcam Recording (press Record, then Stop)",
+            label="Webcam Recording (press the Record button, then Stop)",
             format="mp4"
         )
         analyze_btn = gr.Button("Analyze", variant="primary")
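The new comment concerns a parameter rename: Gradio 3.x accepted `gr.Video(source="webcam")`, while 4.x replaced it with the plural `sources=[...]`. Omitting the argument, as this commit does, keeps both upload and webcam available on either version. If webcam-only capture is wanted, a version guard along these lines is one option (a sketch, assuming only the 3.x/4.x split matters):

```python
import gradio as gr

def webcam_video(label="Webcam Recording (press the Record button, then Stop)"):
    # Gradio 4.x: plural `sources`; Gradio 3.x: singular `source`.
    if int(gr.__version__.split(".")[0]) >= 4:
        return gr.Video(sources=["webcam"], label=label, format="mp4")
    return gr.Video(source="webcam", label=label, format="mp4")
```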