hysts HF staff committed on
Commit
79f86c4
·
1 Parent(s): 00cae9a

Add media validation, improve code readability, and fix bugs

Browse files

- Implement media input validation logic (image count limits, prohibit mixing videos and images)
- Reorganize code structure to improve readability
- Fix bug where user message was ignored when processing video input

Files changed (1) hide show
  1. app.py +95 -33
app.py CHANGED
@@ -1,5 +1,6 @@
1
  #!/usr/bin/env python
2
 
 
3
  import re
4
  import tempfile
5
  from collections.abc import Iterator
@@ -13,12 +14,63 @@ from loguru import logger
13
  from PIL import Image
14
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
15
 
16
- model_id = "google/gemma-3-12b-it"
17
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
18
  model = Gemma3ForConditionalGeneration.from_pretrained(
19
  model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
24
  vidcap = cv2.VideoCapture(video_path)
@@ -41,44 +93,50 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
41
  return frames
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def process_new_user_message(message: dict) -> list[dict]:
45
  if not message["files"]:
46
  return [{"type": "text", "text": message["text"]}]
47
 
48
- if len([path for path in message["files"] if path.endswith(".mp4")]) > 1:
49
- raise gr.Error("Only one video is supported at a time.")
50
 
51
  if "<image>" in message["text"]:
52
- content = []
53
- logger.debug(f"{message['files']=}")
54
- parts = re.split(r"(<image>)", message["text"])
55
- image_index = 0
56
- logger.debug(f"{parts=}")
57
- for part in parts:
58
- logger.debug(f"{part=}")
59
- if part == "<image>":
60
- content.append({"type": "image", "url": message["files"][image_index]})
61
- logger.debug(f"file: {message['files'][image_index]}")
62
- image_index += 1
63
- elif part.strip():
64
- content.append({"type": "text", "text": part.strip()})
65
- elif isinstance(part, str) and part != "<image>":
66
- content.append({"type": "text", "text": part})
67
- logger.debug(f"{content=}")
68
- return content
69
- if message["files"][0].endswith(".mp4"):
70
- content = []
71
- video = message["files"].pop(0)
72
- frames = downsample_video(video)
73
- for frame in frames:
74
- pil_image, timestamp = frame
75
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
76
- pil_image.save(temp_file.name)
77
- content.append({"type": "text", "text": f"Frame {timestamp}:"})
78
- content.append({"type": "image", "url": temp_file.name})
79
- logger.debug(f"{content=}")
80
- return content
81
- # non interleaved images
82
  return [
83
  {"type": "text", "text": message["text"]},
84
  *[{"type": "image", "url": path} for path in message["files"]],
@@ -105,6 +163,10 @@ def process_history(history: list[dict]) -> list[dict]:
105
 
106
  @spaces.GPU(duration=120)
107
  def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
 
 
 
 
108
  messages = []
109
  if system_prompt:
110
  messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
 
1
  #!/usr/bin/env python
2
 
3
+ import os
4
  import re
5
  import tempfile
6
  from collections.abc import Iterator
 
14
  from PIL import Image
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
16
 
17
+ model_id = os.getenv("MODEL_ID", "google/gemma-3-12b-it")
18
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
19
  model = Gemma3ForConditionalGeneration.from_pretrained(
20
  model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
21
  )
22
 
23
+ MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
24
+
25
+
26
def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
    """Count images and videos among newly uploaded file paths.

    Args:
        paths: File paths attached to the incoming user message.

    Returns:
        A ``(image_count, video_count)`` pair. Any path that does not end
        in ``.mp4`` is counted as an image.
    """
    video_count = sum(1 for path in paths if path.endswith(".mp4"))
    # Everything that is not a video is treated as an image.
    return len(paths) - video_count, video_count
35
+
36
+
37
def count_files_in_history(history: list[dict]) -> tuple[int, int]:
    """Count images and videos already present in the chat history.

    Args:
        history: Chat history items. Text-only turns carry a ``str``
            content; media turns carry a sequence whose first element is
            the file path.

    Returns:
        A ``(image_count, video_count)`` pair over user media turns.
    """
    image_count, video_count = 0, 0
    # Only user turns can carry files; text-only turns (str content) are skipped.
    media_turns = (
        item
        for item in history
        if item["role"] == "user" and not isinstance(item["content"], str)
    )
    for item in media_turns:
        if item["content"][0].endswith(".mp4"):
            video_count += 1
        else:
            image_count += 1
    return image_count, video_count
48
+
49
+
50
def validate_media_constraints(message: dict, history: list[dict]) -> bool:
    """Check that the media attached across the conversation is acceptable.

    Combines the files in the new message with those already present in
    the history and enforces: at most one video, no mixing of videos with
    images or ``<image>`` tags, at most ``MAX_NUM_IMAGES`` images, and a
    one-to-one match between ``<image>`` tags and newly attached images.

    Args:
        message: Incoming multimodal message with ``text`` and ``files`` keys.
        history: Prior chat turns, used to count already-attached media.

    Returns:
        True when all constraints hold; otherwise emits a ``gr.Warning``
        describing the violation and returns False.
    """
    new_images, new_videos = count_files_in_new_message(message["files"])
    old_images, old_videos = count_files_in_history(history)
    total_images = old_images + new_images
    total_videos = old_videos + new_videos

    if total_videos > 1:
        gr.Warning("Only one video is supported.")
        return False
    if total_videos == 1:
        if total_images > 0:
            gr.Warning("Mixing images and videos is not allowed.")
            return False
        if "<image>" in message["text"]:
            gr.Warning("Using <image> tags with video files is not supported.")
            return False
        # TODO: Add frame count validation for videos similar to image count limits # noqa: FIX002, TD002, TD003
    if total_videos == 0 and total_images > MAX_NUM_IMAGES:
        gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
        return False
    # Tags are matched against the *new* images only: history images were
    # validated on their own turn.
    if "<image>" in message["text"] and message["text"].count("<image>") != new_images:
        gr.Warning("The number of <image> tags in the text does not match the number of images.")
        return False
    return True
73
+
74
 
75
  def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
76
  vidcap = cv2.VideoCapture(video_path)
 
93
  return frames
94
 
95
 
96
def process_video(video_path: str) -> list[dict]:
    """Expand a video into interleaved frame captions and frame images.

    Each sampled frame is written to a persistent temporary PNG
    (``delete=False`` so the file outlives this call and the downstream
    consumer can still read it) and paired with a ``Frame <timestamp>:``
    text entry.

    Args:
        video_path: Path to the video file to sample.

    Returns:
        Content entries alternating between text and image dicts.
    """
    content = []
    for pil_image, timestamp in downsample_video(video_path):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            pil_image.save(temp_file.name)
        content.append({"type": "text", "text": f"Frame {timestamp}:"})
        content.append({"type": "image", "url": temp_file.name})
    logger.debug(f"{content=}")
    return content
107
+
108
+
109
def process_interleaved_images(message: dict) -> list[dict]:
    """Split text containing ``<image>`` tags into interleaved content.

    The text is split on ``<image>`` markers; each marker consumes the
    next path from ``message["files"]`` in order, and the surrounding
    text fragments become text entries.

    Fix: ``re.split(r"(<image>)", ...)`` produces empty strings at the
    boundaries (e.g. when the text starts or ends with a tag, or two tags
    are adjacent). The previous final branch
    (``elif isinstance(part, str) and part != "<image>"``) was vacuously
    true for those — ``re.split`` always returns strings and the first
    branch already excluded the tag — so empty
    ``{"type": "text", "text": ""}`` entries were appended. Empty
    fragments are now dropped; whitespace-only fragments are still kept
    verbatim so spacing between adjacent images is preserved.

    Args:
        message: Multimodal message with ``text`` and ``files`` keys.

    Returns:
        Content entries interleaving text and image dicts.
    """
    logger.debug(f"{message['files']=}")
    parts = re.split(r"(<image>)", message["text"])
    logger.debug(f"{parts=}")

    content = []
    image_index = 0
    for part in parts:
        logger.debug(f"{part=}")
        if part == "<image>":
            content.append({"type": "image", "url": message["files"][image_index]})
            logger.debug(f"file: {message['files'][image_index]}")
            image_index += 1
        elif part.strip():
            content.append({"type": "text", "text": part.strip()})
        elif part:
            # Whitespace-only fragment between tags: keep as-is.
            content.append({"type": "text", "text": part})
    logger.debug(f"{content=}")
    return content
128
+
129
+
130
  def process_new_user_message(message: dict) -> list[dict]:
131
  if not message["files"]:
132
  return [{"type": "text", "text": message["text"]}]
133
 
134
+ if message["files"][0].endswith(".mp4"):
135
+ return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
136
 
137
  if "<image>" in message["text"]:
138
+ return process_interleaved_images(message)
139
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  return [
141
  {"type": "text", "text": message["text"]},
142
  *[{"type": "image", "url": path} for path in message["files"]],
 
163
 
164
  @spaces.GPU(duration=120)
165
  def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
166
+ if not validate_media_constraints(message, history):
167
+ yield ""
168
+ return
169
+
170
  messages = []
171
  if system_prompt:
172
  messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})