Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -14,16 +14,17 @@ from loguru import logger
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 
+# Load model and processor.
 model_id = os.getenv("MODEL_ID", "google/gemma-3-12b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
 )
+model.eval()  # Ensure the model is in evaluation mode.
 
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 
-
-css= '''h1 {
+css = '''h1 {
     text-align: center;
     display: block;
 }
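For context, a minimal sketch of how the processor and model loaded above are typically driven for a single text-only turn; the prompt and decoding parameters here are illustrative and not part of this commit.

# Illustrative only: exercising the objects loaded above outside Gradio.
messages = [{"role": "user", "content": [{"type": "text", "text": "Describe Gemma 3 in one sentence."}]}]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output_ids[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True))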
@@ -37,7 +38,6 @@ css= '''h1 {
 '''
 
 
-
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
     image_count = 0
     video_count = 0
@@ -77,7 +77,6 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         if "<image>" in message["text"]:
             gr.Warning("Using <image> tags with video files is not supported.")
             return False
-    # TODO: Add frame count validation for videos similar to image count limits  # noqa: FIX002, TD002, TD003
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
@@ -91,19 +90,17 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    frame_interval = int(fps / 3)
+    # Calculate frame interval (approximately one frame every 1/3 second).
+    frame_interval = int(fps / 3) if fps > 0 else 1
     frames = []
-
     for i in range(0, total_frames, frame_interval):
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
+            timestamp = round(i / fps, 2) if fps > 0 else 0
             frames.append((pil_image, timestamp))
-
     vidcap.release()
     return frames
 
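process_video() itself is outside this diff; as a rough sketch (an assumption about the Space's helper, not its actual code), the (frame, timestamp) pairs returned by downsample_video() can be expanded into interleaved text/image content like this:

import tempfile

def frames_to_content(video_path: str) -> list[dict]:
    # Sketch only: the real process_video() in app.py is not shown in this diff.
    content = []
    for frame, timestamp in downsample_video(video_path):
        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        frame.save(tmp.name)  # frame is a PIL.Image
        content.append({"type": "text", "text": f"Frame at {timestamp}s:"})
        content.append({"type": "image", "url": tmp.name})
    return content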
@@ -125,7 +122,6 @@ def process_interleaved_images(message: dict) -> list[dict]:
     logger.debug(f"{message['files']=}")
     parts = re.split(r"(<image>)", message["text"])
     logger.debug(f"{parts=}")
-
     content = []
     image_index = 0
     for part in parts:
@@ -145,13 +141,10 @@ def process_interleaved_images(message: dict) -> list[dict]:
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
-
     if message["files"][0].endswith(".mp4"):
         return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
-
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
-
     return [
         {"type": "text", "text": message["text"]},
         *[{"type": "image", "url": path} for path in message["files"]],
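For reference, with the fall-through branch above, a message carrying plain image files expands into one text entry followed by one image entry per file (paths below are made up):

message = {"text": "Compare these two photos.", "files": ["/tmp/a.png", "/tmp/b.png"]}
# process_new_user_message(message) ->
# [
#     {"type": "text", "text": "Compare these two photos."},
#     {"type": "image", "url": "/tmp/a.png"},
#     {"type": "image", "url": "/tmp/b.png"},
# ]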
@@ -173,9 +166,18 @@ def process_history(history: list[dict]) -> list[dict]:
                 current_user_content.append({"type": "text", "text": content})
             else:
                 current_user_content.append({"type": "image", "url": content[0]})
+    if current_user_content:
+        messages.append({"role": "user", "content": current_user_content})
     return messages
 
 
+def generate_thread(generate_kwargs):
+    # Empty cache to free up memory and run generation under no_grad.
+    torch.cuda.empty_cache()
+    with torch.no_grad():
+        model.generate(**generate_kwargs)
+
+
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
     if not validate_media_constraints(message, history):
@@ -198,11 +200,12 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
 
     streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        inputs,
+        inputs=inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    # Launch generation in a separate thread using our no_grad wrapper.
+    t = Thread(target=generate_thread, kwargs={"generate_kwargs": generate_kwargs})
     t.start()
 
     output = ""
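The remainder of run() is outside this hunk; the TextIteratorStreamer started above is typically drained like this (a sketch of the usual pattern, not necessarily the exact code in app.py):

# Sketch: yield the accumulated text as the background thread produces tokens.
for delta in streamer:
    output += delta
    yield output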