Update app.py

app.py CHANGED
@@ -14,13 +14,11 @@ from loguru import logger
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 
-# Load model and processor.
 model_id = os.getenv("MODEL_ID", "google/gemma-3-12b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
 )
-model.eval()  # Set model to evaluation mode.
 
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 
@@ -37,7 +35,6 @@ css = '''h1 {
 }
 '''
 
-
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
     image_count = 0
     video_count = 0
@@ -48,7 +45,6 @@ def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
             image_count += 1
     return image_count, video_count
 
-
 def count_files_in_history(history: list[dict]) -> tuple[int, int]:
     image_count = 0
     video_count = 0
@@ -61,7 +57,6 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
            image_count += 1
     return image_count, video_count
 
-
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     new_image_count, new_video_count = count_files_in_new_message(message["files"])
     history_image_count, history_video_count = count_files_in_history(history)
@@ -85,26 +80,30 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         return False
     return True
 
-
 def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-
+
+    max_frames = 5  # Limit to 5 frames to prevent memory overload
+    if total_frames <= max_frames:
+        indices = list(range(total_frames))
+    else:
+        indices = [int(i * (total_frames - 1) / (max_frames - 1)) for i in range(max_frames)]
+
     frames = []
-    for i in
+    for i in indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
+            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
+
     vidcap.release()
     return frames
 
-
 def process_video(video_path: str) -> list[dict]:
     content = []
     frames = downsample_video(video_path)
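Note: the hunk above caps video input at five evenly spaced frames rather than decoding a larger set. A minimal standalone sketch of the index computation it adds (the values passed in below are illustrative, not taken from the Space):

```python
# Evenly spaced frame indices, capped at max_frames; mirrors the hunk above.
def sample_indices(total_frames: int, max_frames: int = 5) -> list[int]:
    if total_frames <= max_frames:
        return list(range(total_frames))
    # Spread max_frames indices across [0, total_frames - 1], endpoints included.
    return [int(i * (total_frames - 1) / (max_frames - 1)) for i in range(max_frames)]


print(sample_indices(300))  # [0, 74, 149, 224, 299]
print(sample_indices(3))    # [0, 1, 2]
```

For a 30 fps clip, those indices for a 300-frame video map to timestamps of roughly 0.0, 2.47, 4.97, 7.47 and 9.97 seconds via the `round(i / fps, 2)` line.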
@@ -117,11 +116,11 @@ def process_video(video_path: str) -> list[dict]:
     logger.debug(f"{content=}")
     return content
 
-
 def process_interleaved_images(message: dict) -> list[dict]:
     logger.debug(f"{message['files']=}")
     parts = re.split(r"(<image>)", message["text"])
     logger.debug(f"{parts=}")
+
     content = []
     image_index = 0
     for part in parts:
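Note: in the `re.split(r"(<image>)", ...)` call shown above, the capturing group keeps the `<image>` delimiters in the split result, which is what lets the loop substitute one uploaded file per tag. A quick illustration with made-up text:

```python
import re

# The capturing group keeps the "<image>" delimiters in the result list.
parts = re.split(r"(<image>)", "Compare <image> with <image> and summarize.")
print(parts)
# ['Compare ', '<image>', ' with ', '<image>', ' and summarize.']
```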
@@ -137,20 +136,21 @@ def process_interleaved_images(message: dict) -> list[dict]:
     logger.debug(f"{content=}")
     return content
 
-
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
+
     if message["files"][0].endswith(".mp4"):
         return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
+
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
+
     return [
         {"type": "text", "text": message["text"]},
         *[{"type": "image", "url": path} for path in message["files"]],
     ]
 
-
 def process_history(history: list[dict]) -> list[dict]:
     messages = []
     current_user_content: list[dict] = []
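Note: `process_new_user_message` and `process_history` both emit content parts in the chat-template format used throughout this file, i.e. dicts with `"type": "text"` or `"type": "image"`. A hypothetical single-turn example of the structure handed to `processor.apply_chat_template` (the path and prompt are placeholders):

```python
# Hypothetical messages structure; the file path and prompt text are made up.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image", "url": "/tmp/upload/example.png"},
        ],
    },
]
```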
@@ -166,18 +166,8 @@ def process_history(history: list[dict]) -> list[dict]:
                current_user_content.append({"type": "text", "text": content})
            else:
                current_user_content.append({"type": "image", "url": content[0]})
-    if current_user_content:
-        messages.append({"role": "user", "content": current_user_content})
     return messages
 
-
-def generate_thread(generate_kwargs):
-    # Clear cache and run generation under no_grad.
-    torch.cuda.empty_cache()
-    with torch.no_grad():
-        model.generate(**generate_kwargs)
-
-
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
     if not validate_media_constraints(message, history):
@@ -190,21 +180,21 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
     messages.extend(process_history(history))
     messages.append({"role": "user", "content": process_new_user_message(message)})
 
-
-    raw_inputs = processor.apply_chat_template(
+    inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors="pt",
-    )
-    inputs = {k: v.to(device=model.device, dtype=torch.bfloat16) for k, v in raw_inputs.items()}
+    ).to(device=model.device, dtype=torch.bfloat16)
 
     streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-
-
-
-
+    generate_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
     output = ""
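Note: the rewritten generation path keeps the streaming pattern the request handler relies on: `model.generate` runs in a worker thread and the handler yields text pieces from the `TextIteratorStreamer`. A minimal text-only sketch of that producer/consumer loop (the `gpt2` checkpoint and prompt are placeholders, not what the Space uses):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model; the Space itself loads Gemma3ForConditionalGeneration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs in a background thread
# while this thread consumes the streamer as tokens arrive.
thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=30))
thread.start()

output = ""
for delta in streamer:
    output += delta
    print(delta, end="", flush=True)
thread.join()
```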
@@ -212,7 +202,6 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
         output += delta
         yield output
 
-
 examples = [
     [
         {
@@ -339,7 +328,7 @@ DESCRIPTION = """\
 <img src='https://huggingface.co/spaces/huggingface-projects/gemma-3-12b-it/resolve/main/assets/logo.png' id='logo' />
 
 This is a demo of Gemma 3 12B it, a vision language model with outstanding performance on a wide range of tasks.
-You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
+You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input. For videos, up to 5 frames will be extracted and processed.
 """
 
 demo = gr.ChatInterface(