echarlaix (HF Staff) committed
Commit 175c9ee · 1 Parent(s): bc84cb4
Files changed (4)
  1. app.py +202 -133
  2. assets/cat.jpeg +0 -0
  3. assets/holding_phone.mp4 +3 -0
  4. style.css +4 -0
app.py CHANGED
@@ -1,177 +1,246 @@
-import gradio as gr
-from transformers import (
-    AutoProcessor,
-    AutoModelForImageTextToText,
-    TextIteratorStreamer,
-)
 from threading import Thread
-import re
-import time

 from optimum.intel import OVModelForVisualCausalLM

-
 # model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
-# model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
-model_id = "echarlaix/SmolVLM2-500M-Video-Instruct-openvino"

 processor = AutoProcessor.from_pretrained(model_id)
 model = OVModelForVisualCausalLM.from_pretrained(model_id)


-def model_inference(input_dict, history, max_tokens):
-    text = input_dict["text"]
-    images = []
-    user_content = []
-    media_queue = []
-    if history == []:
-        text = input_dict["text"].strip()
-
-        for file in input_dict.get("files", []):
-            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-                media_queue.append({"type": "image", "path": file})
-            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
-                media_queue.append({"type": "video", "path": file})
-
-        if "<image>" in text or "<video>" in text:
-            parts = re.split(r"(<image>|<video>)", text)
-            for part in parts:
-                if part == "<image>" and media_queue:
-                    user_content.append(media_queue.pop(0))
-                elif part == "<video>" and media_queue:
-                    user_content.append(media_queue.pop(0))
-                elif part.strip():
-                    user_content.append({"type": "text", "text": part.strip()})
         else:
-            user_content.append({"type": "text", "text": text})
-
-        for media in media_queue:
-            user_content.append(media)
-
-        resulting_messages = [{"role": "user", "content": user_content}]
-
-    elif len(history) > 0:
-        resulting_messages = []
-        user_content = []
-        media_queue = []
-        for hist in history:
-            if hist["role"] == "user" and isinstance(hist["content"], tuple):
-                file_name = hist["content"][0]
-                if file_name.endswith((".png", ".jpg", ".jpeg")):
-                    media_queue.append({"type": "image", "path": file_name})
-                elif file_name.endswith(".mp4"):
-                    media_queue.append({"type": "video", "path": file_name})
-
-        for hist in history:
-            if hist["role"] == "user" and isinstance(hist["content"], str):
-                text = hist["content"]
-                parts = re.split(r"(<image>|<video>)", text)
-
-                for part in parts:
-                    if part == "<image>" and media_queue:
-                        user_content.append(media_queue.pop(0))
-                    elif part == "<video>" and media_queue:
-                        user_content.append(media_queue.pop(0))
-                    elif part.strip():
-                        user_content.append({"type": "text", "text": part.strip()})
-
-            elif hist["role"] == "assistant":
-                resulting_messages.append({"role": "user", "content": user_content})
-                resulting_messages.append(
-                    {
-                        "role": "assistant",
-                        "content": [{"type": "text", "text": hist["content"]}],
-                    }
-                )
-                user_content = []
-
-    if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-
-    if text == "" and images:
-        gr.Error("Please input a text query along the images(s).")
-    # print("resulting_messages", resulting_messages)
     inputs = processor.apply_chat_template(
-        resulting_messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors="pt",
     )
-
-    # Generate
-    streamer = TextIteratorStreamer(
-        processor, skip_prompt=True, skip_special_tokens=True
     )
-    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
-    # generated_text = ""

-    thread = Thread(target=model.generate, kwargs=generation_args)
-    thread.start()
-
-    yield "..."
-    buffer = ""
-
-    for new_text in streamer:
-        buffer += new_text
-        # generated_text_without_prompt = buffer#[len(ext_buffer):]
-        time.sleep(0.01)
-        yield buffer


 examples = [
     [
         {
-            "text": "Where do the severe droughts happen according to this diagram?",
-            "files": ["example_images/examples_weather_events.png"],
-        }
-    ],
-    [
-        {
-            "text": "What art era this artpiece <image> and this artpiece <image> belong to?",
-            "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"],
-        }
-    ],
-    [ {
-            "text": "Describe this image.",
-            "files": ["example_images/mosque.jpg"]
-        }
-    ],
-    [
-        {
-            "text": "When was this purchase made and how much did it cost?",
-            "files": ["example_images/fiche.jpg"],
         }
     ],
     [
         {
-            "text": "What is the date in this document?",
-            "files": ["example_images/document.jpg"],
         }
     ],
     [
         {
-            "text": "What is happening in the video?",
-            "files": ["example_images/short.mp4"],
         }
     ],
 ]
 demo = gr.ChatInterface(
-    fn=model_inference,
-    title="SmolVLM2: The Smollest Video Model Ever 📺",
-    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
-    examples=examples,
     textbox=gr.MultimodalTextbox(
-        label="Query Input", file_types=["image", ".mp4"], file_count="multiple"
     ),
-    stop_btn="Stop Generation",
     multimodal=True,
-    cache_examples=False,
     additional_inputs=[
-        gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")
     ],
-    type="messages",
 )


-demo.launch(debug=True)
+import os
+import pathlib
+import tempfile
+from collections.abc import Iterator
 from threading import Thread

+import av
+import gradio as gr
+import torch
+from gradio.utils import get_upload_folder
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from transformers.generation.streamers import TextIteratorStreamer
 from optimum.intel import OVModelForVisualCausalLM

 # model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
+model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
+# model_id = "echarlaix/SmolVLM2-500M-Video-Instruct-openvino"

 processor = AutoProcessor.from_pretrained(model_id)
 model = OVModelForVisualCausalLM.from_pretrained(model_id)

+IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
+VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")
+
+GRADIO_TEMP_DIR = get_upload_folder()
+
+TARGET_FPS = int(os.getenv("TARGET_FPS", "3"))
+MAX_FRAMES = int(os.getenv("MAX_FRAMES", "30"))
+MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "10_000"))
+
+
+def get_file_type(path: str) -> str:
+    if path.endswith(IMAGE_FILE_TYPES):
+        return "image"
+    if path.endswith(VIDEO_FILE_TYPES):
+        return "video"
+    error_message = f"Unsupported file type: {path}"
+    raise ValueError(error_message)

+
+def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
+    video_count = 0
+    non_video_count = 0
+    for path in paths:
+        if path.endswith(VIDEO_FILE_TYPES):
+            video_count += 1
+        else:
+            non_video_count += 1
+    return video_count, non_video_count
+
+
+def validate_media_constraints(message: dict) -> bool:
+    video_count, non_video_count = count_files_in_new_message(message["files"])
+    if video_count > 1:
+        gr.Warning("Only one video is supported.")
+        return False
+    if video_count == 1 and non_video_count > 0:
+        gr.Warning("Mixing images and videos is not allowed.")
+        return False
+    return True
+
+
+def extract_frames_to_tempdir(
+    video_path: str,
+    target_fps: float,
+    max_frames: int | None = None,
+    parent_dir: str | None = None,
+    prefix: str = "frames_",
+) -> str:
+    temp_dir = tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
+
+    container = av.open(video_path)
+    video_stream = container.streams.video[0]
+
+    if video_stream.duration is None or video_stream.time_base is None:
+        raise ValueError("video_stream is missing duration or time_base")
+
+    time_base = video_stream.time_base
+    duration = float(video_stream.duration * time_base)
+    interval = 1.0 / target_fps
+
+    total_frames = int(duration * target_fps)
+    if max_frames is not None:
+        total_frames = min(total_frames, max_frames)
+
+    target_times = [i * interval for i in range(total_frames)]
+    target_index = 0
+
+    for frame in container.decode(video=0):
+        if frame.pts is None:
+            continue
+
+        timestamp = float(frame.pts * time_base)
+
+        if target_index < len(target_times) and abs(timestamp - target_times[target_index]) < (interval / 2):
+            frame_path = pathlib.Path(temp_dir) / f"frame_{target_index:04d}.jpg"
+            frame.to_image().save(frame_path)
+            target_index += 1
+
+        if max_frames is not None and target_index >= max_frames:
+            break
+
+    container.close()
+    return temp_dir
+
+
+def process_new_user_message(message: dict) -> list[dict]:
+    if not message["files"]:
+        return [{"type": "text", "text": message["text"]}]
+
+    file_types = [get_file_type(path) for path in message["files"]]
+
+    if len(file_types) == 1 and file_types[0] == "video":
+        gr.Info(f"Video will be processed at {TARGET_FPS} FPS, max {MAX_FRAMES} frames in this Space.")
+
+        temp_dir = extract_frames_to_tempdir(
+            message["files"][0],
+            target_fps=TARGET_FPS,
+            max_frames=MAX_FRAMES,
+            parent_dir=GRADIO_TEMP_DIR,
+        )
+        paths = sorted(pathlib.Path(temp_dir).glob("*.jpg"))
+        return [
+            {"type": "text", "text": message["text"]},
+            *[{"type": "image", "image": path.as_posix()} for path in paths],
+        ]
+
+    return [
+        {"type": "text", "text": message["text"]},
+        *[{"type": file_type, file_type: path} for path, file_type in zip(message["files"], file_types, strict=True)],
+    ]
+
+
+def process_history(history: list[dict]) -> list[dict]:
+    messages = []
+    current_user_content: list[dict] = []
+    for item in history:
+        if item["role"] == "assistant":
+            if current_user_content:
+                messages.append({"role": "user", "content": current_user_content})
+                current_user_content = []
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
         else:
+            content = item["content"]
+            if isinstance(content, str):
+                current_user_content.append({"type": "text", "text": content})
+            else:
+                filepath = content[0]
+                file_type = get_file_type(filepath)
+                current_user_content.append({"type": file_type, file_type: filepath})
+    return messages
+
+
+@torch.inference_mode()
+def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
+    if not validate_media_constraints(message):
+        yield ""
+        return
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+    messages.extend(process_history(history))
+    messages.append({"role": "user", "content": process_new_user_message(message)})
+
     inputs = processor.apply_chat_template(
+        messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors="pt",
     )
+    n_tokens = inputs["input_ids"].shape[1]
+    if n_tokens > MAX_INPUT_TOKENS:
+        gr.Warning(
+            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
+        )
+        yield ""
+        return
+
+    # inputs = inputs.to(device=model.device, dtype=torch.bfloat16)
+
+    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+        disable_compile=True,
     )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()

+    output = ""
+    for delta in streamer:
+        output += delta
+        yield output


 examples = [
     [
         {
+            "text": "What is the capital of France?",
+            "files": [],
         }
     ],
     [
         {
+            "text": "Describe this image in detail.",
+            "files": ["assets/cat.jpeg"],
         }
     ],
     [
         {
+            "text": "Describe this video",
+            "files": ["assets/holding_phone.mp4"],
         }
     ],
 ]
+
 demo = gr.ChatInterface(
+    fn=generate,
+    type="messages",
     textbox=gr.MultimodalTextbox(
+        file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES),
+        file_count="multiple",
+        autofocus=True,
     ),
     multimodal=True,
     additional_inputs=[
+        gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
+        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
     ],
+    stop_btn=False,
+    title="OV model",
+    examples=examples,
+    run_examples_on_click=False,
+    cache_examples=False,
+    css_paths="style.css",
+    delete_cache=(1800, 1800),
 )

+if __name__ == "__main__":
+    demo.launch()
+
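Usage note (a sketch, not part of the commit): the new generate() path can be exercised outside the Gradio UI with the same model_id, apply_chat_template call, and threaded streamer shown above. It assumes the script runs from the Space root so the assets/cat.jpeg example added in this commit is available locally.

# Minimal standalone sketch of the streaming inference path from the new app.py.
from threading import Thread

from optimum.intel import OVModelForVisualCausalLM
from transformers import AutoProcessor
from transformers.generation.streamers import TextIteratorStreamer

model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForVisualCausalLM.from_pretrained(model_id)

# Same message layout that process_new_user_message() builds for an image upload.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image", "image": "assets/cat.jpeg"},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)

# Generate in a background thread and stream decoded text, as generate() does.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=512)).start()
for delta in streamer:
    print(delta, end="", flush=True)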
 
 
assets/cat.jpeg ADDED
assets/holding_phone.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2a6ae1c4a066dc5d8940069a709c8e8bb63a6d013d4444fec5d34cf94ffd474
+size 11476815
style.css ADDED
@@ -0,0 +1,4 @@
+h1 {
+    text-align: center;
+    display: block;
+}
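Usage note (a sketch, not part of the commit): the added assets/holding_phone.mp4 is consumed through extract_frames_to_tempdir() before inference; one way to inspect the sampling behaviour is to call the helper directly. This assumes the script runs next to app.py, and note that importing app also loads the processor and model at module level.

# Minimal sketch: extract frames from the new video asset with the default limits.
import pathlib

from app import MAX_FRAMES, TARGET_FPS, extract_frames_to_tempdir

temp_dir = extract_frames_to_tempdir(
    "assets/holding_phone.mp4",
    target_fps=TARGET_FPS,  # defaults to 3 via the TARGET_FPS env var
    max_frames=MAX_FRAMES,  # defaults to 30 via the MAX_FRAMES env var
)
frames = sorted(pathlib.Path(temp_dir).glob("*.jpg"))
print(f"Extracted {len(frames)} frames to {temp_dir}")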