prithivMLmods committed on
Commit 9a23baa · verified · 1 Parent(s): 7f39c2a

Update app.py

Files changed (1)
  1. app.py +102 -137
app.py CHANGED
@@ -1,138 +1,75 @@
 import gradio as gr
-from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
+from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
 from threading import Thread
 import re
 import time
+from PIL import Image
 import torch
 import spaces
-import subprocess
-import uuid
-import cv2
-import numpy as np
-from PIL import Image
-from io import BytesIO
-
-# Install flash-attn
-subprocess.run(
-    'pip install flash-attn --no-build-isolation',
-    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
-    shell=True
-)
-
-# Load processor and model.
-processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
-model = AutoModelForImageTextToText.from_pretrained(
-    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-    _attn_implementation="flash_attention_2",
-    torch_dtype=torch.bfloat16
-).to("cuda:0")
 
-def downsample_video(video_path):
-    """
-    Extracts 10 evenly spaced frames from the video at video_path.
-    Each frame is converted from BGR to RGB and returned as a PIL Image.
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    if total_frames <= 0 or fps <= 0:
-        vidcap.release()
-        return frames
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, frame = vidcap.read()
-        if success:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(frame)
-            frames.append((pil_image, round(i / fps, 2)))
-    vidcap.release()
-    return frames
+# Load processor and model
+processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
+model = AutoModelForVision2Seq.from_pretrained(
+    "HuggingFaceTB/SmolVLM-Instruct",
+    torch_dtype=torch.bfloat16,
+).to("cuda")
 
 @spaces.GPU
-def model_inference(input_dict, history, max_tokens):
+def model_inference(
+    input_dict, history, decoding_strategy, temperature, max_new_tokens,
+    repetition_penalty, top_p
+):
     text = input_dict["text"]
-    user_content = []
-    media_queue = []
-
-    # Process input files.
-    for file in input_dict.get("files", []):
-        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-            media_queue.append({"type": "image", "path": file})
-        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
-            # Extract frames from video using OpenCV.
-            frames = downsample_video(file)
-            for frame, timestamp in frames:
-                temp_file = f"video_frame_{uuid.uuid4().hex}.png"
-                frame.save(temp_file)
-                media_queue.append({"type": "image", "path": temp_file})
+    print(input_dict["files"])
 
-    # Build the conversation messages.
-    if not history:
-        text = text.strip()
-        # Use only the "<image>" token for inserting images.
-        if "<image>" in text:
-            parts = re.split(r'(<image>)', text)
-            for part in parts:
-                if part == "<image>" and media_queue:
-                    user_content.append(media_queue.pop(0))
-                elif part.strip():
-                    user_content.append({"type": "text", "text": part.strip()})
-        else:
-            user_content.append({"type": "text", "text": text})
-            for media in media_queue:
-                user_content.append(media)
-        resulting_messages = [{"role": "user", "content": user_content}]
+    # Process input images if provided.
+    if len(input_dict["files"]) > 1:
+        images = [Image.open(image).convert("RGB") for image in input_dict["files"]]
+    elif len(input_dict["files"]) == 1:
+        images = [Image.open(input_dict["files"][0]).convert("RGB")]
     else:
-        resulting_messages = []
-        user_content = []
-        media_queue = []
-        # Process history: now only image files are expected.
-        for hist in history:
-            if hist["role"] == "user" and isinstance(hist["content"], tuple):
-                file_name = hist["content"][0]
-                if file_name.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-                    media_queue.append({"type": "image", "path": file_name})
-        for hist in history:
-            if hist["role"] == "user" and isinstance(hist["content"], str):
-                text = hist["content"]
-                parts = re.split(r'(<image>)', text)
-                for part in parts:
-                    if part == "<image>" and media_queue:
-                        user_content.append(media_queue.pop(0))
-                    elif part.strip():
-                        user_content.append({"type": "text", "text": part.strip()})
-            elif hist["role"] == "assistant":
-                resulting_messages.append({
-                    "role": "user",
-                    "content": user_content
-                })
-                resulting_messages.append({
-                    "role": "assistant",
-                    "content": [{"type": "text", "text": hist["content"]}]
-                })
-                user_content = []
+        images = []
 
-    if text == "":
+    # Validate input
+    if text == "" and not images:
         gr.Error("Please input a query and optionally image(s).")
+    if text == "" and images:
+        gr.Error("Please input a text query along with the image(s).")
 
-    print("resulting_messages", resulting_messages)
-    inputs = processor.apply_chat_template(
-        resulting_messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to(model.device)
+    # Prepare prompt using the chat template.
+    resulting_messages = [{
+        "role": "user",
+        "content": [{"type": "image"} for _ in range(len(images))] + [
+            {"type": "text", "text": text}
+        ]
+    }]
+    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=[images], return_tensors="pt")
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
-    # Generate response with streaming.
+    # Setup generation parameters.
+    generation_args = {
+        "max_new_tokens": max_new_tokens,
+        "repetition_penalty": repetition_penalty,
+    }
+    assert decoding_strategy in ["Greedy", "Top P Sampling"]
+    if decoding_strategy == "Greedy":
+        generation_args["do_sample"] = False
+    elif decoding_strategy == "Top P Sampling":
+        generation_args["temperature"] = temperature
+        generation_args["do_sample"] = True
+        generation_args["top_p"] = top_p
+
+    generation_args.update(inputs)
+
+    # Generate output with a streaming approach.
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
+    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+    generated_text = ""
+
     thread = Thread(target=model.generate, kwargs=generation_args)
     thread.start()
-
+
     yield "..."
     buffer = ""
     for new_text in streamer:
@@ -140,30 +77,58 @@ def model_inference(input_dict, history, max_tokens):
         time.sleep(0.01)
         yield buffer
 
-examples = [
-    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-    [{"text": "What art era does this artpiece <image> belong to?", "files": ["example_images/rococo.jpg"]}],
-    [{"text": "Describe this image.", "files": ["example_images/mosque.jpg"]}],
-    [{"text": "When was this purchase made and how much did it cost?", "files": ["example_images/fiche.jpg"]}],
-    [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
-    [{"text": "What is happening in the video?", "files": ["example_images/short.mp4"]}],
-]
-
+# Define the ChatInterface without examples.
 demo = gr.ChatInterface(
     fn=model_inference,
-    title="SmolVLM2: The Smollest Video Model Ever 📺",
-    description=(
-        "Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. "
-        "To get started, upload an image and text or try one of the examples. "
-        "This demo doesn't use history for the chat, so every chat you start is a new conversation."
-    ),
-    examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+    title="SmolVLM: Small yet Mighty 💫",
+    description="Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text.",
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
-    cache_examples=False,
-    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
-    type="messages"
+    additional_inputs=[
+        gr.Radio(
+            ["Top P Sampling", "Greedy"],
+            value="Greedy",
+            label="Decoding strategy",
+            info="Higher values is equivalent to sampling more low-probability tokens.",
+        ),
+        gr.Slider(
+            minimum=0.0,
+            maximum=5.0,
+            value=0.4,
+            step=0.1,
+            interactive=True,
+            label="Sampling temperature",
+            info="Higher values will produce more diverse outputs.",
+        ),
+        gr.Slider(
+            minimum=8,
+            maximum=1024,
+            value=512,
+            step=1,
+            interactive=True,
+            label="Maximum number of new tokens to generate",
+        ),
+        gr.Slider(
+            minimum=0.01,
+            maximum=5.0,
+            value=1.2,
+            step=0.01,
+            interactive=True,
+            label="Repetition penalty",
+            info="1.0 is equivalent to no penalty",
+        ),
+        gr.Slider(
+            minimum=0.01,
+            maximum=0.99,
+            value=0.8,
+            step=0.01,
+            interactive=True,
+            label="Top P",
+            info="Higher values is equivalent to sampling more low-probability tokens.",
+        )
+    ],
+    cache_examples=False
 )
 
-demo.launch(debug=True)
+demo.launch(debug=True)
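
For reference, below is a minimal standalone sketch of the streaming inference path the updated app.py relies on (SmolVLM-Instruct load, chat-template prompt with image placeholders, threaded generate feeding a TextIteratorStreamer), with the Gradio and @spaces.GPU wiring stripped out. It assumes a CUDA device is available and that the checkpoint can be downloaded from the Hub; the image path and the max_new_tokens value are placeholders, not part of the commit.

from threading import Thread

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor, TextIteratorStreamer

# Same checkpoint and dtype as the committed app.py.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Placeholder input: any local RGB image plus a text query.
images = [Image.open("example.jpg").convert("RGB")]
messages = [{
    "role": "user",
    "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": "Describe this image."}],
}]

# Build the prompt with the chat template, then tensorize text and images together.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[images], return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

# Run generation in a background thread and consume tokens as they arrive,
# mirroring the yield-based loop in model_inference (the streamer is passed the
# processor, exactly as in the committed code).
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=256))
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()

In the Space itself, max_new_tokens, the decoding strategy, and the other sampling controls are supplied by the additional_inputs widgets declared in the diff above.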