Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,370 Bytes
b8a0d2d f5ecaf8 8716c2f f5ecaf8 8081540 f5ecaf8 d9dde0d f5ecaf8 8716c2f f5ecaf8 8081540 f5ecaf8 8081540 dec51b2 d9dde0d f5ecaf8 d9dde0d 0b5bfb4 d9dde0d f5ecaf8 d9dde0d f5ecaf8 d9dde0d f5ecaf8 0b5bfb4 d9dde0d f5ecaf8 d9dde0d 8716c2f f5ecaf8 d9dde0d f5ecaf8 8716c2f d9dde0d f5ecaf8 8716c2f f5ecaf8 8716c2f f5ecaf8 d9dde0d b8a0d2d d9dde0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import os
import re
import subprocess
import sys
import time
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
# Install runtime-only dependencies before the model is loaded.
# `sys.executable -m pip` is used (instead of a bare `pip` shell string) so the
# packages are installed for the interpreter that is running this script, and
# the argument list avoids going through a shell.

# Ensure pyav is installed; abort start-up if this fails.
subprocess.run([sys.executable, "-m", "pip", "install", "pyav"], check=True)

# flash-attn is installed on a best-effort basis (no check=True, as before).
# The flag must be *added* to the inherited environment: passing a bare dict as
# `env` would replace the whole environment and drop PATH, HOME, etc.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)
from io import BytesIO
# Checkpoint used for both the processor and the model weights.
_MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

# Processor: applies the chat template and is also handed to the streamer for decoding.
processor = AutoProcessor.from_pretrained(_MODEL_ID)

# Model: loaded in bfloat16 with FlashAttention-2, then moved to the first CUDA device.
model = AutoModelForImageTextToText.from_pretrained(
    _MODEL_ID,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2",
)
model = model.to("cuda:0")
# File extensions (lower-case) recognised as image / video uploads.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
_VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi", ".mkv", ".flv")

# Capturing group so that re.split keeps the tags themselves in the result.
_MEDIA_TAG_PATTERN = re.compile(r'(<image>|<video>)')


def _media_entry(path):
    """Return the media dict for *path*, or None if its extension is not recognised."""
    lowered = path.lower()  # match ".PNG" / ".MP4" as well
    if lowered.endswith(_IMAGE_EXTENSIONS):
        return {"type": "image", "path": path}
    if lowered.endswith(_VIDEO_EXTENSIONS):
        return {"type": "video", "path": path}
    return None


def _interleave(text, media_queue):
    """Split *text* on <image>/<video> tags and swap each tag for the next queued media.

    Media are consumed from the front of *media_queue* (mutated in place) in
    upload order, regardless of which of the two tags is encountered. A tag
    found when the queue is empty is kept as literal text.
    """
    content = []
    for part in _MEDIA_TAG_PATTERN.split(text):
        if part in ("<image>", "<video>") and media_queue:
            content.append(media_queue.pop(0))
        elif part.strip():
            content.append({"type": "text", "text": part.strip()})
    return content


def _current_user_content(text, files):
    """Build the content list of the message being sent right now."""
    media_queue = [entry for entry in map(_media_entry, files) if entry is not None]
    if "<image>" in text or "<video>" in text:
        content = _interleave(text, media_queue)
    else:
        content = [{"type": "text", "text": text}]
    # Uploads not referenced by a tag are appended after the text so that no
    # uploaded file is silently dropped.
    content.extend(media_queue)
    return content


def _history_messages(history):
    """Convert the chat history into chat-template messages.

    Returns a ``(messages, pending)`` pair: *messages* holds the completed
    user/assistant exchanges, *pending* holds user content that has not been
    answered by an assistant turn yet.
    """
    # First pass: collect every file uploaded in earlier turns, in order.
    # NOTE(review): assumes a file upload appears in history as a user entry
    # whose content is a tuple with the path first -- confirm against the
    # Gradio version in use.
    media_queue = []
    for turn in history:
        if turn["role"] == "user" and isinstance(turn["content"], tuple):
            entry = _media_entry(turn["content"][0])
            if entry is not None:
                media_queue.append(entry)

    # Second pass: rebuild the exchanges, attaching media where tags occur.
    messages = []
    pending = []
    for turn in history:
        content = turn["content"]
        if turn["role"] == "user" and isinstance(content, str):
            pending.extend(_interleave(content, media_queue))
        elif turn["role"] == "assistant":
            messages.append({"role": "user", "content": pending})
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": content}],
            })
            pending = []
    return messages, pending


@spaces.GPU
def model_inference(
    input_dict, history, max_tokens
):
    """Stream the model's reply to the current multimodal chat message.

    Args:
        input_dict: The submitted message; reads ``input_dict["text"]`` and the
            optional ``input_dict["files"]`` list of uploaded file paths.
        history: Earlier turns as a list of ``{"role", "content"}`` dicts
            (empty or None for the first message).
        max_tokens: Upper bound on newly generated tokens, forwarded to
            ``model.generate`` as ``max_new_tokens``.

    Yields:
        ``"..."`` as a placeholder once generation has started, then the
        accumulated reply text after every streamed chunk.

    Raises:
        gr.Error: If the message has no text (with or without attached files).
    """
    text = input_dict["text"].strip()
    files = input_dict.get("files") or []

    # Validate before any GPU work. The errors must be *raised*: merely
    # constructing gr.Error has no effect.
    if text == "" and not files:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and files:
        raise gr.Error("Please input a text query along the images(s).")

    # Earlier exchanges first, then the message being sent now. Any user
    # content left unanswered in the history is merged into the current turn.
    resulting_messages, pending = _history_messages(history or [])
    resulting_messages.append({
        "role": "user",
        "content": pending + _current_user_content(text, files),
    })

    print("resulting_messages", resulting_messages)
    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Generate in a background thread so this generator can forward text to
    # the UI as soon as the streamer receives it.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    yield "..."
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
# Chat UI: a multimodal textbox (text plus image / .mp4 uploads) wired to
# model_inference. The "Max Tokens" slider is the single additional input and
# supplies the function's third argument (max_tokens).
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smollest Video Model Ever 📺",
    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
    type="messages",
)

# Start the app (the stray trailing "|" after this call was removed: it made
# the statement a syntax error).
demo.launch(debug=True, share=True)