Spaces: Running on Zero
File size: 3,182 Bytes
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
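# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips compiling the CUDA kernels during pip install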
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
subprocess.run("pip install av", shell=True) # Install pyav for video processing
from io import BytesIO
# Load the SmolVLM2 processor and model (bfloat16 weights with FlashAttention 2, on GPU)
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
).to("cuda:0")
@spaces.GPU
def model_inference(input_dict, history, max_tokens):
    text = input_dict["text"]
    media_queue = []
    user_content = []

    # Collect uploaded files, tagging each as an image or a video by extension
    for file in input_dict.get("files", []):
        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file})
        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file})

    if "<image>" in text or "<video>" in text:
        # Interleave media at the positions of the <image>/<video> placeholders
        parts = re.split(r"(<image>|<video>)", text)
        for part in parts:
            if part == "<image>" and media_queue:
                user_content.append(media_queue.pop(0))
            elif part == "<video>" and media_queue:
                user_content.append(media_queue.pop(0))
            elif part.strip():
                user_content.append({"type": "text", "text": part.strip()})
    else:
        # No placeholders: put the text first and append all media after it
        user_content.append({"type": "text", "text": text})
        user_content.extend(media_queue)

    resulting_messages = [{"role": "user", "content": user_content}]

    if not text and not media_queue:
        # This function is a generator, so the message must be yielded rather than returned
        yield "Please provide text and/or media input."
        return

    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)  # cast floating-point inputs (pixel values) to the model dtype

    # Generate in a background thread and stream partial output back to the UI
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()
    yield "Generating response..."

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
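
# Illustrative note (assumed example, not part of the app logic): with a prompt like
#   "Compare <image> with <video>"
# and two uploaded files, each placeholder consumes the next uploaded file in order
# (the placeholder name is not checked against the file type), producing roughly
#   [{"type": "text", "text": "Compare"}, {"type": "image", "path": "..."},
#    {"type": "text", "text": "with"}, {"type": "video", "path": "..."}]
# A prompt without placeholders keeps its text first and appends all media after it.
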
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smallest Video Model Ever 📺",
    description="Play with SmolVLM2-2.2B-Instruct. Upload an image or video and ask a question.",
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
    type="messages",
)

demo.launch(share=True, debug=True)