# Doc-VLMs-OCR / app.py
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
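# Install runtime dependencies for the Space: flash-attn (skipping its CUDA build) and pyav.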
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
subprocess.run("pip install av", shell=True) # Install pyav for video processing
from io import BytesIO
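# Load the SmolVLM2 processor and model (bfloat16, flash attention) onto the first GPU.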
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
).to("cuda:0")
@spaces.GPU
def model_inference(input_dict, history, max_tokens):
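    # Gradio ChatInterface handler: builds a multimodal user message and streams the model's reply.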
    text = input_dict["text"]
    media_queue = []
    user_content = []
    for file in input_dict.get("files", []):
        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file})
        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file})
if "<image>" in text or "<video>" in text:
parts = re.split(r'(<image>|<video>)', text)
for part in parts:
if part == "<image>" and media_queue:
user_content.append(media_queue.pop(0))
elif part == "<video>" and media_queue:
user_content.append(media_queue.pop(0))
elif part.strip():
user_content.append({"type": "text", "text": part.strip()})
else:
user_content.append({"type": "text", "text": text})
user_content.extend(media_queue)
    resulting_messages = [{"role": "user", "content": user_content}]
    if not text and not media_queue:
        yield "Please provide text and/or media input."
        return
    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
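    # Run generation in a background thread and stream tokens as they are produced.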
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()
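    # Show a placeholder, then yield the accumulating buffer as new text arrives.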
yield "Generating response..."
buffer = ""
for new_text in streamer:
buffer += new_text
time.sleep(0.01)
yield buffer
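# Multimodal chat UI: accepts text plus image/video uploads and a max-tokens slider.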
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smallest Video Model Ever 📺",
    description="Play with SmolVLM2-2.2B-Instruct. Upload an image or video and ask a question.",
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
    type="messages",
)
demo.launch(share=True, debug=True)