# Gradio + Transformers chat demo for SmolVLM2 (Hugging Face Spaces style script).
# NOTE(review): SOURCE arrived with all newlines collapsed onto a single physical
# line and is TRUNCATED mid-statement at the end — the final line below is an
# unterminated string literal. Only line breaks have been restored here; every
# code token is unchanged, including the truncated fragment. Recover the rest of
# the function from the original file before running.
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
# Runtime installs at import time (a common Spaces pattern): flash-attn is
# installed with its CUDA build skipped via the env flag, and pyav ("av") is
# installed for video decoding. The commands are hard-coded strings, not user
# input, so shell=True is not an injection risk here — but note env={...}
# REPLACES the whole environment for the child process rather than extending it.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
subprocess.run("pip install av", shell=True) # pyav ("av"): video-decoding backend used by the processor
from io import BytesIO
# Load the SmolVLM2-2.2B processor and model once at module import; the model is
# pinned to cuda:0 in bfloat16 with the FlashAttention-2 attention implementation.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", _attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16).to("cuda:0")

@spaces.GPU  # Spaces decorator: a GPU is attached for the duration of each call
def model_inference(input_dict, history, max_tokens):
    """Handle one multimodal chat submission from the Gradio UI.

    Parameters (as used in the visible portion of the code):
        input_dict: dict with a "text" entry (the user's message) and an
            optional "files" list of local file paths (images and/or videos).
        history: chat history supplied by Gradio — not referenced in the
            visible portion; presumably consumed further down (truncated).
        max_tokens: generation budget — not referenced in the visible portion;
            presumably forwarded to model.generate further down (truncated).
    """
    text = input_dict["text"]
    media_queue = []   # uploaded media in submission order, as chat-template content dicts
    user_content = []  # the assembled user-turn content (populated further down; truncated here)
    # Classify each uploaded file by extension into an image or video content item.
    for file in input_dict.get("files", []):
        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file})
        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file})
    # NOTE(review): everything after this point is missing from this view. The
    # empty string test below is almost certainly the residue of inline media
    # placeholder tags (e.g. angle-bracket image/video markers) stripped during
    # extraction — confirm against the original file; as written the final line
    # is an incomplete statement and a syntax error.
    if "" in text or "