# Gradio + Transformers chat demo for SmolVLM2 (Hugging Face Spaces style script).
# NOTE(review): SOURCE arrived with all newlines collapsed onto a single physical
# line and is TRUNCATED mid-statement at the end — the final line below is an
# unterminated string literal. Only line breaks have been restored here; every
# code token is unchanged, including the truncated fragment. Recover the rest of
# the function from the original file before running.
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
# Runtime installs at import time (a common Spaces pattern): flash-attn is
# installed with its CUDA build skipped via the env flag, and pyav ("av") is
# installed for video decoding. The commands are hard-coded strings, not user
# input, so shell=True is not an injection risk here — but note env={...}
# REPLACES the whole environment for the child process rather than extending it.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
subprocess.run("pip install av", shell=True) # pyav ("av"): video-decoding backend used by the processor
from io import BytesIO
# Load the SmolVLM2-2.2B processor and model once at module import; the model is
# pinned to cuda:0 in bfloat16 with the FlashAttention-2 attention implementation.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", _attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16).to("cuda:0")

@spaces.GPU  # Spaces decorator: a GPU is attached for the duration of each call
def model_inference(input_dict, history, max_tokens):
    """Handle one multimodal chat submission from the Gradio UI.

    Parameters (as used in the visible portion of the code):
        input_dict: dict with a "text" entry (the user's message) and an
            optional "files" list of local file paths (images and/or videos).
        history: chat history supplied by Gradio — not referenced in the
            visible portion; presumably consumed further down (truncated).
        max_tokens: generation budget — not referenced in the visible portion;
            presumably forwarded to model.generate further down (truncated).
    """
    text = input_dict["text"]
    media_queue = []   # uploaded media in submission order, as chat-template content dicts
    user_content = []  # the assembled user-turn content (populated further down; truncated here)
    # Classify each uploaded file by extension into an image or video content item.
    for file in input_dict.get("files", []):
        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file})
        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file})
    # NOTE(review): everything after this point is missing from this view. The
    # empty string test below is almost certainly the residue of inline media
    # placeholder tags (e.g. angle-bracket image/video markers) stripped during
    # extraction — confirm against the original file; as written the final line
    # is an incomplete statement and a syntax error.
    if "" in text or "