import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
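# Install flash-attn at runtime (a common Hugging Face Spaces pattern); FLASH_ATTENTION_SKIP_CUDA_BUILD avoids compiling the CUDA kernels from source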
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
subprocess.run("pip install av", shell=True)  # Install pyav for video processing

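# Load the SmolVLM2 processor and model once at startup; bfloat16 weights plus FlashAttention 2 keep GPU memory usage low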
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
).to("cuda:0")

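# @spaces.GPU requests a ZeroGPU device for the duration of each call on Hugging Face Spaces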
@spaces.GPU
def model_inference(input_dict, history, max_tokens): 
    text = input_dict["text"]
    media_queue = []
    user_content = []
    
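    # Route uploaded files into image or video entries based on their extension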
    for file in input_dict.get("files", []):
        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file})
        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file})

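    # If the prompt contains <image>/<video> placeholders, interleave media at those positions;
    # otherwise append all uploaded media after the text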
    if "<image>" in text or "<video>" in text:
        parts = re.split(r'(<image>|<video>)', text)  
        for part in parts:
            if part == "<image>" and media_queue:
                user_content.append(media_queue.pop(0)) 
            elif part == "<video>" and media_queue:
                user_content.append(media_queue.pop(0))  
            elif part.strip():  
                user_content.append({"type": "text", "text": part.strip()})
    else:
        user_content.append({"type": "text", "text": text})
        user_content.extend(media_queue)

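    # Wrap everything into a single user turn in the chat-template message format
    # (prior chat history is not re-sent to the model)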
    resulting_messages = [{"role": "user", "content": user_content}]
    
    if not text and not media_queue:
        # This function is a generator, so yield the message instead of returning it
        yield "Please provide text and/or media input."
        return
    
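    # apply_chat_template loads the referenced images/videos, builds the prompt, and tokenizes it in one step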
    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    
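    # Stream tokens as they are generated instead of waiting for the full completion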
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
    
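    # Run generation in a background thread so the streamer can be consumed here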
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()
    
    yield "Generating response..."
    buffer = ""
    
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer


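# Multimodal chat UI: the textbox accepts images and .mp4 videos alongside the prompt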
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smallest Video Model Ever 📺", 
    description="Play with SmolVLM2-2.2B-Instruct. Upload an image or video and ask a question.",
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
    type="messages"
)

demo.launch(share=True, debug=True)