Spaces:

remyxai
/

SpaceThinker-Qwen2.5VL-3B

Running on Zero

File size: 4,519 Bytes

a8b0636
80b7578
ebd9056
80b7578
 
eaa703f
ebd9056
80b7578
0d09a3a
eaa703f
e19b349
 
80b7578
 
e19b349
 
80b7578
 
 
e19b349
 
 
 
 
 
 
 
 
 
 
 
80b7578
583ea10
 
 
 
80b7578
 
e19b349
 
 
 
 
80b7578
e19b349
 
 
80b7578
 
e19b349
 
 
 
 
 
 
 
0d09a3a
e19b349
0d09a3a
80b7578
e19b349
80b7578
e19b349
707a904
e19b349
80b7578
707a904
80b7578
ebd9056
e19b349
80b7578
 
 
e19b349
 
 
 
 
 
 
80b7578
e19b349
 
 
 
 
 
 
80b7578
ebd9056
e19b349
 
 
80b7578
3e7a2b7
e19b349
3e7a2b7
22fc8c6
bbbd1e2
e19b349
80b7578
 
 
 
707a904
0d09a3a
e19b349
 
80b7578
e19b349
 
0d09a3a
22fc8c6
e19b349
 
 
 
80b7578
 
e19b349
3e7a2b7
ebd9056
 
e19b349
0d09a3a
3e7a2b7
0d09a3a
80b7578

import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from functools import lru_cache

# Repository id handed to from_pretrained() for both the model and the processor.
MODEL_ID = "remyxai/SpaceThinker-Qwen2.5VL-3B"

@lru_cache(maxsize=1)
def _load_model():
    """Return the ``(model, processor)`` pair for ``MODEL_ID``.

    The model is loaded with ``torch.bfloat16`` weights and moved to the
    ``"cuda"`` device; the processor comes from the same repository id.
    ``lru_cache(maxsize=1)`` means the loading work runs only on the first
    call in a process, and later calls receive the same cached tuple.
    """
    vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype=torch.bfloat16
    )
    vlm = vlm.to("cuda")
    proc = AutoProcessor.from_pretrained(MODEL_ID)
    return vlm, proc

@spaces.GPU
def gpu_inference(image_path: str, prompt: str) -> str:
    """Generate the model's answer to ``prompt`` about the image at ``image_path``.

    Args:
        image_path: Filesystem path of the image to open.
        prompt: The user's question, sent as the text part of the user turn.

    Returns:
        The decoded text of the newly generated tokens only (special tokens
        skipped, surrounding whitespace stripped).
    """
    model, processor = _load_model()

    # Open inside a context manager so the file handle is released right
    # away; convert() returns a separate, fully loaded RGB image.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    if image.width > 512:
        ratio = image.height / image.width
        # Clamp to 1px: int() truncation would otherwise yield a zero height
        # for extremely wide images, which resize() cannot handle.
        new_height = max(1, int(512 * ratio))
        image = image.resize((512, new_height), Image.Resampling.LANCZOS)

    # Build conversation
    # NOTE(review): "U+1F914" is sent to the model literally; it looks like a
    # textual stand-in for an emoji -- confirm against the intended prompt.
    system_msg = (
            "You are VL-Thinking U+1F914, a helpful assistant with excellent reasoning ability.\n"
            "A user asks you a question, and you should try to solve it."
            "You should first think about the reasoning process in the mind and then provides the user with the answer.\n"
            "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>."
    )
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": system_msg}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]}
    ]

    # Tokenize and generate
    chat_input = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[chat_input], images=[image], return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=1024)

    # generate() returns the prompt tokens followed by the completion. Drop
    # the prompt by position instead of splitting the decoded text on the
    # word "assistant": that word also occurs inside the system prompt, so a
    # text split would leak the prompt into the reply.
    prompt_len = inputs["input_ids"].shape[1]
    reply_ids = output_ids[:, prompt_len:]
    decoded = processor.batch_decode(
        reply_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return decoded.strip()

# Message handling

def add_message(history, user_input):
    """Append the submitted files and text to the chat history.

    Args:
        history: Current list of message dicts, or ``None`` for a new chat.
        user_input: Mapping with optional ``"files"`` (iterable of file
            entries) and ``"text"`` (str) keys, or ``None`` when there is
            nothing to submit.

    Returns:
        A ``(history, textbox)`` pair: the updated history and a
        ``gr.MultimodalTextbox`` with ``value=None`` to reset the input.
    """
    if history is None:
        history = []
    # Treat a missing payload (None) like an empty submission instead of
    # failing on ``None.get``.
    payload = user_input or {}
    # Each file becomes its own user message whose content is a 1-tuple,
    # which is how inference_interface tells images apart from text.
    for f in payload.get("files") or []:
        history.append({"role": "user", "content": (f,)})
    text = payload.get("text") or ""
    if text:
        history.append({"role": "user", "content": text})
    return history, gr.MultimodalTextbox(value=None)


def inference_interface(history):
    """Run inference on the latest user text and image and append the reply.

    Args:
        history: List of message dicts with ``"role"`` and ``"content"``
            keys. Text messages carry a ``str`` content, image messages a
            tuple whose first element is the image path.

    Returns:
        A ``(history, textbox)`` pair. ``history`` gains one assistant
        message when inference ran; it is returned unchanged when the
        history is empty, when the last message is not from the user, or
        when no user text or no user image can be found.
    """
    reset_box = gr.MultimodalTextbox(value=None)
    if not history:
        return history, reset_box

    # Nothing new to answer: without this guard an empty submission would
    # re-run the previous question and append a duplicate assistant reply.
    if history[-1]["role"] != "user":
        return history, reset_box

    # Last user text
    user_text = next(
        (m["content"] for m in reversed(history)
         if m["role"] == "user" and isinstance(m["content"], str)),
        None
    )
    if user_text is None:
        return history, reset_box

    # Last user image (may come from an earlier turn than the text)
    image_path = next(
        (m["content"][0] for m in reversed(history)
         if m["role"] == "user" and isinstance(m["content"], tuple)),
        None
    )
    if image_path is None:
        return history, reset_box

    # GPU inference
    reply = gpu_inference(image_path, user_text)
    history.append({"role": "assistant", "content": reply})
    return history, reset_box


def build_demo():
    """Assemble the chat UI and return the ``gr.Blocks`` app.

    Layout, top to bottom: a title, the chatbot, the multimodal input box,
    and a row holding the "Send" and clear buttons. Submitting the textbox
    and clicking "Send" are wired identically: ``add_message`` runs first,
    then ``inference_interface``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# SpaceThinker-Qwen2.5VL-3B")
        chatbot = gr.Chatbot([], type="messages", label="Conversation")
        chat_input = gr.MultimodalTextbox(
            interactive=True,
            file_types=["image"],
            placeholder="Enter text and upload an image.",
            show_label=True,
        )
        with gr.Row():
            send_btn = gr.Button("Send")
            gr.ClearButton([chatbot, chat_input])

        # Both triggers share the same two-step pipeline.
        for trigger in (chat_input.submit, send_btn.click):
            queued = trigger(
                add_message, [chatbot, chat_input], [chatbot, chat_input]
            )
            queued.then(inference_interface, [chatbot], [chatbot, chat_input])
    return demo


if __name__ == "__main__":
    # Script entry point: build the Blocks app and start it.
    # launch() is called with share=True.
    demo = build_demo()
    demo.launch(share=True)