import spaces
import subprocess
import sys

# Ensure the package is installed from the Git repository
package_name = "vqasynth"  # Replace with the actual package name if different
git_repo_url = "git+https://github.com/remyxai/VQASynth.git"

try:
    __import__(package_name)
except ImportError:
    print(f"{package_name} not found. Installing from {git_repo_url}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", git_repo_url])

import os
import uuid
import tempfile

import cv2
import open3d as o3d
import PIL
from PIL import Image
from vqasynth.depth import DepthEstimator
from vqasynth.localize import Localizer
from vqasynth.scene_fusion import SpatialSceneConstructor
from vqasynth.prompts import PromptGenerator

import numpy as np
import gradio as gr
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Download the model if it's not already available
    from spacy.cli import download

    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
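
# Instantiate the VQASynth pipeline stages used by this demo: depth estimation,
# object localization and captioning, scene fusion, and prompt generation.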
depth = DepthEstimator(from_onnx=False)
localizer = Localizer()
spatial_scene_constructor = SpatialSceneConstructor()
prompt_generator = PromptGenerator()


def combine_segmented_pointclouds(
    pointcloud_ply_files: list, captions: list, prompts: list, cache_dir: str
):
    """
    Combine two segmented point clouds, chosen by their captions, and return the
    resulting 3D object file along with the prompt that references both captions.

    Args:
        pointcloud_ply_files (list): File paths to the segmented point cloud files.
        captions (list): Captions corresponding to the segmented point clouds.
        prompts (list): Prompts containing questions and answers about the captions.
        cache_dir (str): Directory to save the final `.ply` and `.obj` files.

    Returns:
        tuple: The path to the generated `.obj` file and the selected prompt text.
    """
    selected_prompt = None
    selected_indices = None
    for i, caption1 in enumerate(captions):
        for j, caption2 in enumerate(captions):
            if i != j:
                for prompt in prompts:
                    if caption1 in prompt and caption2 in prompt:
                        selected_prompt = prompt
                        selected_indices = (i, j)
                        break
            if selected_prompt:
                break
        if selected_prompt:
            break

    if not selected_prompt or not selected_indices:
        raise ValueError("No prompt found containing two captions.")

    idx1, idx2 = selected_indices
    pointcloud_files = [pointcloud_ply_files[idx1], pointcloud_ply_files[idx2]]
    captions = [captions[idx1], captions[idx2]]

    combined_point_cloud = o3d.geometry.PointCloud()
    for pointcloud_file in pointcloud_files:
        pcd = o3d.io.read_point_cloud(pointcloud_file)
        if pcd.is_empty():
            continue
        combined_point_cloud += pcd

    if combined_point_cloud.is_empty():
        raise ValueError(
            "Combined point cloud is empty after loading the selected segments."
        )
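
    # Persist the merged cloud as .ply, then convert it to .obj so the Gradio
    # Model3D component can load and display it.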
    uuid_out = str(uuid.uuid4())
    ply_file = os.path.join(cache_dir, f"combined_output_{uuid_out}.ply")
    obj_file = os.path.join(cache_dir, f"combined_output_{uuid_out}.obj")
    o3d.io.write_point_cloud(ply_file, combined_point_cloud)
    mesh = o3d.io.read_triangle_mesh(ply_file)
    o3d.io.write_triangle_mesh(obj_file, mesh)

    return obj_file, selected_prompt


def run_vqasynth_pipeline(image: PIL.Image.Image, cache_dir: str):
    # Estimate a depth map and focal length for the input image
    depth_map, focal_length = depth.run(image)
    # Localize objects: segmentation masks, bounding boxes, and captions
    masks, bounding_boxes, captions = localizer.run(image)
    # Fuse depth and masks into segmented point clouds for the scene
    pointcloud_data, canonicalized = spatial_scene_constructor.run(
        str(0), image, depth_map, focal_length, masks, cache_dir
    )
    # Generate spatial question/answer prompts from the captions and geometry
    prompts = prompt_generator.run(captions, pointcloud_data, canonicalized)
    obj_file, selected_prompt = combine_segmented_pointclouds(
        pointcloud_data, captions, prompts, cache_dir
    )
    return obj_file, selected_prompt


def process_image(image: str):
    # Use a persistent temporary directory so the .obj file stays accessible to Gradio
    temp_dir = tempfile.mkdtemp()
    image = Image.open(image).convert("RGB")
    obj_file, prompt = run_vqasynth_pipeline(image, temp_dir)
    return obj_file, prompt


def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # Synthesizing SpatialVQA Samples with VQASynth
            This Space lets you test the full [VQASynth](https://github.com/remyxai/VQASynth) scene reconstruction pipeline on a single image, with visualizations of the results.

            ### [Github](https://github.com/remyxai/VQASynth) | [Collection](https://huggingface.co/collections/remyxai/spacevlms-66a3dbb924756d98e7aec678)
            """
        )
        gr.Markdown(
            """
            ## Instructions
            Upload an image. The tool will generate a 3D point cloud visualization of the objects it finds, along with an example prompt and response describing a spatial relationship between those objects.
            """
        )

        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="filepath", label="Upload an Image")
                generate_button = gr.Button("Generate")
            with gr.Column():
                model_output = gr.Model3D(label="3D Point Cloud")  # Only used as output
                caption_output = gr.Text(label="Caption")

        generate_button.click(
            process_image, inputs=image_input, outputs=[model_output, caption_output]
        )

        gr.Examples(
            examples=[
                ["./examples/warehouse_rgb.jpg"],
                ["./examples/spooky_doggy.png"],
                ["./examples/bee_and_flower.jpg"],
                ["./examples/gears.png"],
                ["./examples/road-through-dense-forest.jpg"],
            ],
            inputs=image_input,
            label="Example Images",
            examples_per_page=5,
        )

        gr.Markdown(
            """
            ## Citation
            ```
            @article{chen2024spatialvlm,
              title   = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities},
              author  = {Chen, Boyuan and Xu, Zhuo and Kirmani, Sean and Ichter, Brian and Driess, Danny and Florence, Pete and Sadigh, Dorsa and Guibas, Leonidas and Xia, Fei},
              journal = {arXiv preprint arXiv:2401.12168},
              year    = {2024},
              url     = {https://arxiv.org/abs/2401.12168},
            }
            ```
            """
        )

    return demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch(share=True)
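
# Quick local sanity check (a sketch, not part of the Space's launch path):
# assumes the bundled example image below exists and the VQASynth models load.
#
#   obj_path, prompt = process_image("./examples/warehouse_rgb.jpg")
#   print(obj_path, prompt)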