Qwen2-VL-2B

Running on Zero

App · Files · Community

Qwen2-VL-2B / app.py

KingNish

Added streaming output and error handling

b9caa33 verified 10 months ago

raw

history blame

4.56 kB

	import gradio as gr
	import spaces
	from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
	from qwen_vl_utils import process_vision_info
	import torch
	from PIL import Image
	import subprocess
	import numpy as np
	import os
	from threading import Thread
	import uuid
	import io

	# Checkpoint shared by the model and its processor; both are loaded a single
	# time when this module is imported.
	MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

	# Half-precision weights, moved to the GPU and switched to inference mode.
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
	)
	model = model.to("cuda").eval()

	processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

	# Markdown shown at the top of the UI.
	DESCRIPTION = "[Qwen2-VL-2B Demo](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)"

	# Extension -> format mapping reported by Pillow (keys include the leading dot).
	image_extensions = Image.registered_extensions()

	# File-name suffixes treated as video (written without the leading dot).
	video_extensions = (
	    "avi", "mp4", "mov", "mkv", "flv", "wmv",
	    "mjpeg", "wav", "gif", "webm", "m4v", "3gp",
	)


	def identify_and_save_blob(blob_path):
	    """Copy a file of unknown type to a new file with a media extension.

	    The file is read fully into memory and checked with Pillow. If Pillow
	    verifies it as an image it is saved with a ``.png`` suffix; otherwise it
	    is assumed to be a video and saved with a ``.mp4`` suffix. The bytes are
	    written unchanged -- only the file name differs from the input.

	    Args:
	        blob_path: Path of the file to inspect.

	    Returns:
	        A ``(filename, media_type)`` tuple. ``filename`` is a newly created
	        file (relative to the current working directory) and ``media_type``
	        is ``"image"`` or ``"video"``.

	    Raises:
	        ValueError: If ``blob_path`` does not exist, or if reading,
	            inspecting or writing the data fails. The underlying exception
	            is attached as ``__cause__``.
	    """
	    # Stage 1: read the input. Kept in its own ``try`` so that the
	    # "not found" message can only ever refer to ``blob_path``.
	    try:
	        with open(blob_path, "rb") as file:
	            blob_content = file.read()
	    except FileNotFoundError as e:
	        raise ValueError(f"The file {blob_path} was not found.") from e
	    except Exception as e:
	        raise ValueError(f"An error occurred while processing the file: {e}") from e

	    # Stage 2: sniff the content and write the renamed copy.
	    try:
	        try:
	            # verify() checks integrity without decoding the pixel data; the
	            # ``with`` block releases the image handle afterwards.
	            with Image.open(io.BytesIO(blob_content)) as img:
	                img.verify()
	            extension = ".png"  # Default to PNG for saving
	            media_type = "image"
	        except (IOError, SyntaxError):
	            # Not a valid image: fall back to treating it as a video.
	            extension = ".mp4"  # Default to MP4 for saving
	            media_type = "video"

	        # uuid4 keeps concurrent requests from overwriting each other.
	        filename = f"temp_{uuid.uuid4()}_media{extension}"
	        with open(filename, "wb") as f:
	            f.write(blob_content)
	    except Exception as e:
	        raise ValueError(f"An error occurred while processing the file: {e}") from e

	    return filename, media_type


	def _resolve_media(media_input):
	    """Classify an uploaded file path as image or video.

	    Args:
	        media_input: Path of the uploaded file.

	    Returns:
	        A ``(media_path, media_type, is_temporary)`` tuple. ``media_type`` is
	        ``"image"`` or ``"video"``. ``is_temporary`` is True when
	        ``media_path`` is a copy created by ``identify_and_save_blob`` rather
	        than the caller's own file.

	    Raises:
	        ValueError: If ``media_input`` is not a non-empty string, or the file
	            cannot be read and classified.
	    """
	    if not isinstance(media_input, str) or not media_input:
	        # Previously this case fell through and crashed on an unbound name.
	        raise ValueError("Unsupported media type. Please upload an image or video.")

	    # Compare the real suffix, case-insensitively, so "photo.JPG" is
	    # recognised and a name merely ending in "mp4" is not.
	    suffix = os.path.splitext(media_input)[1].lower()
	    if suffix in image_extensions:
	        return media_input, "image", False
	    if suffix[1:] in video_extensions:
	        return media_input, "video", False

	    # Unknown or missing extension: sniff the content instead.
	    try:
	        media_path, media_type = identify_and_save_blob(media_input)
	        print(media_path, media_type)
	    except Exception as e:
	        print(e)
	        raise ValueError(
	            "Unsupported media type. Please upload an image or video."
	        ) from e
	    return media_path, media_type, True


	@spaces.GPU
	def qwen_inference(media_input, text_input=None):
	    """Stream the model's answer about an uploaded image or video.

	    Args:
	        media_input: Path of the uploaded image or video file.
	        text_input: The user's question; ``None`` is treated as an empty
	            string.

	    Yields:
	        The accumulated generated text after each new chunk arrives, so each
	        yielded value replaces the previous one.

	    Raises:
	        ValueError: If no usable image or video was provided.
	        Exception: Any exception raised by ``model.generate`` in the worker
	            thread is re-raised here once the stream has ended.
	    """
	    media_path, media_type, is_temporary = _resolve_media(media_input)
	    print(media_path)

	    try:
	        media_entry = {"type": media_type, media_type: media_path}
	        if media_type == "video":
	            media_entry["fps"] = 8.0

	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    media_entry,
	                    {
	                        "type": "text",
	                        "text": text_input if text_input is not None else "",
	                    },
	                ],
	            }
	        ]

	        text = processor.apply_chat_template(
	            messages, tokenize=False, add_generation_prompt=True
	        )
	        image_inputs, video_inputs = process_vision_info(messages)
	        inputs = processor(
	            text=[text],
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	        ).to("cuda")

	        streamer = TextIteratorStreamer(
	            processor, skip_prompt=True, skip_special_tokens=True
	        )
	        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

	        errors = []

	        def _generate():
	            # If generate() fails it never signals end-of-stream, which would
	            # leave the consumer loop below blocked forever. Record the error
	            # and close the stream explicitly.
	            try:
	                model.generate(**generation_kwargs)
	            except Exception as exc:
	                errors.append(exc)
	                streamer.end()

	        thread = Thread(target=_generate)
	        thread.start()

	        buffer = ""
	        for new_text in streamer:
	            buffer += new_text
	            yield buffer

	        # The stream has ended, so the worker is finished or about to be.
	        thread.join()
	        if errors:
	            raise errors[0]
	    finally:
	        # Remove the copy made for extension-less uploads; never touch the
	        # caller's own file.
	        if is_temporary:
	            try:
	                os.remove(media_path)
	            except OSError:
	                pass  # Best-effort cleanup; the file may already be gone.

	# Stylesheet handed to gr.Blocks below.
	# NOTE(review): no component in this block sets elem_id="output", so the
	# "#output" rule appears to match nothing -- confirm before relying on it.
	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""

	# Layout: one tab holding a row of two columns -- inputs on the left,
	# streamed answer on the right.
	with gr.Blocks(css=css) as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Tab(label="Image/Video Input"), gr.Row():
	        with gr.Column():
	            input_media = gr.File(label="Upload Image or Video", type="filepath")
	            text_input = gr.Textbox(label="Question")
	            submit_btn = gr.Button("Submit")
	        with gr.Column():
	            output_text = gr.Textbox(label="Output Text")

	    # Each value yielded by qwen_inference is sent to the output textbox.
	    submit_btn.click(
	        fn=qwen_inference,
	        inputs=[input_media, text_input],
	        outputs=[output_text],
	    )

	demo.launch(debug=True)