# vision_v1 / app.py
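"""Gradio app that describes uploaded images and videos using the
Qwen/Qwen2-VL-2B-Instruct vision-language model. Requires a CUDA GPU."""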
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr

# Check GPU availability
if not torch.cuda.is_available():
    raise RuntimeError("This application requires a GPU to run. No GPU detected.")

# Load the model and processor
def load_model():
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.float16  # Use float16 for GPU
        ).to("cuda")  # Explicitly use CUDA
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
        return model, processor
    except RuntimeError as e:
        print(f"Error loading model: {e}")
        raise


try:
    model, processor = load_model()
except Exception as e:
    print(f"Failed to load model: {e}")
    raise


def process_image(image):
    """Generate a text description for a single PIL image."""
    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the image: {str(e)}"


def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    """Sample frames from a video and generate a text description of it."""
    try:
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = 0
        # Collect at most max_frames frames, keeping one every frame_interval frames
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                # Resize so the longer side is at most max_resolution pixels
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)
                frames.append(frame)
            frame_count += 1
        cap.release()
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": "Describe this video."},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the video: {str(e)}"


def process_content(content):
    """Dispatch an uploaded file to the image or video pipeline based on its extension."""
    if content is None:
        return "Please upload an image or video file."
    try:
        if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            return process_image(Image.open(content.name))
        elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
            return process_video(content.name)
        else:
            return "Unsupported file type. Please provide an image or video file."
    except Exception as e:
        return f"An error occurred while processing the content: {str(e)}"


# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
    title="Image and Video Description (GPU Version)",
    description="Upload an image or video to get a description. This application requires GPU computation.",
)

if __name__ == "__main__":
    iface.launch()
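
# Example of exercising the pipeline directly, bypassing the Gradio UI.
# "sample.jpg" is a hypothetical local file, not part of this repo:
#     from PIL import Image
#     print(process_image(Image.open("sample.jpg")))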