# vision_v1 / app.py
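"""Gradio app that describes uploaded images and videos using the
Qwen/Qwen2-VL-2B-Instruct vision-language model. Requires a CUDA GPU."""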
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr

# Check GPU availability
if not torch.cuda.is_available():
    raise RuntimeError("This application requires a GPU to run. No GPU detected.")

# Load the model and processor
def load_model():
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.float16  # Use float16 for GPU
        ).to("cuda")  # Explicitly use CUDA
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
        return model, processor
    except RuntimeError as e:
        print(f"Error loading model: {e}")
        raise


try:
    model, processor = load_model()
except Exception as e:
    print(f"Failed to load model: {e}")
    raise


def process_image(image):
    """Generate a text description for a single PIL image."""
    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the image: {str(e)}"


def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    """Sample frames from a video and generate a text description of it."""
    try:
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = 0
        # Collect at most max_frames frames, keeping one every frame_interval frames
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                # Resize so the longer side is at most max_resolution pixels
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)
                frames.append(frame)
            frame_count += 1
        cap.release()
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": "Describe this video."},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the video: {str(e)}"


def process_content(content):
    """Dispatch an uploaded file to the image or video pipeline based on its extension."""
    if content is None:
        return "Please upload an image or video file."
    try:
        if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            return process_image(Image.open(content.name))
        elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
            return process_video(content.name)
        else:
            return "Unsupported file type. Please provide an image or video file."
    except Exception as e:
        return f"An error occurred while processing the content: {str(e)}"


# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
    title="Image and Video Description (GPU Version)",
    description="Upload an image or video to get a description. This application requires GPU computation.",
)

if __name__ == "__main__":
    iface.launch()
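
# Example of exercising the pipeline directly, bypassing the Gradio UI.
# "sample.jpg" is a hypothetical local file, not part of this repo:
#     from PIL import Image
#     print(process_image(Image.open("sample.jpg")))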