import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import gradio as gr

# Check GPU availability
if not torch.cuda.is_available():
    raise RuntimeError("This application requires a GPU to run. No GPU detected.")

# Load the model and processor
def load_model():
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.float16,  # Use float16 for GPU
        ).to("cuda")  # Explicitly use CUDA
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
        return model, processor
    except RuntimeError as e:
        print(f"Error loading model: {e}")
        raise

try:
    model, processor = load_model()
except Exception as e:
    print(f"Failed to load model: {e}")
    raise
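
# Each handler below follows the same Qwen2-VL pipeline: wrap the media and
# prompt in a chat message, render it with the processor's chat template,
# extract vision inputs via process_vision_info, then generate and decode
# only the newly produced tokens.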
def process_image(image):
    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the image: {str(e)}"
def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    try:
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = 0
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                # Resize so the longest side is max_resolution, preserving aspect ratio
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV reads BGR; PIL expects RGB
                frame = Image.fromarray(frame)
                frames.append(frame)
            frame_count += 1
        cap.release()
        if not frames:
            return "Could not read any frames from the video."
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": "Describe this video."},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the video: {str(e)}"
def process_content(content):
    if content is None:
        return "Please upload an image or video file."
    try:
        # gr.File may hand back a plain path string or a file-like object
        # with a .name attribute, depending on the Gradio version
        path = content if isinstance(content, str) else content.name
        if path.lower().endswith(('.png', '.jpg', '.jpeg')):
            return process_image(Image.open(path))
        elif path.lower().endswith(('.mp4', '.avi', '.mov')):
            return process_video(path)
        else:
            return "Unsupported file type. Please provide an image or video file."
    except Exception as e:
        return f"An error occurred while processing the content: {str(e)}"

# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
    title="Image and Video Description (GPU Version)",
    description="Upload an image or video to get a description. This application requires GPU computation.",
)

if __name__ == "__main__":
    iface.launch()
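
# launch() serves on localhost by default; Gradio's server_name="0.0.0.0"
# option can expose the app on the local network if needed.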