# Image and video description demo built on Qwen2-VL-2B-Instruct (GPU only).
# Assumed dependencies: torch, transformers, qwen-vl-utils, opencv-python, pillow, gradio.
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr

# Check GPU availability
if not torch.cuda.is_available():
    raise RuntimeError("This application requires a GPU to run. No GPU detected.")


# Load the model and processor
def load_model():
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.float16,  # Use float16 for GPU
        ).to("cuda")  # Explicitly use CUDA
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
        return model, processor
    except RuntimeError as e:
        print(f"Error loading model: {e}")
        raise


try:
    model, processor = load_model()
except Exception as e:
    print(f"Failed to load model: {e}")
    raise


def process_image(image):
    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)

        # Strip the prompt tokens so only the newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the image: {str(e)}"


def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "An error occurred while processing the video: could not open the video file."

        # Sample up to max_frames frames, one every frame_interval frames,
        # resized so the longer side is at most max_resolution pixels.
        frames = []
        frame_count = 0
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)
                frames.append(frame)
            frame_count += 1
        cap.release()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": "Describe this video."},
                ],
            }
        ]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the video: {str(e)}"


def process_content(content):
    if content is None:
        return "Please upload an image or video file."

    try:
        # gr.File may pass either a file path string or an object exposing .name,
        # depending on the Gradio version; handle both.
        file_path = content if isinstance(content, str) else content.name
        if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
            return process_image(Image.open(file_path))
        elif file_path.lower().endswith((".mp4", ".avi", ".mov")):
            return process_video(file_path)
        else:
            return "Unsupported file type. Please provide an image or video file."
    except Exception as e:
        return f"An error occurred while processing the content: {str(e)}"


# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
    title="Image and Video Description (GPU Version)",
    description="Upload an image or video to get a description. This application requires GPU computation.",
)

if __name__ == "__main__":
    iface.launch()
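
# Optional quick sanity check without the UI, assuming local sample files
# named example.jpg / example.mp4 exist (hypothetical paths, not part of the app):
#   print(process_content("example.jpg"))
#   print(process_content("example.mp4"))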