import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr


# Load the model and processor once at startup.
def load_model():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float16
    ).to(device)
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    return model, processor


model, processor = load_model()


def process_image(image):
    """Generate a text description for a single PIL image."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # Build the chat prompt and collect the vision inputs expected by the processor.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]


def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    """Sample frames from a video and generate a text description."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0

    # Sample every `frame_interval`-th frame, up to `max_frames` frames in total.
    while len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            # Resize so the longer side is at most `max_resolution`, preserving aspect ratio.
            h, w = frame.shape[:2]
            if h > w:
                new_h, new_w = max_resolution, int(w * max_resolution / h)
            else:
                new_h, new_w = int(h * max_resolution / w), max_resolution
            frame = cv2.resize(frame, (new_w, new_h))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame))
        frame_count += 1
    cap.release()

    if not frames:
        return "Could not read any frames from the video."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": frames},
                {"type": "text", "text": "Describe this video."},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]


def process_content(content):
    """Dispatch an uploaded file to the image or video pipeline based on its extension."""
    if content is None:
        return "Please upload an image or video file."

    # gr.File may return a filepath string (Gradio 4.x) or a tempfile wrapper with a
    # `.name` attribute (Gradio 3.x); handle both cases.
    file_path = content if isinstance(content, str) else content.name

    if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
        return process_image(Image.open(file_path))
    elif file_path.lower().endswith((".mp4", ".avi", ".mov")):
        return process_video(file_path)
    else:
        return "Unsupported file type. Please provide an image or video file."


# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
    title="Image and Video Description",
    description="Upload an image or video to get a description.",
)

if __name__ == "__main__":
    iface.launch()