# Hugging Face Space: whyumesh/vision_v1 (status: Sleeping)
# File size: 5,097 Bytes

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr

# Fail fast when CUDA is unavailable: the model and every input batch in this
# file are moved to "cuda" explicitly.
if not torch.cuda.is_available():
    raise RuntimeError(
        "This application requires a GPU to run. No GPU detected."
    )

# Load the model and processor
def load_model(model_id="Qwen/Qwen2-VL-2B-Instruct"):
    """Load a Qwen2-VL checkpoint and its processor, with the model on CUDA.

    Args:
        model_id: Checkpoint identifier handed to ``from_pretrained`` for both
            the model and the processor, so the two can never drift apart.
            Defaults to the 2B instruct checkpoint.

    Returns:
        A ``(model, processor)`` tuple.

    Raises:
        RuntimeError: Printed and then re-raised unchanged. Any other
            exception type propagates without being printed here.
    """
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,  # Use float16 for GPU
        ).to("cuda")  # Explicitly use CUDA
        processor = AutoProcessor.from_pretrained(model_id)
        return model, processor
    except RuntimeError as e:
        print(f"Error loading model: {e}")
        raise

# Load the model and processor once at import time; they are stored as
# module-level globals that the processing functions read.
try:
    model, processor = load_model()
except Exception as load_error:
    # Report the failure, then re-raise so the module does not finish loading
    # without a model.
    print(f"Failed to load model: {load_error}")
    raise

def process_image(image):
    """Generate a text description of one image with the global model.

    Args:
        image: Value placed in the chat message's ``"image"`` field;
            ``process_content`` passes an opened PIL image.

    Returns:
        The decoded description for the single request, or a string starting
        with "An error occurred while processing the image:" if any step
        raised.
    """
    try:
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image."},
            ],
        }
        conversation = [user_turn]

        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        images, videos = process_vision_info(conversation)

        batch = processor(
            text=[prompt],
            images=images,
            videos=videos,
            padding=True,
            return_tensors="pt",
        )
        batch = batch.to("cuda")  # Explicitly use CUDA

        with torch.no_grad():
            sequences = model.generate(**batch, max_new_tokens=256)

        # Slice off the first len(prompt) ids of each sequence so that only
        # the remaining ids are decoded.
        completions = []
        for prompt_ids, full_ids in zip(batch.input_ids, sequences):
            completions.append(full_ids[len(prompt_ids):])

        decoded = processor.batch_decode(
            completions,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]
    except Exception as err:
        return f"An error occurred while processing the image: {str(err)}"

def _sample_video_frames(video_path, max_frames, frame_interval, max_resolution):
    """Return up to ``max_frames`` RGB PIL frames, one every ``frame_interval`` frames."""
    # Validate before touching the file: a zero interval would otherwise fail
    # with a modulo-by-zero, and a non-positive max_frames would yield an
    # empty frame list.
    if max_frames < 1 or frame_interval < 1 or max_resolution < 1:
        raise ValueError(
            "max_frames, frame_interval and max_resolution must be positive integers"
        )

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"could not open video file: {video_path}")

        frames = []
        frame_count = 0
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                h, w = frame.shape[:2]
                # Scale the longer side to max_resolution, keeping the aspect
                # ratio; clamp the shorter side to at least 1 pixel so very
                # elongated frames do not produce a zero-sized resize target.
                if h > w:
                    new_h = max_resolution
                    new_w = max(1, int(w * max_resolution / h))
                else:
                    new_h = max(1, int(h * max_resolution / w))
                    new_w = max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))

            frame_count += 1
    finally:
        # Always release the capture handle, even if decoding or resizing raised.
        cap.release()

    if not frames:
        raise ValueError("no frames could be read from the video")
    return frames


def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    """Generate a text description of a video with the global model.

    Frames are sampled from the file, downscaled, and sent to the model as a
    list of PIL images in a single ``"video"`` chat entry.

    Args:
        video_path: Path of the video file to open with OpenCV.
        max_frames: Maximum number of sampled frames to send to the model.
        frame_interval: Keep one frame out of every ``frame_interval`` read.
        max_resolution: Target size in pixels of each frame's longer side.

    Returns:
        The decoded description, or a string starting with
        "An error occurred while processing the video:" if any step failed
        (invalid arguments, unreadable file, no frames, model errors).
    """
    try:
        frames = _sample_video_frames(
            video_path, max_frames, frame_interval, max_resolution
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": "Describe this video."},
                ],
            }
        ]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")  # Explicitly use CUDA

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Slice off the first len(prompt) ids of each sequence so that only
        # the remaining ids are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        return output_text[0]
    except Exception as e:
        return f"An error occurred while processing the video: {str(e)}"

def process_content(content):
    """Route an uploaded file to the image or video pipeline by extension.

    Args:
        content: The value delivered by the ``gr.File`` input: ``None`` when
            nothing was uploaded, a filesystem path string, or an object
            whose ``name`` attribute is the path. Which form arrives
            presumably depends on the installed Gradio version and the
            component's ``type`` setting (verify against the deployed
            version), so both are accepted.

    Returns:
        The generated description, a prompt to upload a file, an
        unsupported-type notice, or a string starting with
        "An error occurred while processing the content:" if routing failed.
    """
    if content is None:
        return "Please upload an image or video file."

    try:
        path = content if isinstance(content, str) else content.name
        lowered = path.lower()

        if lowered.endswith(('.png', '.jpg', '.jpeg')):
            # Close the underlying file handle once the description is done.
            with Image.open(path) as image:
                return process_image(image)
        if lowered.endswith(('.mp4', '.avi', '.mov')):
            return process_video(path)
        return "Unsupported file type. Please provide an image or video file."
    except Exception as e:
        return f"An error occurred while processing the content: {str(e)}"

# Gradio interface: one file-upload input routed through process_content,
# with the result shown as plain text.
iface = gr.Interface(
    title="Image and Video Description (GPU Version)",
    description="Upload an image or video to get a description. This application requires GPU computation.",
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()