# Spaces: Segizu / Computer_Vision (status: Sleeping)
#
# File size: 7,167 Bytes

import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
import spaces
import cv2
import numpy as np
import tempfile

def _placeholder_image(message, size=(640, 480)):
    """Return a white RGB PIL image of *size* with *message* centred in black."""
    width, height = size
    canvas = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default(size=40)
    left, top, right, bottom = draw.textbbox((0, 0), message, font=font)
    position = ((width - (right - left)) / 2, (height - (bottom - top)) / 2)
    draw.text(position, message, fill="black", font=font)
    return canvas


def _new_mp4_writer(fps, frame_size):
    """Open an ``mp4v`` VideoWriter on a fresh temp file; return ``(path, writer)``.

    *frame_size* is ``(width, height)``. The temp file is created with
    ``delete=False`` so the path stays valid after this function returns.
    """
    # Only the path is needed; close the Python handle immediately so it is
    # not leaked and so OpenCV can reopen the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        path = tmp.name
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    return path, cv2.VideoWriter(path, fourcc, fps, frame_size)


@spaces.GPU
def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
    """Run a YOLO model on an image or a video and return the annotated result.

    Args:
        input_type: "Image" or "Video"; any other value yields ``(None, None)``.
        image: Input image for the "Image" mode (may be None).
        video: Path of the input video for the "Video" mode (may be None).
        model_id: Weights name/path handed to ``YOLO(...)``.
        conf_threshold: Confidence threshold forwarded to ``predict``.
        iou_threshold: IoU threshold forwarded to ``predict``.
        max_detection: Maximum detections per image forwarded as ``max_det``.

    Returns:
        A 2-tuple ``(annotated_image, annotated_video_path)``. Exactly one
        element is populated for a successful run; the other is None. When
        the required input is missing, a placeholder image (or a one-frame
        placeholder video) carrying an explanatory message is returned.
        ``(None, None)`` is returned when nothing could be produced (unknown
        mode, unreadable/empty video, or no prediction result).
    """
    predict_options = dict(
        conf=conf_threshold,
        iou=iou_threshold,
        imgsz=640,
        # Slider values can arrive as floats; max_det is an integer setting.
        max_det=int(max_detection),
        show_labels=True,
        show_conf=True,
    )

    if input_type == "Image":
        if image is None:
            return _placeholder_image("No image provided"), None

        model = YOLO(model_id)
        results = model.predict(source=image, **predict_options)
        if not results:
            return None, None
        # plot() output is treated as BGR (as the original code did); reverse
        # the channel axis to get the RGB order PIL expects.
        annotated = results[-1].plot()
        return Image.fromarray(annotated[..., ::-1]), None

    if input_type == "Video":
        if video is None:
            placeholder = _placeholder_image("No video provided")
            path, writer = _new_mp4_writer(1, placeholder.size)
            try:
                writer.write(cv2.cvtColor(np.array(placeholder), cv2.COLOR_RGB2BGR))
            finally:
                writer.release()
            return None, path

        model = YOLO(model_id)
        cap = cv2.VideoCapture(video)
        writer = None
        output_path = None
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:
                # Container did not report a usable frame rate.
                fps = 25

            while True:
                ok, frame = cap.read()
                if not ok:
                    break
                rgb_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                results = model.predict(source=rgb_frame, **predict_options)
                # Fall back to the untouched frame if no result came back, so
                # the output keeps the same frame count as the input.
                annotated = results[-1].plot() if results else frame

                if writer is None:
                    # Create the output lazily so an unreadable/empty input
                    # leaves no stray temp file behind. Size comes from the
                    # first annotated frame.
                    height_out, width_out = annotated.shape[:2]
                    output_path, writer = _new_mp4_writer(fps, (width_out, height_out))

                # Frames are streamed straight to disk instead of being
                # buffered, and written as-is: the annotated array is already
                # in the BGR order VideoWriter expects.
                writer.write(annotated)
        finally:
            cap.release()
            if writer is not None:
                writer.release()

        return None, output_path

    return None, None

def update_visibility(input_type):
    """
    Build visibility updates for the image/video inputs and outputs.

    Returns four updates in the order: image input, video input,
    image output, video output. The image pair is visible only when
    input_type is "Image"; the video pair is visible otherwise.
    """
    image_mode = input_type == "Image"
    video_mode = not image_mode
    return (
        gr.update(visible=image_mode),
        gr.update(visible=video_mode),
        gr.update(visible=image_mode),
        gr.update(visible=video_mode),
    )

def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
    """
    Run image-mode inference and pair the result with a radio update.

    Delegates to yolo_inference with input_type fixed to "Image" and no
    video, then returns (update setting a value of "Image", annotated image).
    NOTE(review): the original author describes this as the gr.Examples
    callback; no such wiring is visible in this part of the file.
    """
    # Only the first element (the annotated image) of the result pair is used.
    result_pair = yolo_inference(
        "Image",
        image,
        None,
        model_id,
        conf_threshold,
        iou_threshold,
        max_detection,
    )
    annotated_image, _unused_video = result_pair
    radio_update = gr.update(value="Image")
    return radio_update, annotated_image

with gr.Blocks() as app:
    # Page header.
    gr.Markdown("# Yolo11: Object Detection, Instance Segmentation, Pose/Keypoints, Oriented Detection, Classification")
    gr.Markdown("Upload image(s) or video(s) for inference using the latest Ultralytics YOLO11 models.")

    with gr.Row():
        # Left column: inputs and inference settings.
        with gr.Column():
            image = gr.Image(type="pil", label="Image", visible=True)
            video = gr.Video(label="Video", visible=False)
            input_type = gr.Radio(
                choices=["Image", "Video"],
                value="Image",
                label="Input Type",
            )
            model_id = gr.Dropdown(
                label="Model Name",
                # One group per task suffix (plain, -seg, -pose, -obb, -cls),
                # each listing the n/s/m/l/x sizes in that order.
                choices=[
                    f"yolo11{size}{task}.pt"
                    for task in ("", "-seg", "-pose", "-obb", "-cls")
                    for size in "nsmlx"
                ],
                value="yolo11n.pt",
            )
            conf_threshold = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.25,
                label="Confidence Threshold",
            )
            iou_threshold = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.45,
                label="IoU Threshold",
            )
            max_detection = gr.Slider(
                minimum=1,
                maximum=300,
                step=1,
                value=300,
                label="Max Detection",
            )
            infer_button = gr.Button("Detect Objects")

        # Right column: outputs (only the image output is visible initially).
        with gr.Column():
            output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
            output_video = gr.Video(label="Annotated Video", visible=False)

    # Switching the radio swaps which input/output pair is shown.
    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[image, video, output_image, output_video],
    )

    # The button runs inference with the current settings.
    infer_button.click(
        fn=yolo_inference,
        inputs=[
            input_type,
            image,
            video,
            model_id,
            conf_threshold,
            iou_threshold,
            max_detection,
        ],
        outputs=[output_image, output_video],
    )


# Launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    app.launch()