# Spaces:

# SkalskiP
# /

# EfficientSAM

# Running on CPU Upgrade

# File size: 5,162 Bytes

from typing import Tuple

import gradio as gr
import numpy as np
import supervision as sv
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

from utils.efficient_sam import load, inference_with_box

# Heading and introduction rendered at the top of the demo page.
MARKDOWN = """
# EfficientSAM vs. SAM

This is a demo for comparing the performance of 
[EfficientSAM](https://arxiv.org/abs/2312.00863) and 
[SAM](https://arxiv.org/abs/2304.02643).
"""

# Run on the GPU when CUDA is available, otherwise on the CPU.
_DEVICE_NAME = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_DEVICE_NAME)

# The SAM model and its processor are loaded from the same checkpoint id;
# only the model is moved to DEVICE.
_SAM_CHECKPOINT = "facebook/sam-vit-huge"
SAM_MODEL = SamModel.from_pretrained(_SAM_CHECKPOINT).to(DEVICE)
SAM_PROCESSOR = SamProcessor.from_pretrained(_SAM_CHECKPOINT)

# EfficientSAM is created by the project-local `load` helper.
EFFICIENT_SAM_MODEL = load(device=DEVICE)

# Both annotators draw in red and look colors up by detection index.
MASK_ANNOTATOR = sv.MaskAnnotator(
    color_lookup=sv.ColorLookup.INDEX,
    color=sv.Color.red(),
)
BOX_ANNOTATOR = sv.BoundingBoxAnnotator(
    color_lookup=sv.ColorLookup.INDEX,
    color=sv.Color.red(),
)


def annotate_image(image: np.ndarray, detections: sv.Detections) -> np.ndarray:
    """Draw the masks and boxes of `detections` onto an image.

    The channel axis is reversed before the annotators run and reversed
    again on the way out, so the result uses the same channel order as
    `image` (the local names indicate RGB in, BGR for the annotators).

    Args:
        image: Image array indexed as (row, column, channel). It is left
            untouched.
        detections: Detections to draw; masks are drawn first, then boxes.

    Returns:
        The annotated image, in the same channel order as `image`.
    """
    # Reversing the channel axis only creates a negatively-strided view onto
    # the caller's buffer. Take a real copy so an annotator that draws in
    # place can neither alter the caller's image (which may still be needed
    # for another model) nor receive a memory layout it cannot write to.
    bgr_image = image[:, :, ::-1].copy()
    annotated_bgr_image = MASK_ANNOTATOR.annotate(
        scene=bgr_image, detections=detections)
    annotated_bgr_image = BOX_ANNOTATOR.annotate(
        scene=annotated_bgr_image, detections=detections)
    # Flip the channels back for the caller.
    return annotated_bgr_image[:, :, ::-1]


def efficient_sam_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    """Segment a box-prompted region with EfficientSAM and draw the result.

    Args:
        image: Input image array, forwarded unchanged to the model helper
            and to `annotate_image`.
        x_min: Left edge of the prompt box.
        y_min: Top edge of the prompt box.
        x_max: Right edge of the prompt box.
        y_max: Bottom edge of the prompt box.

    Returns:
        The image returned by `annotate_image` for the predicted mask.
    """
    # The helper receives the box as two corner points:
    # [[x_min, y_min], [x_max, y_max]].
    top_left = [x_min, y_min]
    bottom_right = [x_max, y_max]
    box_corners = np.array([top_left, bottom_right])

    predicted_mask = inference_with_box(
        image, box_corners, EFFICIENT_SAM_MODEL, DEVICE)

    # Add a leading axis so the single mask becomes a stack of one mask.
    mask_stack = predicted_mask[None, ...]
    enclosing_boxes = sv.mask_to_xyxy(masks=mask_stack)
    detections = sv.Detections(xyxy=enclosing_boxes, mask=mask_stack)
    return annotate_image(image=image, detections=detections)


def sam_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    """Segment a box-prompted region with SAM and draw the result.

    Args:
        image: Input image array; it is converted to a PIL image for the
            processor and passed unchanged to `annotate_image`.
        x_min: Left edge of the prompt box.
        y_min: Top edge of the prompt box.
        x_max: Right edge of the prompt box.
        y_max: Bottom edge of the prompt box.

    Returns:
        The image returned by `annotate_image` for the selected mask.
    """
    # The processor documents `input_boxes` as (batch_size, boxes_per_image,
    # 4): one image, one box, four coordinates. Pass exactly that nesting
    # rather than relying on the processor to flatten an extra level.
    input_boxes = [[[x_min, y_min, x_max, y_max]]]
    inputs = SAM_PROCESSOR(
        Image.fromarray(image),
        input_boxes=input_boxes,
        return_tensors="pt"
    ).to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = SAM_MODEL(**inputs)

    # Post-processing runs on CPU tensors. The [0][0][0] index keeps a single
    # mask - presumably first image, first box, first candidate mask; confirm
    # against the transformers documentation.
    mask = SAM_PROCESSOR.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu()
    )[0][0][0].numpy()
    # Add a leading axis so the single mask becomes a stack of one mask.
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image(image=image, detections=detections)


def inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Run both models on the same image and box prompt.

    EfficientSAM is evaluated first, then SAM.

    Returns:
        A pair `(efficient_sam_result, sam_result)` of annotated images.
    """
    box = (x_min, y_min, x_max, y_max)
    efficient_sam_result = efficient_sam_inference(image, *box)
    sam_result = sam_inference(image, *box)
    return efficient_sam_result, sam_result


def clear(_: np.ndarray) -> Tuple[None, None]:
    """Return a pair of `None` values, ignoring the argument.

    It is registered on the input image's change event, with the two output
    images as targets.
    """
    cleared_outputs = (None, None)
    return cleared_outputs


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Tab(label="Box prompt"):
        with gr.Row():
            # Left column: the input image plus a collapsed panel holding
            # the four box coordinates.
            with gr.Column():
                input_image = gr.Image()
                with gr.Accordion(label="Box", open=False), gr.Row():
                    x_min_number = gr.Number(label="x_min")
                    y_min_number = gr.Number(label="y_min")
                    x_max_number = gr.Number(label="x_max")
                    y_max_number = gr.Number(label="y_max")
            # One output image per model, placed in the same row.
            efficient_sam_output_image = gr.Image(label="EfficientSAM")
            sam_output_image = gr.Image(label="SAM")
        with gr.Row():
            submit_button = gr.Button("Submit")

        # Component groups shared by the examples and the event handlers.
        # Each consumer gets its own fresh list built from these tuples.
        box_prompt_components = (
            input_image,
            x_min_number,
            y_min_number,
            x_max_number,
            y_max_number,
        )
        output_components = (efficient_sam_output_image, sam_output_image)

        # Each example row is: image URL, x_min, y_min, x_max, y_max.
        gr.Examples(
            fn=inference,
            examples=[
                ['https://media.roboflow.com/efficient-sam/beagle.jpeg', 69, 26, 625, 704],
                ['https://media.roboflow.com/efficient-sam/corgi.jpg', 801, 510, 1782, 993],
                ['https://media.roboflow.com/efficient-sam/horses.jpg', 814, 696, 1523, 1183],
                ['https://media.roboflow.com/efficient-sam/bears.jpg', 653, 874, 1173, 1229],
            ],
            inputs=[*box_prompt_components],
            outputs=[*output_components],
        )

    # Submit registers one handler per model, each writing to its own
    # output image.
    submit_button.click(
        efficient_sam_inference,
        inputs=[*box_prompt_components],
        outputs=efficient_sam_output_image,
    )
    submit_button.click(
        sam_inference,
        inputs=[*box_prompt_components],
        outputs=sam_output_image,
    )
    # `clear` returns (None, None) for the two output images whenever the
    # input image changes.
    input_image.change(
        clear,
        inputs=input_image,
        outputs=[*output_components],
    )

demo.launch(debug=False, show_error=True)