from typing import Tuple

import gradio as gr
import numpy as np
import supervision as sv
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

from utils.efficient_sam import load, inference_with_box

# Markdown shown at the top of the demo page (passed to gr.Markdown).
MARKDOWN = """
# EfficientSAM vs. SAM

This is a demo for comparing the performance of 
[EfficientSAM](https://arxiv.org/abs/2312.00863) and 
[SAM](https://arxiv.org/abs/2304.02643).
"""

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SAM model and its processor are loaded from the same checkpoint id.
_SAM_CHECKPOINT = "facebook/sam-vit-huge"
SAM_MODEL = SamModel.from_pretrained(_SAM_CHECKPOINT).to(DEVICE)
SAM_PROCESSOR = SamProcessor.from_pretrained(_SAM_CHECKPOINT)

# EfficientSAM is loaded through the project's utils.efficient_sam helper.
EFFICIENT_SAM_MODEL = load(device=DEVICE)

# Mask and box annotators share the same red styling.
MASK_ANNOTATOR = sv.MaskAnnotator(
    color=sv.Color.red(),
    color_lookup=sv.ColorLookup.INDEX,
)
BOX_ANNOTATOR = sv.BoundingBoxAnnotator(
    color=sv.Color.red(),
    color_lookup=sv.ColorLookup.INDEX,
)


def annotate_image(image: np.ndarray, detections: sv.Detections) -> np.ndarray:
    """Draw the detections' masks and bounding boxes onto an image.

    The channel axis is reversed before annotating and reversed again on the
    way out, so the result uses the same channel order as ``image``
    (presumably RGB in, BGR for the annotators -- confirm against callers).

    Args:
        image: Image array with the channels on the last of three axes.
        detections: Detections whose masks and boxes are drawn.

    Returns:
        The annotated image in the input's channel order.
    """
    canvas = image[:, :, ::-1]
    # Masks first, then boxes, so the box outline sits on top of the overlay.
    for annotator in (MASK_ANNOTATOR, BOX_ANNOTATOR):
        canvas = annotator.annotate(scene=canvas, detections=detections)
    return canvas[:, :, ::-1]


def efficient_sam_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    """Segment the boxed region with EfficientSAM and draw the result.

    Args:
        image: Input image as delivered by the Gradio image component.
        x_min: Left edge of the prompt box.
        y_min: Top edge of the prompt box.
        x_max: Right edge of the prompt box.
        y_max: Bottom edge of the prompt box.

    Returns:
        ``image`` annotated with the predicted mask and the mask's bounding
        box.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Gradio passes None when Submit is clicked with an empty image
        # component; fail with a readable message instead of an opaque
        # error from inside the model helper.
        raise gr.Error("Please provide an input image.")

    # The helper receives the box as two corner points: [[x_min, y_min], [x_max, y_max]].
    box = np.array([[x_min, y_min], [x_max, y_max]])
    mask = inference_with_box(image, box, EFFICIENT_SAM_MODEL, DEVICE)
    # Add a leading axis so the single mask forms a stack of one.
    mask = mask[np.newaxis, ...]
    # The drawn box is derived from the mask, not from the prompt box.
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image(image=image, detections=detections)


def sam_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    """Segment the boxed region with SAM and draw the result.

    Args:
        image: Input image as delivered by the Gradio image component.
        x_min: Left edge of the prompt box.
        y_min: Top edge of the prompt box.
        x_max: Right edge of the prompt box.
        y_max: Bottom edge of the prompt box.

    Returns:
        ``image`` annotated with the predicted mask and the mask's bounding
        box.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Gradio passes None when Submit is clicked with an empty image
        # component; fail with a readable message instead of an opaque
        # error from PIL.
        raise gr.Error("Please provide an input image.")

    # SamProcessor documents boxes as (batch, boxes_per_image, 4): one image,
    # one box. Do not wrap this in another list.
    input_boxes = [[[x_min, y_min, x_max, y_max]]]
    inputs = SAM_PROCESSOR(
        Image.fromarray(image),
        input_boxes=input_boxes,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        outputs = SAM_MODEL(**inputs)

    # Resize the predicted masks back to the original image size, then keep
    # the first entry along each leading axis (image, prompt, candidate mask
    # per the SAM output layout -- confirm if the selection matters).
    mask = SAM_PROCESSOR.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu()
    )[0][0][0].numpy()
    # Add a leading axis so the single mask forms a stack of one.
    mask = mask[np.newaxis, ...]
    # The drawn box is derived from the mask, not from the prompt box.
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image(image=image, detections=detections)


def inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Run both models on the same image and box prompt.

    Args:
        image: Input image.
        x_min: Left edge of the prompt box.
        y_min: Top edge of the prompt box.
        x_max: Right edge of the prompt box.
        y_max: Bottom edge of the prompt box.

    Returns:
        A pair ``(efficient_sam_result, sam_result)`` of annotated images.
    """
    prompt = (image, x_min, y_min, x_max, y_max)
    # EfficientSAM runs first, then SAM.
    efficient_sam_result = efficient_sam_inference(*prompt)
    sam_result = sam_inference(*prompt)
    return efficient_sam_result, sam_result


def clear(_: np.ndarray) -> Tuple[None, None]:
    """Return a pair of ``None`` values, one per output image.

    The argument is ignored; it exists only so the function can be wired to
    an event that supplies the input image.
    """
    efficient_sam_output, sam_output = None, None
    return efficient_sam_output, sam_output


# UI layout: input image and box fields on the left, one output image per
# model next to them, a Submit button below, then clickable examples.
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Tab(label="Box prompt"):
        with gr.Row():
            with gr.Column():
                input_image = gr.Image()
                with gr.Accordion(label="Box", open=False):
                    with gr.Row():
                        x_min_number = gr.Number(label="x_min")
                        y_min_number = gr.Number(label="y_min")
                        x_max_number = gr.Number(label="x_max")
                        y_max_number = gr.Number(label="y_max")
            efficient_sam_output_image = gr.Image(label="EfficientSAM")
            sam_output_image = gr.Image(label="SAM")
        with gr.Row():
            submit_button = gr.Button("Submit")

        # Component lists shared by the examples and the event handlers.
        prompt_components = [
            input_image,
            x_min_number,
            y_min_number,
            x_max_number,
            y_max_number,
        ]
        result_components = [efficient_sam_output_image, sam_output_image]

        # Each example row is: image URL, x_min, y_min, x_max, y_max.
        gr.Examples(
            fn=inference,
            examples=[
                ["https://media.roboflow.com/efficient-sam/beagle.jpeg", 69, 26, 625, 704],
                ["https://media.roboflow.com/efficient-sam/corgi.jpg", 801, 510, 1782, 993],
                ["https://media.roboflow.com/efficient-sam/horses.jpg", 814, 696, 1523, 1183],
                ["https://media.roboflow.com/efficient-sam/bears.jpg", 653, 874, 1173, 1229],
            ],
            inputs=prompt_components,
            outputs=result_components,
        )

    # Submit triggers one handler per model, each writing to its own output.
    for handler, target in (
        (efficient_sam_inference, efficient_sam_output_image),
        (sam_inference, sam_output_image),
    ):
        submit_button.click(handler, inputs=prompt_components, outputs=target)

    # Changing the input image resets both outputs.
    input_image.change(clear, inputs=input_image, outputs=result_components)

demo.launch(debug=False, show_error=True)