from typing import Tuple

import gradio as gr
import numpy as np
import supervision as sv
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

from utils.efficient_sam import load, inference_with_box, inference_with_point
from utils.draw import draw_circle, calculate_dynamic_circle_radius

MARKDOWN = """
# EfficientSAM vs. SAM

This is a demo for ⚔️ SAM Battlegrounds - a speed and accuracy comparison between
[EfficientSAM](https://arxiv.org/abs/2312.00863) and
[SAM](https://arxiv.org/abs/2304.02643).
"""
BOX_EXAMPLES = [
    ['https://media.roboflow.com/efficient-sam/corgi.jpg', 801, 510, 1782, 993],
    ['https://media.roboflow.com/efficient-sam/horses.jpg', 814, 696, 1523, 1183],
    ['https://media.roboflow.com/efficient-sam/bears.jpg', 653, 874, 1173, 1229]
]
POINT_EXAMPLES = [
    ['https://media.roboflow.com/efficient-sam/corgi.jpg', 1291, 751],
    ['https://media.roboflow.com/efficient-sam/horses.jpg', 1168, 939],
    ['https://media.roboflow.com/efficient-sam/bears.jpg', 913, 1051]
]

PROMPT_COLOR = sv.Color.from_hex("#D3D3D3")
MASK_COLOR = sv.Color.from_hex("#FF0000")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load both models once at startup so every request reuses the same weights.
SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE)
SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
EFFICIENT_SAM_MODEL = load(device=DEVICE)

MASK_ANNOTATOR = sv.MaskAnnotator(
    color=MASK_COLOR,
    color_lookup=sv.ColorLookup.INDEX)


def annotate_image_with_box_prompt_result(
    image: np.ndarray,
    detections: sv.Detections,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    # Convert RGB to BGR for annotation, draw the mask and the box prompt,
    # then convert back to RGB for display.
    h, w, _ = image.shape
    bgr_image = image[:, :, ::-1]
    annotated_bgr_image = MASK_ANNOTATOR.annotate(
        scene=bgr_image, detections=detections)
    annotated_bgr_image = sv.draw_rectangle(
        scene=annotated_bgr_image,
        rect=sv.Rect(
            x=x_min,
            y=y_min,
            width=int(x_max - x_min),
            height=int(y_max - y_min),
        ),
        color=PROMPT_COLOR,
        thickness=sv.calculate_dynamic_line_thickness(resolution_wh=(w, h))
    )
    return annotated_bgr_image[:, :, ::-1]


def annotate_image_with_point_prompt_result(
    image: np.ndarray,
    detections: sv.Detections,
    x: int,
    y: int
) -> np.ndarray:
    # Convert RGB to BGR for annotation, draw the mask and the point prompt,
    # then convert back to RGB for display.
    h, w, _ = image.shape
    bgr_image = image[:, :, ::-1]
    annotated_bgr_image = MASK_ANNOTATOR.annotate(
        scene=bgr_image, detections=detections)
    annotated_bgr_image = draw_circle(
        scene=annotated_bgr_image,
        center=sv.Point(x=x, y=y),
        radius=calculate_dynamic_circle_radius(resolution_wh=(w, h)),
        color=PROMPT_COLOR)
    return annotated_bgr_image[:, :, ::-1]


def efficient_sam_box_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    # EfficientSAM takes the box prompt as its two corner points.
    box = np.array([[x_min, y_min], [x_max, y_max]])
    mask = inference_with_box(image, box, EFFICIENT_SAM_MODEL, DEVICE)
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image_with_box_prompt_result(
        image=image,
        detections=detections,
        x_max=x_max,
        x_min=x_min,
        y_max=y_max,
        y_min=y_min
    )


def sam_box_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    input_boxes = [[[x_min, y_min, x_max, y_max]]]
    inputs = SAM_PROCESSOR(
        Image.fromarray(image),
        input_boxes=[input_boxes],
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        outputs = SAM_MODEL(**inputs)

    # Rescale the predicted masks to the original image size and keep the
    # first mask of the first prompt.
    mask = SAM_PROCESSOR.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu()
    )[0][0][0].numpy()
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image_with_box_prompt_result(
        image=image,
        detections=detections,
        x_max=x_max,
        x_min=x_min,
        y_max=y_max,
        y_min=y_min
    )


def box_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    # Run both models on the same box prompt so the results can be compared
    # side by side.
    return (
        efficient_sam_box_inference(image, x_min, y_min, x_max, y_max),
        sam_box_inference(image, x_min, y_min, x_max, y_max)
    )


def efficient_sam_point_inference(image: np.ndarray, x: int, y: int) -> np.ndarray:
    point = np.array([[x, y]])
    mask = inference_with_point(image, point, EFFICIENT_SAM_MODEL, DEVICE)
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image_with_point_prompt_result(
        image=image, detections=detections, x=x, y=y)


def sam_point_inference(image: np.ndarray, x: int, y: int) -> np.ndarray:
    input_points = [[[x, y]]]
    inputs = SAM_PROCESSOR(
        Image.fromarray(image),
        input_points=[input_points],
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        outputs = SAM_MODEL(**inputs)

    mask = SAM_PROCESSOR.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu()
    )[0][0][0].numpy()
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image_with_point_prompt_result(
        image=image, detections=detections, x=x, y=y)


def point_inference(image: np.ndarray, x: int, y: int) -> Tuple[np.ndarray, np.ndarray]:
    return (
        efficient_sam_point_inference(image, x, y),
        sam_point_inference(image, x, y)
    )


def clear(_: np.ndarray) -> Tuple[None, None]:
    # Reset both output images whenever the input image changes.
    return None, None


box_input_image = gr.Image()
x_min_number = gr.Number(label="x_min")
y_min_number = gr.Number(label="y_min")
x_max_number = gr.Number(label="x_max")
y_max_number = gr.Number(label="y_max")
box_inputs = [box_input_image, x_min_number, y_min_number, x_max_number, y_max_number]

point_input_image = gr.Image()
x_number = gr.Number(label="x")
y_number = gr.Number(label="y")
point_inputs = [point_input_image, x_number, y_number]

with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Tab(label="Box prompt"):
        with gr.Row():
            with gr.Column():
                box_input_image.render()
                with gr.Accordion(label="Box", open=False):
                    with gr.Row():
                        x_min_number.render()
                        y_min_number.render()
                        x_max_number.render()
                        y_max_number.render()
            efficient_sam_box_output_image = gr.Image(label="EfficientSAM")
            sam_box_output_image = gr.Image(label="SAM")
        with gr.Row():
            submit_box_inference_button = gr.Button("Submit")
        gr.Examples(
            fn=box_inference,
            examples=BOX_EXAMPLES,
            inputs=box_inputs,
            outputs=[efficient_sam_box_output_image, sam_box_output_image],
        )
    with gr.Tab(label="Point prompt"):
        with gr.Row():
            with gr.Column():
                point_input_image.render()
                with gr.Accordion(label="Point", open=False):
                    with gr.Row():
                        x_number.render()
                        y_number.render()
            efficient_sam_point_output_image = gr.Image(label="EfficientSAM")
            sam_point_output_image = gr.Image(label="SAM")
        with gr.Row():
            submit_point_inference_button = gr.Button("Submit")
        gr.Examples(
            fn=point_inference,
            examples=POINT_EXAMPLES,
            inputs=point_inputs,
            outputs=[efficient_sam_point_output_image, sam_point_output_image],
        )

    submit_box_inference_button.click(
        efficient_sam_box_inference,
        inputs=box_inputs,
        outputs=efficient_sam_box_output_image
    )
    submit_box_inference_button.click(
        sam_box_inference,
        inputs=box_inputs,
        outputs=sam_box_output_image
    )
    submit_point_inference_button.click(
        efficient_sam_point_inference,
        inputs=point_inputs,
        outputs=efficient_sam_point_output_image
    )
    submit_point_inference_button.click(
        sam_point_inference,
        inputs=point_inputs,
        outputs=sam_point_output_image
    )
    box_input_image.change(
        clear,
        inputs=box_input_image,
        outputs=[efficient_sam_box_output_image, sam_box_output_image]
    )
    point_input_image.change(
        clear,
        inputs=point_input_image,
        outputs=[efficient_sam_point_output_image, sam_point_output_image]
    )

demo.launch(debug=False, show_error=True)