import modal
import numpy as np
import supervision as sv
from smolagents import Tool

from modal_apps.app import app
from modal_apps.segment_anything import SegmentAnythingModalApp


def get_detections_from_segment_anything(detections, list_of_masks, iou_scores):
    """Build a new ``sv.Detections`` pairing the input boxes with segmentation output.

    Args:
        detections: Object exposing an ``xyxy`` array of bounding boxes; only
            ``xyxy`` is read, every other field of the input is discarded.
        list_of_masks: Segmentation masks, converted with ``np.array`` and stored
            as ``mask`` (presumably one mask per box, in box order — confirm
            against the producer).
        iou_scores: Scores converted with ``np.array`` and stored as
            ``confidence`` (presumably one score per box — confirm against the
            producer).

    Returns:
        A new ``sv.Detections`` whose ``class_id`` is simply the box index
        (``0 .. n-1``). When there are no boxes, an empty ``sv.Detections``
        without masks is returned.
    """
    bounding_boxes = detections.xyxy.tolist()
    num_boxes = len(bounding_boxes)

    if num_boxes == 0:
        # np.array([]) is 1-D with shape (0,), whereas supervision documents
        # xyxy as (n, 4) and mask as (n, H, W). Build explicitly shaped empty
        # arrays and leave mask unset instead of passing malformed ones.
        return sv.Detections(
            xyxy=np.empty((0, 4)),
            class_id=np.arange(0),
            confidence=np.empty(0),
        )

    return sv.Detections(
        xyxy=np.array(bounding_boxes),
        mask=np.array(list_of_masks),
        class_id=np.arange(num_boxes),
        confidence=np.array(iou_scores),
    )


class SegmentAnythingTool(Tool):
    """Agent tool that produces segmentation masks for already-detected boxes.

    The segmentation is not computed locally: ``forward`` sends the image and
    the bounding boxes to the ``SegmentAnythingModalApp`` Modal class and wraps
    the returned masks and scores in a new ``sv.Detections``.
    """

    name = "segment_anything"
    description = """
        Given an image and an already detected object (a sv.Detections object), segment the image and return masks for each bounding box.
        The image is a PIL image.
        The detections are an object of type sv.Detections, obtainable from the usage of the object_detection tool with task_inference_output_converter.
        
        The output is the same as the input, but with the masks added.
    """

    inputs = {
        "image": {
            "type": "image",
            "description": "The image to segment",
        },
        "detections": {
            "type": "object",
            "description": """
            The detections to segment the image with. 
            The detections are an object of type supervision.Detections.
            """,
        },
    }
    output_type = "object"

    def __init__(self):
        """Initialise the tool and create a handle to the remote Modal class."""
        super().__init__()
        # Look the class up by app name / class name, then instantiate it
        # (the trailing call) so its methods can be invoked remotely.
        self.modal_app = modal.Cls.from_name(app.name, SegmentAnythingModalApp.__name__)()

    def forward(
        self,
        image,
        detections: sv.Detections,
    ):
        """Segment ``image`` inside each bounding box of ``detections``.

        Args:
            image: The image to segment; forwarded as-is to the remote app.
            detections: Detections whose ``xyxy`` boxes are used as prompts.

        Returns:
            A new ``sv.Detections`` built from the boxes, the returned masks and
            the returned IoU scores. If ``detections`` contains no boxes it is
            returned unchanged and no remote call is made.
        """
        bounding_boxes = detections.xyxy.tolist()
        if not bounding_boxes:
            # Nothing to segment: avoid a remote inference round-trip and hand
            # the empty detections back as they came in.
            return detections

        masks, iou_scores = self.modal_app.forward.remote(image=image, bounding_boxes=bounding_boxes)
        return get_detections_from_segment_anything(detections, masks, iou_scores)