import modal
from smolagents import Tool

from modal_apps.app import app
from modal_apps.inference_pipeline import InferencePipelineModalApp


class ObjectDetectionTool(Tool):
    """Agent tool that runs object detection through the Modal inference pipeline.

    The class attributes below (``name``, ``description``, ``inputs``,
    ``output_type``) are the tool metadata; the keys of ``inputs`` mirror the
    parameters of :meth:`forward`.
    """

    name = "object_detection"
    description = """
        Given an image, detect objects and return bounding boxes.
        The image is a PIL image.
        The output is a list of dictionaries containing the bounding boxes with the following keys:
        - box: a dictionary with the following keys:
            - xmin: a number
            - ymin: a number
            - xmax: a number
            - ymax: a number
        - score: a number between 0 and 1
        - label: a string
        You need to provide the model name to use for object detection.
        The tool returns a list of bounding boxes for all the objects in the image.
        You also need to provide a score threshold to filter the bounding boxes.
    """

    inputs = {
        "image": {
            "type": "image",
            "description": "The image to detect objects in",
        },
        "model_name": {
            "type": "string",
            "description": "The name of the model to use for object detection",
        },
        "threshold": {
            "type": "number",
            "description": "The score threshold of the bounding boxes to return",
        },
    }
    output_type = "object"

    def __init__(self):
        """Initialise the base tool and bind an instance of the Modal pipeline class."""
        super().__init__()
        # Look the pipeline class up by app name and class name, then
        # instantiate it so its methods can be invoked from ``forward``.
        pipeline_cls = modal.Cls.from_name(
            app.name,
            InferencePipelineModalApp.__name__,
        )
        self.modal_app = pipeline_cls()

    def forward(
        self,
        image,
        model_name: str,
        threshold: float,
    ):
        """Run object detection remotely and return the detections.

        Args:
            image: The image to detect objects in.
            model_name: Name of the model to use for object detection.
            threshold: Score threshold forwarded to the remote pipeline; no
                additional filtering is done locally.

        Returns:
            Whatever the remote ``forward`` call returns, unchanged. Each item
            is expected to expose ``label``, ``score`` and ``box`` keys, which
            are read when logging below.
        """
        detections = self.modal_app.forward.remote(
            model_name=model_name,
            task="object-detection",
            image=image,
            threshold=threshold,
        )

        # Emit one line per detection before handing the results back.
        for detection in detections:
            print(
                f"Found bounding box of {detection['label']} "
                f"with score: {detection['score']} "
                f"at box: {detection['box']}"
            )

        return detections