stevenbucaille committed on
Commit 7e327f2 · 1 Parent(s): 4fbddf5

Add initial project structure with core functionality for image processing agents


- Created .gitignore to exclude Python-generated files and virtual environments.
- Added .python-version to specify Python version 3.10.
- Implemented main LLM functionality in llm.py.
- Defined project metadata and dependencies in pyproject.toml and requirements.txt.
- Developed image processing tools including bounding box drawing, cropping, and upscaling.
- Integrated object detection and model retrieval capabilities in remote tools.
- Established dataset creation and knowledge base preparation scripts.
- Set up initial Modal application structure for remote processing.

.gitignore ADDED
@@ -0,0 +1,12 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ .env
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
agents/all_agents.py ADDED
@@ -0,0 +1,37 @@
+ from smolagents import CodeAgent, LogLevel
+ from remote_tools.rag_tool import RemoteObjectDetectionModelRetrieverTool
+ from tools.bbox_drawing_tool import BBoxDrawingTool
+ from tools.cropping_tool import CroppingTool
+ from remote_tools.object_detection_tool import RemoteObjectDetectionTool
+ from remote_tools.upscaler import RemoteUpscalerTool
+
+
+ def get_master_agent(llm):
+     description = """
+     You are an agent that can perform tasks on an image.
+     You can use the following tools to perform tasks on an image:
+     - object_detection_tool: to detect objects in an image, you must provide the image to the agents.
+     - object_detection_model_retriever: to retrieve object detection models, you must provide the type of class that a model can detect.
+
+     If you don't know what model to use, you can use the object_detection_model_retriever tool to retrieve the model.
+     Never assume an invented model name, always use the model name provided by the object_detection_model_retriever tool.
+     Use batching to perform tasks on multiple images at once when a tool supports it.
+     You have access to the variable "image" which is the image to perform tasks on, no need to load it, it is already loaded.
+     You can also use opencv to draw the bounding boxes on the image.
+     Always use the variable "image" to draw the bounding boxes on the image.
+     """
+     master_agent = CodeAgent(
+         name="master_agent",
+         description=description,
+         model=llm,
+         tools=[
+             RemoteObjectDetectionTool(),
+             BBoxDrawingTool(),
+             CroppingTool(),
+             RemoteUpscalerTool(),
+             RemoteObjectDetectionModelRetrieverTool(),
+         ],
+         verbosity_level=LogLevel.DEBUG,
+     )
+     print("Loaded master agent")
+     return master_agent
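
A brief usage sketch (not part of the commit) of how the master agent is presumably wired up with the model from llm.py. The image path is purely illustrative; the smolagents additional_args mechanism is used here to pass the preloaded "image" variable that the prompt promises:

    from PIL import Image

    from agents.all_agents import get_master_agent
    from llm import get_default_model

    agent = get_master_agent(get_default_model())
    image = Image.open("example.jpg")  # hypothetical input image
    # The agent prompt refers to a preloaded "image" variable, so inject it here.
    result = agent.run(
        "Detect all the cats in the image and draw their bounding boxes.",
        additional_args={"image": image},
    )
    print(result)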
llm.py ADDED
@@ -0,0 +1,22 @@
+ from smolagents import OpenAIServerModel, LiteLLMModel
+ import os
+
+ LOCAL_LLM_SETTINGS = {
+     "api_base": "http://127.0.0.1:1234/v1",
+     "api_key": "api-key",
+     "model_id": "gemma-3-12b-it-qat",
+ }
+
+ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+
+ assert ANTHROPIC_API_KEY is not None, "ANTHROPIC_API_KEY is not set"
+
+
+ def get_default_model():
+     model = LiteLLMModel(
+         model_id="claude-3-7-sonnet-20250219",
+         api_key=os.getenv("ANTHROPIC_API_KEY"),
+         reasoning_effort="low",
+     )
+     print("Loaded LLM model")
+     return model
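
OpenAIServerModel and LOCAL_LLM_SETTINGS are imported and defined but never used by get_default_model; they appear intended for a local OpenAI-compatible server (e.g. LM Studio on port 1234). A minimal sketch of such a helper, under that assumption:

    def get_local_model():
        # Hypothetical helper (not in the commit): target the local
        # OpenAI-compatible endpoint from LOCAL_LLM_SETTINGS instead of Anthropic.
        return OpenAIServerModel(
            model_id=LOCAL_LLM_SETTINGS["model_id"],
            api_base=LOCAL_LLM_SETTINGS["api_base"],
            api_key=LOCAL_LLM_SETTINGS["api_key"],
        )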
pyproject.toml ADDED
@@ -0,0 +1,32 @@
+ [project]
+ name = "image-processing-agent"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "accelerate>=1.7.0",
+     "datasets>=3.6.0",
+     "diffusers>=0.33.1",
+     "faiss-cpu>=1.11.0",
+     "faiss-gpu>=1.7.2",
+     "gradio>=5.33.0",
+     "hf-transfer>=0.1.9",
+     "huggingface-hub[cli]>=0.32.4",
+     "langchain>=0.3.25",
+     "langchain-community>=0.3.24",
+     "langchain-huggingface>=0.2.0",
+     "langchain-openai>=0.3.19",
+     "matplotlib>=3.10.3",
+     "modal>=1.0.3",
+     "opencv-python>=4.11.0.86",
+     "pandas>=2.3.0",
+     "rank-bm25>=0.2.2",
+     "safetensors>=0.5.3",
+     "scipy>=1.15.3",
+     "sentence-transformers>=4.1.0",
+     "smolagents[litellm,openai]>=1.17.0",
+     "timm>=1.0.15",
+     "torch>=2.7.1",
+     "transformers>=4.52.4",
+ ]
rag/__init__.py ADDED
File without changes
rag/create_dataset.py ADDED
@@ -0,0 +1,81 @@
+ import json
+ from smolagents import Tool
+ from huggingface_hub import HfApi, hf_hub_download, ModelCard
+ from datasets import Dataset, Features, Value
+
+
+ def get_model_ids(pipeline_tag: str) -> list[str]:
+     hf_api = HfApi()
+     models = hf_api.list_models(
+         library=["transformers"],
+         pipeline_tag=pipeline_tag,
+         gated=False,
+         fetch_config=True,
+     )
+     models = list(models)
+     model_ids = [model.id for model in models]
+     return model_ids
+
+
+ def get_model_card(model_id: str) -> str:
+     try:
+         model_card = ModelCard.load(model_id)
+         return model_card.text
+     except Exception as e:
+         return ""
+
+
+ def get_model_labels(model_id: str) -> list[str]:
+     hf_api = HfApi()
+     if hf_api.file_exists(model_id, filename="config.json"):
+         config_path = hf_hub_download(model_id, filename="config.json")
+         with open(config_path, "r") as f:
+             try:
+                 model_config = json.load(f)
+             except json.JSONDecodeError:
+                 return [""]
+             if "id2label" in model_config:
+                 labels = list(model_config["id2label"].values())
+                 labels = [str(label).lower() for label in labels]
+                 return labels
+             else:
+                 return [""]
+     else:
+         return [""]
+
+
+ def create_dataset(pipeline_tag: str):
+     def dataset_gen(model_ids: list[str]):
+         for model_id in model_ids:
+             model_card = get_model_card(model_id)
+             model_labels = get_model_labels(model_id)
+             if len(model_labels) > 1 and len(model_card) > 0:
+                 yield {
+                     "model_id": model_id,
+                     "model_card": model_card,
+                     "model_labels": model_labels,
+                 }
+
+     model_ids = get_model_ids(pipeline_tag)
+
+     dataset = Dataset.from_generator(
+         dataset_gen,
+         gen_kwargs={"model_ids": model_ids},
+         features=Features(
+             {
+                 "model_id": Value("string"),
+                 "model_card": Value("string"),
+                 "model_labels": [Value("string")],
+             }
+         ),
+         num_proc=12,
+     )
+
+     return dataset
+
+
+ if __name__ == "__main__":
+     dataset = create_dataset("object-detection")
+     print(dataset)
+     dataset.push_to_hub("stevenbucaille/object-detection-models-dataset")
+     # dataset.push_to_hub("stevenbucaille/object-detection-models-dataset")
rag/prepare_knowledge_base.py ADDED
@@ -0,0 +1,56 @@
+ import datasets
+ from langchain_core.documents import Document
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ import faiss
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from rag.settings import get_embeddings_model
+
+
+ def get_vector_store():
+     embeddings = get_embeddings_model()
+     index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
+
+     vector_store = FAISS(
+         embedding_function=embeddings,
+         index=index,
+         docstore=InMemoryDocstore(),
+         index_to_docstore_id={},
+     )
+     return vector_store
+
+
+ def get_docs(dataset):
+     source_docs = [
+         Document(
+             page_content=model["model_card"],
+             metadata={
+                 "model_id": model["model_id"],
+                 "model_labels": model["model_labels"],
+             },
+         )
+         for model in dataset
+     ]
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,  # Characters per chunk
+         chunk_overlap=50,  # Overlap between chunks to maintain context
+         add_start_index=True,
+         strip_whitespace=True,
+         separators=["\n\n", "\n", ".", " ", ""],  # Priority order for splitting
+     )
+     docs_processed = text_splitter.split_documents(source_docs)
+     print(f"Knowledge base prepared with {len(docs_processed)} document chunks")
+     return docs_processed
+
+
+ if __name__ == "__main__":
+     dataset = datasets.load_dataset(
+         "stevenbucaille/object-detection-models-dataset", split="train"
+     )
+     docs_processed = get_docs(dataset)
+     vector_store = get_vector_store()
+     vector_store.add_documents(docs_processed)
+     vector_store.save_local(
+         folder_path="vector_store",
+         index_name="object_detection_models_faiss_index",
+     )
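
Note that this script saves the FAISS index locally under vector_store/, while remote_tools/rag_tool.py loads it from /volume/vector_store on the Modal volume named "hackathon". The upload step is not part of this commit; presumably the local index is copied onto the volume once before deploying the retriever, for example with something like `modal volume put hackathon vector_store /vector_store` (an assumption about the intended workflow).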
rag/settings.py ADDED
@@ -0,0 +1,22 @@
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+
+ def get_embeddings_model():
+     embeddings = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         model_kwargs={"device": "cuda"},
+         encode_kwargs={"normalize_embeddings": True},
+         show_progress=True,
+     )
+     print("Loaded embeddings model")
+     return embeddings
+
+
+ def get_vector_store():
+     return FAISS.load_local(
+         folder_path="vector_store",
+         embeddings=get_embeddings_model(),
+         index_name="object_detection_models_faiss_index",
+         allow_dangerous_deserialization=True,
+     )
remote_tools/app.py ADDED
@@ -0,0 +1,5 @@
+ import modal
+
+ from .image import image
+
+ app = modal.App("image-agent-tools", image=image)
remote_tools/deploy.py ADDED
@@ -0,0 +1,10 @@
+ import modal
+
+ from .app import app
+ from .object_detection_tool import app as object_detection_tool_app
+ from .upscaler import app as upscaler_tool_app
+ from .rag_tool import app as rag_tool_app
+
+ app.include(object_detection_tool_app)
+ app.include(upscaler_tool_app)
+ app.include(rag_tool_app)
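
With the sub-apps folded into the root app, the whole remote tool suite can presumably be deployed in one step from the repository root, e.g. with something like `modal deploy -m remote_tools.deploy` (module syntax, so the package-relative imports resolve). This is an assumption about the intended workflow; the commit itself does not document a deploy command.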
remote_tools/image.py ADDED
@@ -0,0 +1,28 @@
+ import modal
+
+ cuda_version = "12.4.0"  # should be no greater than host CUDA version
+ flavor = "devel"  # includes full CUDA toolkit
+ operating_sys = "ubuntu22.04"
+ tag = f"{cuda_version}-{flavor}-{operating_sys}"
+
+ cuda_dev_image = modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
+
+ image = (
+     cuda_dev_image.apt_install(
+         "git",
+         "libglib2.0-0",
+         "libsm6",
+         "libxrender1",
+         "libxext6",
+         "ffmpeg",
+         "libgl1",
+     )
+     .add_local_file("requirements.txt", "/app_requirements.txt", copy=True)
+     .run_commands(
+         [
+             "cat /app_requirements.txt",
+             "uv pip install --system --requirement /app_requirements.txt",
+         ]
+     )
+     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
+ )
remote_tools/object_detection_tool.py ADDED
@@ -0,0 +1,86 @@
+ import modal
+ from transformers import AutoModelForObjectDetection, AutoImageProcessor
+ import torch
+ from smolagents import Tool
+
+ from .app import app
+ from .image import image
+
+
+ @app.cls(gpu="T4", image=image)
+ class RemoteObjectDetectionModalApp:
+     model_name: str = modal.parameter()
+
+     @modal.method()
+     def forward(self, image):
+         self.model = AutoModelForObjectDetection.from_pretrained(self.model_name)
+         self.processor = AutoImageProcessor.from_pretrained(self.model_name)
+         self.model.eval()
+
+         # Preprocess image
+         inputs = self.processor(images=image, return_tensors="pt")
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
+         results = self.processor.post_process_object_detection(
+             outputs, target_sizes=target_sizes, threshold=0.5
+         )[0]
+
+         boxes = []
+         for score, label, box in zip(
+             results["scores"], results["labels"], results["boxes"]
+         ):
+             boxes.append(
+                 {
+                     "box": box.tolist(),  # [xmin, ymin, xmax, ymax]
+                     "score": score.item(),
+                     "label": self.model.config.id2label[label.item()],
+                 }
+             )
+         return boxes
+
+
+ class RemoteObjectDetectionTool(Tool):
+     name = "object_detection"
+     description = """
+     Given an image, detect objects and return bounding boxes.
+     The image is a PIL image.
+     The output is a list of dictionaries containing the bounding boxes with the following keys:
+     - box: a list of 4 numbers [xmin, ymin, xmax, ymax]
+     - score: a number between 0 and 1
+     - label: a string
+     The bounding boxes are in the format of [xmin, ymin, xmax, ymax].
+     You need to provide the model name to use for object detection.
+     The tool returns a list of bounding boxes for all the objects in the image.
+     """
+
+     inputs = {
+         "image": {
+             "type": "image",
+             "description": "The image to detect objects in",
+         },
+         "model_name": {
+             "type": "string",
+             "description": "The name of the model to use for object detection",
+         },
+     }
+     output_type = "object"
+
+     def __init__(self):
+         super().__init__()
+         self.tool_class = modal.Cls.from_name(
+             app.name, RemoteObjectDetectionModalApp.__name__
+         )
+
+     def forward(
+         self,
+         image,
+         model_name: str,
+     ):
+         self.tool = self.tool_class(model_name=model_name)
+         bboxes = self.tool.forward.remote(image)
+         for bbox in bboxes:
+             print(
+                 f"Found {bbox['label']} with score: {bbox['score']} at box: {bbox['box']}"
+             )
+         return bboxes
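
Once the Modal app is deployed, the wrapper can presumably be exercised on its own as well. A minimal sketch, where the image path and "facebook/detr-resnet-50" are purely illustrative placeholders:

    from PIL import Image
    from remote_tools.object_detection_tool import RemoteObjectDetectionTool

    detector = RemoteObjectDetectionTool()  # looks up the deployed Modal class
    image = Image.open("street.jpg")  # hypothetical test image
    # smolagents tools are callable; this dispatches to forward()
    boxes = detector(image=image, model_name="facebook/detr-resnet-50")
    for box in boxes:
        print(box["label"], round(box["score"], 2), box["box"])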
remote_tools/rag_tool.py ADDED
@@ -0,0 +1,65 @@
+ from langchain_community.vectorstores import FAISS
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from smolagents import Tool
+ import modal
+
+ from .app import app
+ from .image import image
+ from .volume import volume
+
+
+ @app.cls(gpu="T4", image=image, volumes={"/volume": volume})
+ class RemoteObjectDetectionModelRetrieverModalApp:
+     @modal.enter()
+     def setup(self):
+         self.vector_store = FAISS.load_local(
+             folder_path="/volume/vector_store",
+             embeddings=HuggingFaceEmbeddings(
+                 model_name="all-MiniLM-L6-v2",
+                 model_kwargs={"device": "cuda"},
+                 encode_kwargs={"normalize_embeddings": True},
+                 show_progress=True,
+             ),
+             index_name="object_detection_models_faiss_index",
+             allow_dangerous_deserialization=True,
+         )
+
+     @modal.method()
+     def forward(self, query: str) -> str:
+         docs = self.vector_store.similarity_search(query, k=7)
+         model_ids = [doc.metadata["model_id"] for doc in docs]
+         model_labels = [doc.metadata["model_labels"] for doc in docs]
+         models_dict = {
+             model_id: model_labels
+             for model_id, model_labels in zip(model_ids, model_labels)
+         }
+         return models_dict
+
+
+ class RemoteObjectDetectionModelRetrieverTool(Tool):
+     name = "object_detection_model_retriever"
+     description = """
+     For a given class of objects, retrieve the models that can detect that class.
+     The query is a string that describes the class of objects the model needs to detect.
+     The output is a dictionary with the model id as the key and the labels that the model can detect as the value.
+     """
+     inputs = {
+         "query": {
+             "type": "string",
+             "description": "The class of objects the model needs to detect.",
+         }
+     }
+     output_type = "object"
+
+     def __init__(self):
+         super().__init__()
+         self.tool_class = modal.Cls.from_name(
+             app.name, RemoteObjectDetectionModelRetrieverModalApp.__name__
+         )
+
+     def forward(self, query: str) -> str:
+         assert isinstance(query, str), "Your search query must be a string"
+
+         tool = self.tool_class()
+         result = tool.forward.remote(query)
+         return result
remote_tools/upscaler.py ADDED
@@ -0,0 +1,66 @@
+ import modal
+ import torch
+ from smolagents import AgentImage, Tool
+ from diffusers import StableDiffusionUpscalePipeline
+
+ from .app import app
+ from .image import image
+
+
+ @app.cls(gpu="T4", image=image, scaledown_window=60 * 5)
+ class RemoteUpscalerModalApp:
+     @modal.enter()
+     def setup(self):
+         model_id = "stabilityai/stable-diffusion-x4-upscaler"
+         self.pipeline = StableDiffusionUpscalePipeline.from_pretrained(
+             model_id, torch_dtype=torch.float16
+         )
+         self.pipeline = self.pipeline.to("cuda")
+
+     @modal.batched(max_batch_size=4, wait_ms=1000)
+     def forward(self, low_res_imgs, prompts: list[str]):
+         print(len(low_res_imgs))
+         print(low_res_imgs)
+         print(prompts)
+         low_res_imgs = [
+             img.resize(
+                 (min(512, img.width), min(512, img.height))
+             ) for img in low_res_imgs
+         ]
+         upscaled_images = self.pipeline(prompt=prompts, image=low_res_imgs).images
+         return upscaled_images
+
+
+ class RemoteUpscalerTool(Tool):
+     name = "upscaler"
+     description = """
+     Perform upscaling on images.
+     The "low_res_imgs" are PIL images.
+     The "prompts" are strings.
+     The output is a list of PIL images.
+     You can upscale multiple images at once.
+     """
+
+     inputs = {
+         "low_res_imgs": {
+             "type": "array",
+             "description": "The low resolution images to upscale",
+         },
+         "prompts": {
+             "type": "array",
+             "description": "The prompts to upscale the images",
+         },
+     }
+     output_type = "object"
+
+     def __init__(self):
+         super().__init__()
+         tool_class = modal.Cls.from_name(app.name, RemoteUpscalerModalApp.__name__)
+         self.tool = tool_class()
+
+     def forward(self, low_res_imgs: list[AgentImage], prompts: list[str]):
+         # Modal's forward.map() handles batching internally
+         # We can use it synchronously since Modal manages the async execution
+         upscaled_images = self.tool.forward.map(low_res_imgs, prompts)
+         # Convert the generator to a list to get all results
+         return list(upscaled_images)
remote_tools/volume.py ADDED
@@ -0,0 +1,3 @@
+ import modal
+
+ volume = modal.Volume.from_name("hackathon")
requirements.txt ADDED
@@ -0,0 +1,165 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt --no-annotate
+ accelerate==1.7.0
+ aiofiles==24.1.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.9
+ aiosignal==1.3.2
+ annotated-types==0.7.0
+ anyio==4.9.0
+ async-timeout==4.0.3
+ attrs==25.3.0
+ certifi==2025.4.26
+ charset-normalizer==3.4.2
+ click==8.1.8
+ contourpy==1.3.2
+ cycler==0.12.1
+ dataclasses-json==0.6.7
+ datasets==3.6.0
+ diffusers==0.33.1
+ dill==0.3.8
+ distro==1.9.0
+ exceptiongroup==1.3.0
+ faiss-cpu==1.11.0
+ faiss-gpu==1.7.2
+ fastapi==0.115.12
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fonttools==4.58.1
+ frozenlist==1.6.2
+ fsspec==2025.3.0
+ gradio==5.33.0
+ gradio-client==1.10.2
+ greenlet==3.2.3
+ groovy==0.1.2
+ grpclib==0.4.7
+ h11==0.16.0
+ h2==4.2.0
+ hf-transfer==0.1.9
+ hf-xet==1.1.3
+ hpack==4.1.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ httpx-sse==0.4.0
+ huggingface-hub==0.32.4
+ hyperframe==6.1.0
+ idna==3.10
+ importlib-metadata==8.7.0
+ inquirerpy==0.3.4
+ jinja2==3.1.6
+ jiter==0.10.0
+ joblib==1.5.1
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.24.0
+ jsonschema-specifications==2025.4.1
+ kiwisolver==1.4.8
+ langchain==0.3.25
+ langchain-community==0.3.24
+ langchain-core==0.3.64
+ langchain-huggingface==0.2.0
+ langchain-openai==0.3.21
+ langchain-text-splitters==0.3.8
+ langsmith==0.3.45
+ litellm==1.72.1
+ markdown-it-py==3.0.0
+ markupsafe==3.0.2
+ marshmallow==3.26.1
+ matplotlib==3.10.3
+ mdurl==0.1.2
+ modal==1.0.3
+ mpmath==1.3.0
+ multidict==6.4.4
+ multiprocess==0.70.16
+ mypy-extensions==1.1.0
+ networkx==3.4.2
+ numpy==2.2.6
+ nvidia-cublas-cu12==12.6.4.1
+ nvidia-cuda-cupti-cu12==12.6.80
+ nvidia-cuda-nvrtc-cu12==12.6.77
+ nvidia-cuda-runtime-cu12==12.6.77
+ nvidia-cudnn-cu12==9.5.1.17
+ nvidia-cufft-cu12==11.3.0.4
+ nvidia-cufile-cu12==1.11.1.6
+ nvidia-curand-cu12==10.3.7.77
+ nvidia-cusolver-cu12==11.7.1.2
+ nvidia-cusparse-cu12==12.5.4.2
+ nvidia-cusparselt-cu12==0.6.3
+ nvidia-nccl-cu12==2.26.2
+ nvidia-nvjitlink-cu12==12.6.85
+ nvidia-nvtx-cu12==12.6.77
+ openai==1.84.0
+ opencv-python==4.11.0.86
+ orjson==3.10.18
+ packaging==24.2
+ pandas==2.3.0
+ pfzy==0.3.4
+ pillow==11.2.1
+ prompt-toolkit==3.0.51
+ propcache==0.3.1
+ protobuf==6.31.1
+ psutil==7.0.0
+ pyarrow==20.0.0
+ pydantic==2.11.5
+ pydantic-core==2.33.2
+ pydantic-settings==2.9.1
+ pydub==0.25.1
+ pygments==2.19.1
+ pyparsing==3.2.3
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-multipart==0.0.20
+ pytz==2025.2
+ pyyaml==6.0.2
+ rank-bm25==0.2.2
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ requests-toolbelt==1.0.0
+ rich==14.0.0
+ rpds-py==0.25.1
+ ruff==0.11.12
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.7.0
+ scipy==1.15.3
+ semantic-version==2.10.0
+ sentence-transformers==4.1.0
+ setuptools==80.9.0
+ shellingham==1.5.4
+ sigtools==4.0.1
+ six==1.17.0
+ smolagents==1.17.0
+ sniffio==1.3.1
+ sqlalchemy==2.0.41
+ starlette==0.46.2
+ sympy==1.14.0
+ synchronicity==0.9.13
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tiktoken==0.9.0
+ timm==1.0.15
+ tokenizers==0.21.1
+ toml==0.10.2
+ tomlkit==0.13.3
+ torch==2.7.1
+ torchvision==0.22.1
+ tqdm==4.67.1
+ transformers==4.52.4
+ triton==3.3.1
+ typer==0.16.0
+ types-certifi==2021.10.8.3
+ types-toml==0.10.8.20240310
+ typing-extensions==4.14.0
+ typing-inspect==0.9.0
+ typing-inspection==0.4.1
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.34.3
+ watchfiles==1.0.5
+ wcwidth==0.2.13
+ websockets==15.0.1
+ xxhash==3.5.0
+ yarl==1.20.0
+ zipp==3.22.0
+ zstandard==0.23.0
tools/bbox_drawing_tool.py ADDED
@@ -0,0 +1,58 @@
+ from typing import List, Union, Dict
+ from smolagents import Tool, AgentImage
+ import cv2
+ import numpy as np
+ from PIL import Image
+
+
+ class BBoxDrawingTool(Tool):
+     name = "bbox_drawing"
+     description = """
+     Given an image and a list of bounding boxes, draw the bounding boxes on the image.
+     The image is a PIL image.
+     The bounding boxes are a list of dictionaries with the following keys:
+     - box: a list of 4 numbers [xmin, ymin, xmax, ymax]
+     - score: a number between 0 and 1
+     - label: a string.
+     The output is the image with the bounding boxes drawn on it.
+     """
+
+     inputs = {
+         "image": {
+             "type": "image",
+             "description": "The image to draw the bounding boxes on",
+         },
+         "bboxes": {
+             "type": "array",
+             "description": "The list of bounding boxes to draw on the image",
+         },
+     }
+     output_type = "image"
+
+     def __init__(self):
+         super().__init__()
+
+     def forward(
+         self,
+         image: AgentImage,
+         bboxes: List[Dict[str, Union[str, float, List]]],
+     ):
+         np_image = np.array(image)
+         cv2_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR)
+
+         for bbox in bboxes:
+             print(bbox)
+             print(bbox["box"])
+             cv2_image = self.draw_bbox(cv2_image, bbox["box"])
+
+         pil_image = Image.fromarray(cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB))
+         return pil_image
+
+     def draw_bbox(self, image: AgentImage, bbox: List[int]):
+         x1, y1, x2, y2 = tuple(bbox)
+         x1 = int(x1)
+         y1 = int(y1)
+         x2 = int(x2)
+         y2 = int(y2)
+         image = cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2)
+         return image
tools/cropping_tool.py ADDED
@@ -0,0 +1,55 @@
+ from smolagents import Tool, AgentImage
+
+
+ class CroppingTool(Tool):
+     name = "cropping"
+     description = """
+     Given a list of images and a list of bounding boxes, crop the images to the specified regions.
+     The images are PIL images.
+     The bounding boxes are lists of 4 numbers [xmin, ymin, xmax, ymax] for each image.
+     The output is a list of cropped PIL images.
+     You can crop multiple images at once.
+     You need the same number of images and bounding boxes.
+     """
+
+     inputs = {
+         "images": {
+             "type": "array",
+             "description": "The images to crop",
+         },
+         "bboxes": {
+             "type": "array",
+             "description": "The bounding box coordinates [xmin, ymin, xmax, ymax] for each image",
+         },
+     }
+     output_type = "array"
+
+     def __init__(self):
+         super().__init__()
+
+     def setup(self):
+         pass
+
+     def forward(self, images: list[AgentImage], bboxes: list[list]):
+         if len(images) != len(bboxes):
+             raise ValueError(
+                 "The number of images and bounding boxes must be the same."
+             )
+
+         cropped_images = []
+         for image, bbox in zip(images, bboxes):
+             # Convert bbox to integers
+             xmin, ymin, xmax, ymax = map(int, bbox)
+
+             # Ensure coordinates are within image bounds
+             width, height = image.size
+             xmin = max(0, min(xmin, width))
+             ymin = max(0, min(ymin, height))
+             xmax = max(0, min(xmax, width))
+             ymax = max(0, min(ymax, height))
+
+             # Crop the image
+             cropped_image = image.crop((xmin, ymin, xmax, ymax))
+             cropped_images.append(cropped_image)
+
+         return cropped_images
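
Taken together, the tools are presumably meant to be chained the way the master agent's prompt describes: detect, crop, then upscale the crops in one batch. A rough sketch under that assumption (deployed Modal apps; the image path and model name are illustrative):

    from PIL import Image
    from remote_tools.object_detection_tool import RemoteObjectDetectionTool
    from remote_tools.upscaler import RemoteUpscalerTool
    from tools.cropping_tool import CroppingTool

    image = Image.open("street.jpg")  # hypothetical input image
    boxes = RemoteObjectDetectionTool()(image=image, model_name="facebook/detr-resnet-50")
    crops = CroppingTool()(images=[image] * len(boxes), bboxes=[b["box"] for b in boxes])
    # One prompt per crop; the upscaler batches the requests remotely.
    upscaled = RemoteUpscalerTool()(low_res_imgs=crops, prompts=[b["label"] for b in boxes])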
tools/hf_api_tool.py ADDED
@@ -0,0 +1,26 @@
+ from smolagents import Tool
+ from huggingface_hub import HfApi
+
+
+ class HFAPITool(Tool):
+     name = "hf_api"
+     description = "Use the HuggingFace API to search for models"
+     inputs = {
+         "prompt": {
+             "type": "string",
+             "description": "The prompt to search for models",
+         },
+     }
+     output_type = "object"
+
+     def __init__(self):
+         super().__init__()
+         self.api = HfApi()
+
+     def forward(self, prompt: str):
+         # Search models matching the prompt and return them as the tool output
+         models = self.api.list_models(
+             search=prompt, library=["transformers"], pipeline_tag="object-detection", fetch_config=True
+         )
+         return list(models)
+
tools/rag_tool.py ADDED
@@ -0,0 +1,38 @@
+ from langchain_community.vectorstores import FAISS
+ from smolagents import Tool
+ from rag.settings import get_vector_store
+
+
+ class ObjectDetectionModelRetrieverTool(Tool):
+     name = "object_detection_model_retriever"
+     description = """
+     For a given class of objects, retrieve the models that can detect that class.
+     The query is a string that describes the class of objects the model needs to detect.
+     The output is a dictionary with the model id as the key and the labels that the model can detect as the value.
+     """
+     inputs = {
+         "query": {
+             "type": "string",
+             "description": "The class of objects the model needs to detect.",
+         }
+     }
+     output_type = "object"
+
+     def __init__(self):
+         super().__init__()
+
+     def setup(self):
+         self.vector_store = get_vector_store()
+         print("Loaded vector store")
+
+     def forward(self, query: str) -> str:
+         assert isinstance(query, str), "Your search query must be a string"
+
+         docs = self.vector_store.similarity_search(query, k=7)
+         model_ids = [doc.metadata["model_id"] for doc in docs]
+         model_labels = [doc.metadata["model_labels"] for doc in docs]
+         models_dict = {
+             model_id: model_labels
+             for model_id, model_labels in zip(model_ids, model_labels)
+         }
+         return models_dict