Final_Assignment_Template

Sleeping

App Files Files Community

huytofu92 committed on May 16

Commit

0db692a

1 Parent(s): dde4764

second version

Browse files

Files changed (6) hide show

.gitignore +7 -0
app.py +50 -45
audio_tools.py +95 -0
mini_agents.py +75 -13
tools.py +173 -3
vlm_tools.py +79 -7

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+.venv/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ class BasicAgent:
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
-def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
@@ -62,9 +62,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
-         print(f"Error decoding JSON response from questions endpoint: {e}")
-         print(f"Response text: {response.text[:500]}")
-         return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
@@ -84,8 +84,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
@@ -98,46 +98,51 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     # 5. Submit
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
-    try:
-        response = requests.post(submit_url, json=submission_data, timeout=60)
-        response.raise_for_status()
-        result_data = response.json()
-        final_status = (
-            f"Submission Successful!\n"
-            f"User: {result_data.get('username')}\n"
-            f"Overall Score: {result_data.get('score', 'N/A')}% "
-            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-            f"Message: {result_data.get('message', 'No message received.')}"
-        )
-        print("Submission successful.")
-        results_df = pd.DataFrame(results_log)
-        return final_status, results_df
-    except requests.exceptions.HTTPError as e:
-        error_detail = f"Server responded with status {e.response.status_code}."
         try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except requests.exceptions.JSONDecodeError:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.Timeout:
-        status_message = "Submission Failed: The request timed out."
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.RequestException as e:
-        status_message = f"Submission Failed: Network error - {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except Exception as e:
-        status_message = f"An unexpected error occurred during submission: {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
 # --- Build Gradio Interface using Blocks ---

         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
+def run_and_submit_all( profile: gr.OAuthProfile | None, mock_submission: bool = True):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
     # 5. Submit
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+    if mock_submission:
+        answer_df = pd.DataFrame(results_log, columns=["Task ID", "Question", "Submitted Answer"])
+        answer_df.to_csv("answers.csv", index=False)
+        return "Answers saved to answers.csv", answer_df
+    else:
         try:
+            response = requests.post(submit_url, json=submission_data, timeout=60)
+            response.raise_for_status()
+            result_data = response.json()
+            final_status = (
+                f"Submission Successful!\n"
+                f"User: {result_data.get('username')}\n"
+                f"Overall Score: {result_data.get('score', 'N/A')}% "
+                f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+                f"Message: {result_data.get('message', 'No message received.')}"
+            )
+            print("Submission successful.")
+            results_df = pd.DataFrame(results_log)
+            return final_status, results_df
+        except requests.exceptions.HTTPError as e:
+            error_detail = f"Server responded with status {e.response.status_code}."
+            try:
+                error_json = e.response.json()
+                error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+            except requests.exceptions.JSONDecodeError:
+                error_detail += f" Response: {e.response.text[:500]}"
+            status_message = f"Submission Failed: {error_detail}"
+            print(status_message)
+            results_df = pd.DataFrame(results_log)
+            return status_message, results_df
+        except requests.exceptions.Timeout:
+            status_message = "Submission Failed: The request timed out."
+            print(status_message)
+            results_df = pd.DataFrame(results_log)
+            return status_message, results_df
+        except requests.exceptions.RequestException as e:
+            status_message = f"Submission Failed: Network error - {e}"
+            print(status_message)
+            results_df = pd.DataFrame(results_log)
+            return status_message, results_df
+        except Exception as e:
+            status_message = f"An unexpected error occurred during submission: {e}"
+            print(status_message)
+            results_df = pd.DataFrame(results_log)
+            return status_message, results_df
 # --- Build Gradio Interface using Blocks ---

audio_tools.py ADDED Viewed

	@@ -0,0 +1,95 @@

+from langchain_core.tools import tool
+from pydub import AudioSegment
+from pyAudioAnalysis import audioSegmentation as aS
+import base64
+from io import BytesIO
+@tool
+def audio_to_base64(file_path: str) -> str:
+    """
+    Convert an audio file to base64 format
+    Args:
+        file_path: Path to the audio file
+    Returns:
+        The audio file in base64 format
+    """
+    # Load the audio file
+    audio = AudioSegment.from_file(file_path)
+    # Export the audio to a BytesIO object
+    buffer = BytesIO()
+    audio.export(buffer, format="wav")  # You can change the format if needed
+    # Encode the audio data to base64
+    audio_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+    return audio_base64
+@tool
+def noise_reduction(audio: str) -> str:
+    """
+    Reduce noise from an audio file
+    Args:
+        audio: The audio file in base64 format
+    Returns:
+        The denoised audio file in base64 format
+    """
+    # Decode the base64 audio
+    audio_data = base64.b64decode(audio)
+    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
+    # Apply noise reduction (simple example using low-pass filter)
+    denoised_audio = audio_segment.low_pass_filter(3000)
+    # Encode back to base64
+    buffer = BytesIO()
+    denoised_audio.export(buffer, format="wav")
+    return base64.b64encode(buffer.getvalue()).decode('utf-8')
+@tool
+def audio_segmentation(audio: str, segment_length: int = 30) -> list:
+    """
+    Segment an audio file into smaller chunks
+    Args:
+        audio: The audio file in base64 format
+        segment_length: Length of each segment in seconds
+    Returns:
+        List of audio segments in base64 format
+    """
+    # Decode the base64 audio
+    audio_data = base64.b64decode(audio)
+    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
+    # Segment the audio
+    segments = []
+    for i in range(0, len(audio_segment), segment_length * 1000):
+        segment = audio_segment[i:i + segment_length * 1000]
+        buffer = BytesIO()
+        segment.export(buffer, format="wav")
+        segments.append(base64.b64encode(buffer.getvalue()).decode('utf-8'))
+    return segments
+@tool
+def speaker_diarization(audio: str) -> list:
+    """
+    Diarize an audio file into speakers
+    Args:
+        audio: The audio file in base64 format
+    Returns:
+        List of speaker segments
+    """
+    # Decode the base64 audio
+    audio_data = base64.b64decode(audio)
+    audio_path = "temp_audio.wav"
+    with open(audio_path, "wb") as f:
+        f.write(audio_data)
+    # Perform speaker diarization
+    [flags, classes, centers] = aS.speakerDiarization(audio_path, 2)  # Assuming 2 speakers
+    # Process the output
+    speaker_segments = []
+    for i, flag in enumerate(flags):
+        speaker_segments.append((i, flag))
+    return speaker_segments

mini_agents.py CHANGED Viewed

@@ -1,17 +1,17 @@
 from smolagents import CodeAgent, InferenceClientModel
-from tools import sort_list
 import os
 MODEL_CHOICES = {
     "audio": ["whisper-large-v3"],
-    "vlm": ["Salesforce/blip-image-captioning-base", "smolvlm/vlm-base-patch14-224"],
-    "code": ["gpt-4o-mini"]}
-code_agent = CodeAgent(
-    model=MODEL_CHOICES["code"][0],
-    tools=[sort_list],
-    verbose=True
-)
 audio_model = InferenceClientModel(
     model=MODEL_CHOICES["audio"][0],
@@ -21,8 +21,11 @@ audio_model = InferenceClientModel(
 audio_agent = CodeAgent(
     model=audio_model,
-    tools=[],
-    verbose=True
 )
 vlm_model = InferenceClientModel(
@@ -33,13 +36,72 @@ vlm_model = InferenceClientModel(
 vlm_agent = CodeAgent(
     model=vlm_model,
-    tools=[],
-    verbose=True
 )

 from smolagents import CodeAgent, InferenceClientModel
+from tools import sort_list, operate_two_numbers, convert_number
+from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
+from vlm_tools import download_image, image_processing, object_detection, ocr_scan
+from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 import os
 MODEL_CHOICES = {  # candidate model ids per task; the code below always takes index 0 of a list
     "audio": ["whisper-large-v3"],
+    "vlm": ["Qwen/Qwen2.5-VL-7B-Instruct"],
+    "code": ["Qwen/Qwen2.5-Coder-32B-Instruct"],  # used by the three manager agents
+    "arithmetic": ["Qwen/Qwen2.5-Coder-7B-Instruct"],
+    "pandas": ["Qwen/Qwen2.5-Coder-7B-Instruct"]
+}
 audio_model = InferenceClientModel(
     model=MODEL_CHOICES["audio"][0],
 audio_agent = CodeAgent(
     model=audio_model,
+    tools=[audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
+    verbose=True,
+    max_steps=4,
+    name="Audio Agent",
+    description="This agent is responsible for rocessing audio, transcribing audio and extracting text from it."
 )
 vlm_model = InferenceClientModel(
 vlm_agent = CodeAgent(  # vision sub-agent, equipped with the four tools from vlm_tools
     model=vlm_model,
+    tools=[download_image, image_processing, object_detection, ocr_scan],
+    verbose=True,
+    max_steps=4,
+    name="VLM Agent",  # NOTE(review): name contains a space - confirm the agent framework accepts it
+    description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
 )
+arithmetic_model = InferenceClientModel(  # model client backing the arithmetic sub-agent
+    model=MODEL_CHOICES["arithmetic"][0],
+    api_key=os.getenv("HUGGINGFACE_API_KEY"),
+    api_url="https://api.openai.com/v1/chat/completions"  # NOTE(review): OpenAI URL used with a Hugging Face key and a Qwen model id - confirm this endpoint is intended
+)
+arithmetic_agent = CodeAgent(  # sub-agent restricted to the two numeric tools listed below
+    model=arithmetic_model,
+    tools=[operate_two_numbers, convert_number],
+    verbose=True,
+    max_steps=4,
+    name="Arithmetic Agent",  # NOTE(review): name contains a space - confirm the agent framework accepts it
+    description="This agent is responsible for performing arithmetic operations on two numbers."
+)
+pandas_model = InferenceClientModel(  # model client backing the pandas sub-agent
+    model=MODEL_CHOICES["pandas"][0],
+    api_key=os.getenv("HUGGINGFACE_API_KEY"),
+    api_url="https://api.openai.com/v1/chat/completions"  # NOTE(review): same endpoint/key mismatch as arithmetic_model - confirm
+)
+pandas_agent = CodeAgent(  # sub-agent restricted to the DataFrame helper tools listed below
+    model=pandas_model,
+    tools=[to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
+    verbose=True,
+    max_steps=4,
+    name="Pandas Agent",  # NOTE(review): name contains a space - confirm the agent framework accepts it
+    description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
+)
+multimodal_manager = CodeAgent(  # manager that delegates to the audio and vision sub-agents
+    model=MODEL_CHOICES["code"][0],  # NOTE(review): a bare model id string, while the sub-agents receive an InferenceClientModel instance - confirm this is accepted
+    managed_agents=[audio_agent, vlm_agent],
+    tools=[sort_list],
+    verbose=True,
+    max_steps=8,
+    planning_steps=4,
+    name="Multimodal Manager",  # NOTE(review): name contains a space - confirm the agent framework accepts it
+    description="This agent is responsible for managing the audio and vlm agents."
+)
+operation_manager = CodeAgent(  # manager that delegates to the arithmetic and pandas sub-agents
+    model=MODEL_CHOICES["code"][0],  # NOTE(review): bare model id string, see multimodal_manager
+    managed_agents=[arithmetic_agent, pandas_agent],
+    tools=[sort_list],
+    verbose=True,
+    max_steps=8,
+    planning_steps=4,
+    name="Operation Manager",
+    description="This agent is responsible for managing the arithmetic and pandas agents."
+)
+master_agent = CodeAgent(  # top-level agent; its managed agents are the two managers defined above
+    model=MODEL_CHOICES["code"][0],  # NOTE(review): bare model id string, see multimodal_manager
+    managed_agents=[multimodal_manager, operation_manager],
+    tools=[sort_list],
+    verbose=True,
+    max_steps=16,  # twice the step budget given to each manager
+    planning_steps=4,
+    name="Master Agent",
+    description="This agent is responsible for managing the multimodal and operation managers."
+)

tools.py CHANGED Viewed

@@ -1,10 +1,11 @@
 from langchain_core.tools import tool
 from datetime import datetime
-from typing import Literal, List
 from smolagents import WebSearchTool, DuckDuckGoSearchTool, VisitWebpageTool, WikipediaSearchTool
 @tool
-def get_current_time(timezone: str = "America/New_York", format: str = "%Y-%m-%d %H:%M:%S"):
     """
     Get the current time
     Args:
@@ -16,7 +17,7 @@ def get_current_time(timezone: str = "America/New_York", format: str = "%Y-%m-%d
     return datetime.now(timezone).strftime(format)
 @tool
-def sort_list(my_list: List[int], order: Literal["asc", "desc", "alphabetize", "alphabetize_reverse"]):
     """
     Sort a list in ascending or descending order if the list contains numbers.
     Sort it in alphabetically or alphabetically in reverse order if the list contains strings or mixed types.
@@ -61,3 +62,172 @@ duckduckgo_search_tool = DuckDuckGoSearchTool()
 visit_webpage_tool = VisitWebpageTool()
 wikipedia_search_tool = WikipediaSearchTool()

 from langchain_core.tools import tool
 from datetime import datetime
+from typing import Literal, List, Union
 from smolagents import WebSearchTool, DuckDuckGoSearchTool, VisitWebpageTool, WikipediaSearchTool
+import pandas as pd
 @tool
+def get_current_time(timezone: str = "America/New_York", format: str = "%Y-%m-%d %H:%M:%S")->str:
     """
     Get the current time
     Args:
     return datetime.now(timezone).strftime(format)
 @tool
+def sort_list(my_list: List[int], order: Literal["asc", "desc", "alphabetize", "alphabetize_reverse"])->List[int]:
     """
     Sort a list in ascending or descending order if the list contains numbers.
     Sort it in alphabetically or alphabetically in reverse order if the list contains strings or mixed types.
 visit_webpage_tool = VisitWebpageTool()
 wikipedia_search_tool = WikipediaSearchTool()
+@tool
+def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
+    """
+    Operate on two numbers
+    Args:
+        num1: The first number to operate on. Must be a float.
+        num2: The second number to operate on. Must be a float.
+        operation: The operation to perform. Must be one of the following:
+            - "add": Add the two numbers
+            - "subtract": Subtract the two numbers
+            - "multiply": Multiply the two numbers
+            - "divide": Divide the two numbers
+            - "power": Raise the first number to the power of the second number
+            - "modulo": Return the remainder of the division of the first number by the second number
+        decimal_places: The number of decimal places to round the result to. Default is 2.
+    Returns:
+        The result of the operation
+    """
+    if operation == "add":
+        return round(num1 + num2, decimal_places)
+    elif operation == "subtract":
+        return round(num1 - num2, decimal_places)
+    elif operation == "multiply":
+        return round(num1 * num2, decimal_places)
+    elif operation == "divide":
+        return round(num1 / num2, decimal_places)
+    elif operation == "power":
+        return round(num1 ** num2, decimal_places)
+    elif operation == "modulo":
+        return round(num1 % num2, decimal_places)
+    else:
+        raise ValueError("operation must be one of the following: add, subtract, multiply, divide, power, modulo")
+@tool
+def convert_number(orig_num: Union[float, int], operation: Literal["to_base", "type_cast"], new_base: Literal["binary", "octal", "hexadecimal", "int", "float"], decimal_places: int = 2)->Union[int, float]:
+    """
+    Convert a number to a new base
+    Args:
+        orig_num: The number to convert. Must be a float or int.
+        operation: The operation to perform. Must be one of the following:
+            - "to_base": Convert the number to a new base.
+            - "type_cast": Convert the number to a new type.
+        new_base: The new base to convert the number to. Must be one of the following:
+            - "binary": Convert the number to binary.
+            - "octal": Convert the number to octal.
+            - "hexadecimal": Convert the number to hexadecimal.
+            - "int": Convert the number to an int.
+            - "float": Convert the number to a float.
+        decimal_places: The number of decimal places to round the result to. Default is 2. Only used if operation is "type_cast" and new_base is "float".
+    Returns:
+        The converted number
+    """
+    if operation == "to_base":
+        if new_base == "binary":
+            return bin(orig_num)
+        elif new_base == "octal":
+            return oct(orig_num)
+        elif new_base == "hexadecimal":
+            return hex(orig_num)
+        else:
+            raise ValueError("new_base must be one of the following: binary, octal, hexadecimal, int, float")
+    elif operation == "type_cast":
+        if new_base == "int":
+            return int(orig_num)
+        elif new_base == "float":
+            return round(float(orig_num), decimal_places)
+        else:
+            raise ValueError("new_base must be one of the following: int, float")
+    else:
+        raise ValueError("operation must be one of the following: to_base, type_cast")
+@tool
+def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
+    """
+    Convert a list of dictionaries to a pandas DataFrame
+    """
+    return pd.DataFrame(data, columns=columns)
+@tool
+def to_json(data: pd.DataFrame)->str:
+    """
+    Convert a pandas DataFrame to a JSON string
+    """
+    return data.to_json(orient="records")
+@tool
+def get_dataframe_data(data: pd.DataFrame, column: Union[str, int], row: Union[str, int])->Union[str, int, float]:
+    """
+    Get a specific cell from a pandas DataFrame
+    Args:
+        data: The pandas DataFrame to get the data from.
+        column: The column to get the data from. Must be a string or int. If int then it is the index of the column.
+        row: The row to get the data from. Must be a string or int. If int then it is the index of the row.
+    Returns:
+        The data from the specified cell
+    """
+    if isinstance(column, int):
+        column = data.iloc[:, column]
+    if isinstance(row, int):
+        row = data.iloc[row, :]
+    return data.loc[row, column]
+@tool
+def get_dataframe_column(data: pd.DataFrame, column: Union[str, int])->pd.Series:
+    """
+    Get a specific column from a pandas DataFrame
+    Args:
+        data: The pandas DataFrame to get the column from.
+        column: The column to get the data from. Must be a string or int. If int then it is the index of the column.
+    Returns:
+        The data from the specified column
+    """
+    return data.iloc[:, column]
+@tool
+def get_dataframe_row(data: pd.DataFrame, row: Union[str, int])->pd.Series:
+    """
+    Get a specific row from a pandas DataFrame
+    Args:
+        data: The pandas DataFrame to get the row from.
+        row: The row to get the data from. Must be a string or int. If int then it is the index of the row.
+    Returns:
+        The data from the specified row
+    """
+    return data.iloc[row, :]
+@tool
+def get_dataframe_groupby(data: pd.DataFrame, column: str, operation: Literal["mean", "sum", "count", "min", "max", "median", "std", "var"])->pd.DataFrame:
+    """
+    Group a pandas DataFrame by a specific column and perform an operation on the grouped data
+    Args:
+        data: The pandas DataFrame to group.
+        column: The column to group the data by.
+        operation: The operation to perform on the grouped data. Must be one of the following:
+            - "mean": Calculate the mean of the grouped data.
+            - "sum": Calculate the sum of the grouped data.
+            - "count": Count the number of rows in the grouped data.
+            - "min": Calculate the minimum of the grouped data.
+            - "max": Calculate the maximum of the grouped data.
+            - "median": Calculate the median of the grouped data.
+            - "std": Calculate the standard deviation of the grouped data.
+            - "var": Calculate the variance of the grouped data.
+    Returns:
+        The grouped data
+    """
+    if operation == "mean":
+        return data.groupby(column).mean()
+    elif operation == "sum":
+        return data.groupby(column).sum()
+    elif operation == "count":
+        return data.groupby(column).count()
+    elif operation == "min":
+        return data.groupby(column).min()
+    elif operation == "max":
+        return data.groupby(column).max()
+    elif operation == "median":
+        return data.groupby(column).median()
+    elif operation == "std":
+        return data.groupby(column).std()
+    elif operation == "var":
+        return data.groupby(column).var()
+    else:
+        raise ValueError("operation must be one of the following: mean, sum, count, min, max, median, std, var")

vlm_tools.py CHANGED Viewed

@@ -1,3 +1,10 @@
 from langchain_core.tools import tool
 @tool
@@ -9,32 +16,95 @@ def download_image(image_url: str):
     Returns:
         The image as a base64 string
     """
-    #download the image into a base64 string
-    image = None
     return image
 @tool
-def image_processing(image: str):
     """
     Process an image
     Args:
         image: The image in base64 format to process
     Returns:
         The processed image
     """
-    processed_image = None
     return processed_image
 @tool
-def object_detection(image: str):
     """
     Detect objects in an image
     Args:
         image: The image in base64 format to detect objects in
     Returns:
         The detected objects
     """
-    detected_objects = None
     return detected_objects
 @tool
@@ -46,7 +116,9 @@ def ocr_scan(image: str):
     Returns:
         The text in the image
     """
-    scanned_text = None
     return scanned_text

+import cv2
+import numpy as np
+import pytesseract
+import requests
+import base64
+from io import BytesIO
+from PIL import Image
 from langchain_core.tools import tool
 @tool
     Returns:
         The image as a base64 string
     """
+    response = requests.get(image_url)
+    image = base64.b64encode(response.content).decode('utf-8')
     return image
 @tool
+def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0):
     """
     Process an image
     Args:
         image: The image in base64 format to process
+        brightness: The brightness of the image on scale of 0-10
+        contrast: The contrast of the image on scale of 0-10
     Returns:
         The processed image
     """
+    image_data = base64.b64decode(image)
+    np_image = np.frombuffer(image_data, np.uint8)
+    img = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
+    # Adjust brightness and contrast
+    img = cv2.convertScaleAbs(img, alpha=contrast, beta=brightness)
+    _, buffer = cv2.imencode('.jpg', img)
+    processed_image = base64.b64encode(buffer).decode('utf-8')
     return processed_image
+weights_path = "vlm_assets/yolo11n.weights"  # default model weights for object_detection
+config_path = "vlm_assets/yolo11n.cfg"  # default network config for object_detection
+names_path = "vlm_assets/obj.names"  # default class-label file, read one label per line
 @tool
+def object_detection(image: str, weights_path: str = weights_path, config_path: str = config_path, names_path: str = names_path):
     """
     Detect objects in an image
     Args:
         image: The image in base64 format to detect objects in
+        weights_path: The path to the weights file
+        config_path: The path to the config file
+        names_path: The path to the names file
     Returns:
         The detected objects
     """
+    image_data = base64.b64decode(image)
+    np_image = np.frombuffer(image_data, np.uint8)
+    img = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
+    # Load YOLO
+    net = cv2.dnn.readNet(weights_path, config_path)
+    layer_names = net.getLayerNames()
+    output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
+    # Load class labels
+    with open(names_path, 'r') as f:
+        classes = [line.strip() for line in f.readlines()]
+    # Detect objects
+    blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
+    net.setInput(blob)
+    outs = net.forward(output_layers)
+    # Process detections
+    class_ids = []
+    confidences = []
+    boxes = []
+    for out in outs:
+        for detection in out:
+            scores = detection[5:]
+            class_id = np.argmax(scores)
+            confidence = scores[class_id]
+            if confidence > 0.5:
+                center_x = int(detection[0] * img.shape[1])
+                center_y = int(detection[1] * img.shape[0])
+                w = int(detection[2] * img.shape[1])
+                h = int(detection[3] * img.shape[0])
+                x = int(center_x - w / 2)
+                y = int(center_y - h / 2)
+                boxes.append([x, y, w, h])
+                confidences.append(float(confidence))
+                class_ids.append(class_id)
+    # Apply non-max suppression
+    indices = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
+    detected_objects = []
+    for i in indices:
+        i = i[0]
+        box = boxes[i]
+        label = str(classes[class_ids[i]])
+        detected_objects.append((label, confidences[i], box))
     return detected_objects
 @tool
     Returns:
         The text in the image
     """
+    image_data = base64.b64decode(image)
+    img = Image.open(BytesIO(image_data))
+    scanned_text = pytesseract.image_to_string(img)
     return scanned_text