Final_Assignment_Template

Sleeping

huytofu92 committed on May 16

Commit

29cd08b

1 Parent(s): b120423

Fix tools desc

Files changed (2) hide show

mini_agents.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from smolagents import CodeAgent, InferenceClientModel
 from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
 from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
-from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan, extract_frames_from_video
 from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 from community_tools import community_tools
 import os
@@ -40,7 +40,7 @@ vlm_model = InferenceClientModel(
 vlm_agent = CodeAgent(
     model=vlm_model,
-    tools=[download_image, image_processing, object_detection_tool, ocr_scan, extract_frames_from_video],
     max_steps=4,
     name="vlm_agent",
     description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."

 from smolagents import CodeAgent, InferenceClientModel
 from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
 from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
+from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video
 from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 from community_tools import community_tools
 import os
 vlm_agent = CodeAgent(
     model=vlm_model,
+    tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video],
     max_steps=4,
     name="vlm_agent",
     description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."

vlm_tools.py CHANGED Viewed

@@ -145,7 +145,7 @@ class ObjectDetectionTool(Tool):
         self.names_path = names_path
         self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
-    def forward(self, frames: List[str])->List[List[str]]:
         # Load class labels
         with open(self.names_path, 'r') as f:
             classes = [line.strip() for line in f.readlines()]
@@ -163,22 +163,23 @@ class ObjectDetectionTool(Tool):
         return detected_objects
-@tool
-def ocr_scan(frames: List[str])->List[List[str]]:
-    """
-    Scan an image for text
-    Args:
-        frames: The list of frames (images) to scan for text
-    Returns:
-        The list of text in the images
-    """
-    scanned_text = []
-    for frame in frames:
-        image_data = base64.b64decode(frame)
-        img = Image.open(BytesIO(image_data))
-        scanned_text.append(pytesseract.image_to_string(img))
-    return scanned_text
 object_detection_tool = ObjectDetectionTool()

         self.names_path = names_path
         self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
+    def forward(self, frames: any)->any:
         # Load class labels
         with open(self.names_path, 'r') as f:
             classes = [line.strip() for line in f.readlines()]
         return detected_objects
+class OCRTool(Tool):
+    description = "Scan an image for text. It takes a list of frames (images) as input and returns a list of text in the images."
+    name = "ocr_scan"
+    inputs = {
+        "frames": {"type": "List[str]", "description": "The list of frames (images) to scan for text"}
+    }
+    output_type = "List[List[str]]"
+    def forward(self, frames: any)->any:
+        scanned_text = []
+        for frame in frames:
+            image_data = base64.b64decode(frame)
+            img = Image.open(BytesIO(image_data))
+            scanned_text.append(pytesseract.image_to_string(img))
+        return scanned_text
+ocr_scan_tool = OCRTool()
 object_detection_tool = ObjectDetectionTool()