Spaces:
Sleeping
Sleeping
Add pandas tool and modify vlm tools
Browse files- mini_agents.py +4 -4
- tools.py +11 -0
- vlm_tools.py +41 -32
mini_agents.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from smolagents import CodeAgent, InferenceClientModel
|
2 |
-
from tools import sort_list, operate_two_numbers, convert_number
|
3 |
from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
|
4 |
-
from vlm_tools import download_image, image_processing,
|
5 |
from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
|
6 |
from community_tools import community_tools
|
7 |
import os
|
@@ -40,7 +40,7 @@ vlm_model = InferenceClientModel(
|
|
40 |
|
41 |
vlm_agent = CodeAgent(
|
42 |
model=vlm_model,
|
43 |
-
tools=[download_image, image_processing,
|
44 |
max_steps=4,
|
45 |
name="vlm_agent",
|
46 |
description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
|
@@ -68,7 +68,7 @@ pandas_model = InferenceClientModel(
|
|
68 |
|
69 |
pandas_agent = CodeAgent(
|
70 |
model=pandas_model,
|
71 |
-
tools=[to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
|
72 |
max_steps=4,
|
73 |
name="pandas_agent",
|
74 |
description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
|
|
|
1 |
from smolagents import CodeAgent, InferenceClientModel
|
2 |
+
from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
|
3 |
from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
|
4 |
+
from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan
|
5 |
from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
|
6 |
from community_tools import community_tools
|
7 |
import os
|
|
|
40 |
|
41 |
vlm_agent = CodeAgent(
|
42 |
model=vlm_model,
|
43 |
+
tools=[download_image, image_processing, object_detection_tool, ocr_scan],
|
44 |
max_steps=4,
|
45 |
name="vlm_agent",
|
46 |
description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
|
|
|
68 |
|
69 |
pandas_agent = CodeAgent(
|
70 |
model=pandas_model,
|
71 |
+
tools=[load_dataframe_from_csv, to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
|
72 |
max_steps=4,
|
73 |
name="pandas_agent",
|
74 |
description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
|
tools.py
CHANGED
@@ -134,6 +134,17 @@ def convert_number(orig_num: any, operation: Literal["to_base", "type_cast"], ne
|
|
134 |
else:
|
135 |
raise ValueError("operation must be one of the following: to_base, type_cast")
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
@tool
|
138 |
def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
|
139 |
"""
|
|
|
134 |
else:
|
135 |
raise ValueError("operation must be one of the following: to_base, type_cast")
|
136 |
|
137 |
+
@tool
def load_dataframe_from_csv(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file from disk into a pandas DataFrame
    Args:
        file_path: Location of the CSV file that should be read.
    Returns:
        The DataFrame produced by pandas from the file contents
    """
    # Parsing is delegated entirely to pandas with its default options.
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame
|
147 |
+
|
148 |
@tool
|
149 |
def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
|
150 |
"""
|
vlm_tools.py
CHANGED
@@ -8,6 +8,7 @@ from io import BytesIO
|
|
8 |
from PIL import Image
|
9 |
from langchain_core.tools import tool as langchain_tool
|
10 |
from smolagents.tools import Tool, tool
|
|
|
11 |
|
12 |
def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
|
13 |
|
@@ -109,49 +110,57 @@ def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)
|
|
109 |
onnx_path = "vlm_assets/yolov3-8.onnx"
|
110 |
names_path = "vlm_assets/obj.names"
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
""
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
-
|
138 |
|
139 |
-
|
140 |
|
141 |
@tool
|
142 |
-
def ocr_scan(
|
143 |
"""
|
144 |
Scan an image for text
|
145 |
Args:
|
146 |
-
|
147 |
Returns:
|
148 |
-
The text in the
|
149 |
"""
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
|
|
153 |
return scanned_text
|
154 |
|
|
|
155 |
|
156 |
|
157 |
|
|
|
8 |
from PIL import Image
|
9 |
from langchain_core.tools import tool as langchain_tool
|
10 |
from smolagents.tools import Tool, tool
|
11 |
+
from typing import List
|
12 |
|
13 |
def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
|
14 |
|
|
|
110 |
onnx_path = "vlm_assets/yolov3-8.onnx"
|
111 |
names_path = "vlm_assets/obj.names"
|
112 |
|
113 |
+
class ObjectDetectionTool(Tool):
    """Tool that runs the ONNX detector over a list of frames.

    The model file and the class-name file are taken from the module-level
    ``onnx_path`` and ``names_path`` values when ``setup`` runs.
    """

    name = "object_detection"
    description = "Detect objects in a list of frames (images)"
    # The schema lists exactly the arguments that ``forward`` accepts and uses
    # smolagents' string type names. The tool validates both when it is
    # constructed, so extra keys or ``typing`` objects here make
    # ``ObjectDetectionTool()`` fail.
    inputs = {
        "frames": {
            "type": "array",
            "description": "The list of frames (images) to detect objects in",
        },
    }
    output_type = "array"

    def setup(self):
        """Record the asset paths and open the ONNX inference session."""
        self.onnx_path = onnx_path
        self.names_path = names_path
        self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)

    def forward(self, frames: List[str]) -> List[List[str]]:
        """Run detection on every frame and collect the per-frame results.

        Args:
            frames: The frames to analyse; each one is handed unchanged to
                ``pre_processing`` (presumably base64-encoded images — TODO
                confirm against ``pre_processing``).

        Returns:
            One entry per frame, in input order, holding whatever
            ``post_processing`` returns for that frame (annotated as a list
            of strings — verify against ``post_processing``).
        """
        # Load class labels, one per line of the names file.
        with open(self.names_path, 'r') as f:
            classes = [line.strip() for line in f]

        # The model's input name does not change between frames; look it up once.
        input_name = self.onnx_model.get_inputs()[0].name

        detected_objects = []
        for frame in frames:
            img = pre_processing(frame)

            # Build the network input blob from the pre-processed image.
            blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
            onnx_output = self.onnx_model.run(None, {input_name: blob})

            detected_objects.append(post_processing(onnx_output, classes, img.shape))

        return detected_objects
|
146 |
|
147 |
@tool
def ocr_scan(frames: List[str]) -> List[str]:
    """
    Scan a list of images for text
    Args:
        frames: The list of base64-encoded frames (images) to scan for text
    Returns:
        The list of text found in the images, one string per frame in input order
    """
    scanned_text = []
    for frame in frames:
        image_data = base64.b64decode(frame)
        # Release the decoded image as soon as OCR has finished with it.
        with Image.open(BytesIO(image_data)) as img:
            scanned_text.append(pytesseract.image_to_string(img))
    return scanned_text
|
162 |
|
163 |
+
object_detection_tool = ObjectDetectionTool()
|
164 |
|
165 |
|
166 |
|