huytofu92 committed on
Commit
a2cf089
·
1 Parent(s): 24b4228

Add pandas tool and modify vlm tools

Browse files
Files changed (3) hide show
  1. mini_agents.py +4 -4
  2. tools.py +11 -0
  3. vlm_tools.py +41 -32
mini_agents.py CHANGED
@@ -1,7 +1,7 @@
1
  from smolagents import CodeAgent, InferenceClientModel
2
- from tools import sort_list, operate_two_numbers, convert_number
3
  from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
4
- from vlm_tools import download_image, image_processing, object_detection, ocr_scan
5
  from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
6
  from community_tools import community_tools
7
  import os
@@ -40,7 +40,7 @@ vlm_model = InferenceClientModel(
40
 
41
  vlm_agent = CodeAgent(
42
  model=vlm_model,
43
- tools=[download_image, image_processing, object_detection, ocr_scan],
44
  max_steps=4,
45
  name="vlm_agent",
46
  description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
@@ -68,7 +68,7 @@ pandas_model = InferenceClientModel(
68
 
69
  pandas_agent = CodeAgent(
70
  model=pandas_model,
71
- tools=[to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
72
  max_steps=4,
73
  name="pandas_agent",
74
  description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
 
1
  from smolagents import CodeAgent, InferenceClientModel
2
+ from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
3
  from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
4
+ from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan
5
  from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
6
  from community_tools import community_tools
7
  import os
 
40
 
41
  vlm_agent = CodeAgent(
42
  model=vlm_model,
43
+ tools=[download_image, image_processing, object_detection_tool, ocr_scan],
44
  max_steps=4,
45
  name="vlm_agent",
46
  description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
 
68
 
69
  pandas_agent = CodeAgent(
70
  model=pandas_model,
71
+ tools=[load_dataframe_from_csv, to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
72
  max_steps=4,
73
  name="pandas_agent",
74
  description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
tools.py CHANGED
@@ -134,6 +134,17 @@ def convert_number(orig_num: any, operation: Literal["to_base", "type_cast"], ne
134
  else:
135
  raise ValueError("operation must be one of the following: to_base, type_cast")
136
 
 
 
 
 
 
 
 
 
 
 
 
137
  @tool
138
  def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
139
  """
 
134
  else:
135
  raise ValueError("operation must be one of the following: to_base, type_cast")
136
 
@tool
def load_dataframe_from_csv(file_path: str) -> pd.DataFrame:
    """
    Load a pandas DataFrame from a CSV file
    Args:
        file_path: The path to the CSV file to load.
    Returns:
        The pandas DataFrame
    """
    # NOTE(review): `tool` is presumably smolagents' decorator, which builds the
    # tool description from the docstring above -- hence its wording is kept as is.
    # Parsing uses pandas' defaults (comma separator, first row as header).
    csv_frame = pd.read_csv(file_path)
    return csv_frame
147
+
148
  @tool
149
  def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
150
  """
vlm_tools.py CHANGED
@@ -8,6 +8,7 @@ from io import BytesIO
8
  from PIL import Image
9
  from langchain_core.tools import tool as langchain_tool
10
  from smolagents.tools import Tool, tool
 
11
 
12
  def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
13
 
@@ -109,49 +110,57 @@ def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)
109
  onnx_path = "vlm_assets/yolov3-8.onnx"
110
  names_path = "vlm_assets/obj.names"
111
 
112
- @tool
113
- def object_detection(image: str, onnx_path: str = onnx_path, names_path: str = names_path)->list:
114
- """
115
- Detect objects in an image
116
- Args:
117
- image: The image in base64 format to detect objects in
118
- onnx_path: The path to the onnx file
119
- names_path: The path to the names file
120
- Returns:
121
- The detected objects
122
- """
123
- img = pre_processing(image)
124
-
125
- # Load ONNX model
126
- onnx_model = onnxruntime.InferenceSession(onnx_path)
127
-
128
- # Load class labels
129
- with open(names_path, 'r') as f:
130
- classes = [line.strip() for line in f.readlines()]
131
-
132
- # Preprocess the image
133
- blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
134
- onnx_input = {onnx_model.get_inputs()[0].name: blob}
135
- onnx_output = onnx_model.run(None, onnx_input)
 
 
 
 
 
136
 
137
- detected_objects = post_processing(onnx_output, classes, img.shape)
138
 
139
- return detected_objects
140
 
141
  @tool
142
- def ocr_scan(image: str)->str:
143
  """
144
  Scan an image for text
145
  Args:
146
- image: The image in base64 format to scan for text
147
  Returns:
148
- The text in the image
149
  """
150
- image_data = base64.b64decode(image)
151
- img = Image.open(BytesIO(image_data))
152
- scanned_text = pytesseract.image_to_string(img)
 
 
153
  return scanned_text
154
 
 
155
 
156
 
157
 
 
8
  from PIL import Image
9
  from langchain_core.tools import tool as langchain_tool
10
  from smolagents.tools import Tool, tool
11
+ from typing import List
12
 
13
  def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
14
 
 
110
  onnx_path = "vlm_assets/yolov3-8.onnx"
111
  names_path = "vlm_assets/obj.names"
112
 
class ObjectDetectionTool(Tool):
    """Tool that runs the ONNX object detector over a list of frames.

    The ONNX session and the class-label list are loaded once in ``setup``
    from the module-level ``onnx_path`` / ``names_path`` and reused by every
    ``forward`` call.
    """

    description = "Detect objects in a list of frames (images)"
    name = "object_detection"
    # smolagents validates these attributes when the tool is instantiated:
    # every "type" (and `output_type`) must be one of its authorized type
    # *strings*, and the keys of `inputs` must match forward()'s parameters.
    # The model/label paths are fixed in setup(), so only `frames` is exposed.
    inputs = {
        "frames": {"type": "array", "description": "The list of frames (images) to detect objects in"},
    }
    output_type = "array"

    def setup(self):
        """Load the ONNX model and the class labels (expensive, done once)."""
        self.onnx_path = onnx_path
        self.names_path = names_path
        self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)

        # Class labels never change between calls, so read them here rather
        # than on every forward().
        with open(self.names_path, 'r') as f:
            self.classes = [line.strip() for line in f]

        # Tool.__call__ runs setup() whenever this flag is falsy; without
        # setting it the model would be reloaded on every single call.
        self.is_initialized = True

    def forward(self, frames: List[str])->List[List[str]]:
        """
        Detect objects in each frame.

        Args:
            frames: The frames to analyse, each in whatever string form
                ``pre_processing`` accepts (presumably base64 -- confirm
                against ``pre_processing``).
        Returns:
            One ``post_processing`` result per frame, in input order.
        """
        # Allow direct forward() calls that bypass Tool.__call__.
        if not getattr(self, "is_initialized", False):
            self.setup()

        # The model's input tensor name is the same for every frame.
        input_name = self.onnx_model.get_inputs()[0].name

        detected_objects = []
        for frame in frames:
            img = pre_processing(frame)

            # Convert the image into the blob layout fed to the network
            blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
            onnx_output = self.onnx_model.run(None, {input_name: blob})

            detected_objects.append(post_processing(onnx_output, self.classes, img.shape))

        return detected_objects
146
 
@tool
def ocr_scan(frames: List[str])->List[str]:
    """
    Scan a list of images for text
    Args:
        frames: The list of frames (images in base64 format) to scan for text
    Returns:
        The list of text found in the images, one string per frame
    """
    scanned_text = []
    for frame in frames:
        image_data = base64.b64decode(frame)
        # Context manager releases the decoded image as soon as OCR is done.
        with Image.open(BytesIO(image_data)) as img:
            scanned_text.append(pytesseract.image_to_string(img))
    return scanned_text
162
 
163
+ object_detection_tool = ObjectDetectionTool()
164
 
165
 
166