huytofu92 committed
Commit 5b6dc13 · 1 Parent(s): 33b9b1f

new requirements and enhance tools

Files changed (3)
  1. audio_tools.py +2 -9
  2. requirements.txt +7 -1
  3. vlm_tools.py +102 -32
audio_tools.py CHANGED
@@ -27,21 +27,14 @@ class TranscribeAudioTool(Tool):
 transcribe_audio_tool = TranscribeAudioTool()
 
 @tool
-def audio_to_base64(file_path_or_key: str, state: dict) -> str:
+def audio_to_base64(file_path: str) -> str:
     """
     Convert an audio file to base64 format
     Args:
-        file_path_or_key: Path to the audio file or a key in the state dictionary
-        state: The state dictionary containing file paths
+        file_path: Path to the audio file
     Returns:
         The audio file in base64 format
     """
-    # Check if the input is a key in the state dictionary
-    if file_path_or_key in state:
-        file_path = state[file_path_or_key]
-    else:
-        file_path = file_path_or_key
-
     # Load the audio file
     try:
         audio = AudioSegment.from_file(file_path)
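
The hunk ends at the AudioSegment.from_file call, so the remainder of the simplified function is not shown here. A minimal sketch of how it plausibly completes, assuming pydub's AudioSegment (as imported in audio_tools.py) and an in-memory export; the "wav" container choice is an assumption, not taken from this commit:

import base64
from io import BytesIO

from pydub import AudioSegment


def audio_to_base64(file_path: str) -> str:
    # Load the audio file (non-wav inputs require ffmpeg to be installed)
    audio = AudioSegment.from_file(file_path)
    # Re-encode into an in-memory buffer; "wav" avoids an extra codec dependency
    buffer = BytesIO()
    audio.export(buffer, format="wav")
    # Return the raw bytes as a base64 string, matching the documented return type
    return base64.b64encode(buffer.getvalue()).decode("utf-8")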
requirements.txt CHANGED
@@ -7,6 +7,7 @@ anyio==4.9.0
 arxiv==2.2.0
 attrs==25.3.0
 beautifulsoup4==4.13.4
+bs4==0.0.2
 cachetools==5.5.2
 certifi==2025.4.26
 charset-normalizer==3.4.2
@@ -18,6 +19,7 @@ cycler==0.12.1
 dataclasses-json==0.6.7
 defusedxml==0.7.1
 deprecation==2.1.0
+dill==0.3.8
 duckduckgo_search==8.0.2
 eyeD3==0.9.8
 fastapi==0.115.12
@@ -28,7 +30,7 @@ filetype==1.2.0
 flatbuffers==25.2.10
 fonttools==4.58.0
 frozenlist==1.6.0
-fsspec==2025.3.2
+fsspec==2025.3.0
 google-api-core==2.24.2
 google-api-python-client==2.169.0
 google-auth==2.40.1
@@ -72,6 +74,7 @@ matplotlib==3.10.3
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.4.3
+multiprocess==0.70.16
 mypy_extensions==1.1.0
 narwhals==1.39.1
 numpy==2.2.5
@@ -86,6 +89,7 @@ primp==0.15.0
 propcache==0.3.1
 proto-plus==1.26.1
 protobuf==6.31.0
+pyarrow==20.0.0
 pyasn1==0.6.1
 pyasn1_modules==0.4.2
 pyAudioAnalysis==0.3.14
@@ -137,6 +141,8 @@ uvicorn==0.34.2
 websockets==15.0.1
 wikipedia==1.4.0
 Wikipedia-API==0.8.1
+xxhash==3.5.0
 yarl==1.20.0
+youtube-dl==2021.12.17
 youtube-transcript-api==1.0.3
 zstandard==0.23.0
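
Of the new pins, bs4==0.0.2 is only a shim that installs the already-pinned beautifulsoup4, and youtube-dl==2021.12.17 is that project's final release. A quick sanity check after pip install -r requirements.txt (a hypothetical check script, not part of this commit):

from importlib.metadata import version

# Distribution names match the newly added lines in requirements.txt
for dist in ("bs4", "dill", "multiprocess", "pyarrow", "xxhash", "youtube-dl"):
    print(dist, version(dist))

# fsspec was re-pinned from 2025.3.2 down to 2025.3.0
assert version("fsspec") == "2025.3.0"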
vlm_tools.py CHANGED
@@ -73,39 +73,26 @@ def post_processing(onnx_output, classes, original_shape, conf_threshold=0.5, nm
     return detected_objects
 
 @tool
-def extract_frames_from_video(video_path: str) -> list:
+def extract_images_from_video(video_path: str) -> list:
     """
-    Extract frames from a video
+    Extract images (frames) from a video
     Args:
         video_path: The path to the video file
     Returns:
-        A list of frames as numpy arrays
+        A list of images (frames) as numpy arrays
     """
     cap = cv2.VideoCapture(video_path)
-    frames = []
+    images = []
     while cap.isOpened():
-        ret, frame = cap.read()
+        ret, image = cap.read()
         if not ret:
             break
-        frames.append(frame)
+        images.append(image)
     cap.release()
-    return frames
+    return images
 
 @tool
-def download_image(image_url: str)->str:
-    """
-    Download an image from a url
-    Args:
-        image_url: The url of the image to download
-    Returns:
-        The image as a base64 string
-    """
-    response = requests.get(image_url)
-    image = base64.b64encode(response.content).decode('utf-8')
-    return image
-
-@tool
-def get_image_from_file(file_path: str)->str:
+def get_image_from_file_path(file_path: str)->str:
     """
     Get an image from a file
     Args:
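
This hunk renames the extractor (frames -> images) and drops download_image, so remote images now have to be saved locally and read back via get_image_from_file_path. Call-site usage is otherwise unchanged; a minimal sketch (the path is an example, and cv2 yields frames as BGR numpy arrays):

frames = extract_images_from_video("videos/sample.mp4")  # hypothetical path
print(f"Extracted {len(frames)} frames")
print(frames[0].shape)  # (height, width, 3) in BGR channel order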
@@ -138,6 +125,89 @@ def get_image_from_file(file_path: str)->str:
     image = base64.b64encode(buffer.getvalue()).decode('utf-8')
     return image
 
+@tool
+def get_video_from_file_path(file_path: str)->str:
+    """
+    Get a video from a file using cv2 and BytesIO
+    Args:
+        file_path: The path to the file
+    Returns:
+        The video as a base64 string
+    """
+    try:
+        # Use cv2 to read the video
+        cap = cv2.VideoCapture(file_path)
+        if not cap.isOpened():
+            raise FileNotFoundError(f"Could not read video at {file_path}")
+
+        # Get video properties
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        # Create a BytesIO buffer to store the images (frames)
+        images = []
+        while cap.isOpened():
+            ret, image = cap.read()
+            if not ret:
+                break
+            # Convert frame to jpg and store in memory
+            _, buffer = cv2.imencode('.jpg', image)
+            images.append(buffer.tobytes())
+
+        # Release the video capture
+        cap.release()
+
+        # Combine all images into a single buffer
+        with BytesIO() as buffer:
+            # Write each image to the buffer
+            for image_data in images:
+                buffer.write(image_data)
+
+            # Encode to base64
+            video_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+    except Exception as e:
+        current_file_path = os.path.abspath(__file__)
+        current_file_dir = os.path.dirname(current_file_path)
+        file_path = os.path.join(current_file_dir, file_path.replace("Final_Assignment_Template", ""))
+
+        # Try again with the new path
+        cap = cv2.VideoCapture(file_path)
+        if not cap.isOpened():
+            raise FileNotFoundError(f"Could not read video at {file_path}")
+
+        # Get video properties
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        # Create a BytesIO buffer to store the images (frames)
+        images = []
+        while cap.isOpened():
+            ret, image = cap.read()
+            if not ret:
+                break
+            # Convert image to jpg and store in memory
+            _, buffer = cv2.imencode('.jpg', image)
+            images.append(buffer.tobytes())
+
+        # Release the video capture
+        cap.release()
+
+        # Combine all images into a single buffer
+        with BytesIO() as buffer:
+            # Write each image to the buffer
+            for image_data in images:
+                buffer.write(image_data)
+
+            # Encode to base64
+            video_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+    return video_base64
+
 @tool
 def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
     """
@@ -166,13 +236,13 @@ names_path = "vlm_assets/obj.names"
 class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
-    Detect objects in a list of frames (images).
-    It takes a list of frames (images) as input and returns
+    Detect objects in a list of images.
+    It takes a list of images as input and returns
     a list of detected objects with labels, confidence, and bounding boxes.
     The output type will be List[List[str]]
     """
     inputs = {
-        "frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
+        "images": {"type": "any", "description": "The list of images to detect objects in. Must be a List[str] or a List[np.ndarray]"}
     }
     output_type = "any"
 
@@ -197,10 +267,10 @@ class ObjectDetectionTool(Tool):
         'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
     ]
 
-    def forward(self, frames: any)->any:
+    def forward(self, images: any)->any:
         detected_objects = []
-        for frame in frames:
-            img = pre_processing(frame)
+        for image in images:
+            img = pre_processing(image)
 
             # Preprocess the image
             blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
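
The 416x416 blob with a 1/255 scale factor matches a YOLO-style detector loaded via cv2.dnn. A minimal end-to-end sketch, assuming the tool is instantiated as in this file and that smolagents Tool instances are invoked via __call__:

object_detection_tool = ObjectDetectionTool()

frames = extract_images_from_video("videos/sample.mp4")  # hypothetical path
detections = object_detection_tool(frames)  # one list of detections per frame
for frame_detections in detections:
    print(frame_detections)  # labels with confidence and bounding boxes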
@@ -214,20 +284,20 @@ class ObjectDetectionTool(Tool):
 class OCRTool(Tool):
     description = """
     Scan an image for text.
-    It takes a list of frames (images) as input and returns
+    It takes a list of images as input and returns
     a list of text in the images.
     The output type will be List[List[str]]
     """
     name = "ocr_scan"
     inputs = {
-        "frames": {"type": "any", "description": "The list of frames (images) to scan for text. Must be a List[str] or a List[np.ndarray]"}
+        "images": {"type": "any", "description": "The list of images to scan for text. Must be a List[str] or a List[np.ndarray]"}
     }
     output_type = "any"
 
-    def forward(self, frames: any)->any:
+    def forward(self, images: any)->any:
         scanned_text = []
-        for frame in frames:
-            image_data = base64.b64decode(frame)
+        for image in images:
+            image_data = base64.b64decode(image)
             img = Image.open(BytesIO(image_data))
             scanned_text.append(pytesseract.image_to_string(img))
         return scanned_text
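
The OCR tool follows the same call pattern. Note that forward() base64-decodes every element before handing it to PIL, so despite the List[np.ndarray] option in the inputs description, only base64-encoded images will round-trip here; a sketch under that assumption:

ocr_tool = OCRTool()

# get_image_from_file_path returns a base64 string, which is what forward() expects
image_b64 = get_image_from_file_path("images/receipt.jpg")  # hypothetical path
texts = ocr_tool([image_b64])
print(texts[0])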
 