Spaces:
Sleeping
Sleeping
new requirements and enhance tools
Browse files- audio_tools.py +2 -9
- requirements.txt +7 -1
- vlm_tools.py +102 -32
audio_tools.py
CHANGED
@@ -27,21 +27,14 @@ class TranscribeAudioTool(Tool):
|
|
27 |
transcribe_audio_tool = TranscribeAudioTool()
|
28 |
|
29 |
@tool
|
30 |
-
def audio_to_base64(
|
31 |
"""
|
32 |
Convert an audio file to base64 format
|
33 |
Args:
|
34 |
-
|
35 |
-
state: The state dictionary containing file paths
|
36 |
Returns:
|
37 |
The audio file in base64 format
|
38 |
"""
|
39 |
-
# Check if the input is a key in the state dictionary
|
40 |
-
if file_path_or_key in state:
|
41 |
-
file_path = state[file_path_or_key]
|
42 |
-
else:
|
43 |
-
file_path = file_path_or_key
|
44 |
-
|
45 |
# Load the audio file
|
46 |
try:
|
47 |
audio = AudioSegment.from_file(file_path)
|
|
|
27 |
transcribe_audio_tool = TranscribeAudioTool()
|
28 |
|
29 |
@tool
|
30 |
+
def audio_to_base64(file_path: str) -> str:
|
31 |
"""
|
32 |
Convert an audio file to base64 format
|
33 |
Args:
|
34 |
+
file_path: Path to the audio file
|
|
|
35 |
Returns:
|
36 |
The audio file in base64 format
|
37 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# Load the audio file
|
39 |
try:
|
40 |
audio = AudioSegment.from_file(file_path)
|
requirements.txt
CHANGED
@@ -7,6 +7,7 @@ anyio==4.9.0
|
|
7 |
arxiv==2.2.0
|
8 |
attrs==25.3.0
|
9 |
beautifulsoup4==4.13.4
|
|
|
10 |
cachetools==5.5.2
|
11 |
certifi==2025.4.26
|
12 |
charset-normalizer==3.4.2
|
@@ -18,6 +19,7 @@ cycler==0.12.1
|
|
18 |
dataclasses-json==0.6.7
|
19 |
defusedxml==0.7.1
|
20 |
deprecation==2.1.0
|
|
|
21 |
duckduckgo_search==8.0.2
|
22 |
eyeD3==0.9.8
|
23 |
fastapi==0.115.12
|
@@ -28,7 +30,7 @@ filetype==1.2.0
|
|
28 |
flatbuffers==25.2.10
|
29 |
fonttools==4.58.0
|
30 |
frozenlist==1.6.0
|
31 |
-
fsspec==2025.3.
|
32 |
google-api-core==2.24.2
|
33 |
google-api-python-client==2.169.0
|
34 |
google-auth==2.40.1
|
@@ -72,6 +74,7 @@ matplotlib==3.10.3
|
|
72 |
mdurl==0.1.2
|
73 |
mpmath==1.3.0
|
74 |
multidict==6.4.3
|
|
|
75 |
mypy_extensions==1.1.0
|
76 |
narwhals==1.39.1
|
77 |
numpy==2.2.5
|
@@ -86,6 +89,7 @@ primp==0.15.0
|
|
86 |
propcache==0.3.1
|
87 |
proto-plus==1.26.1
|
88 |
protobuf==6.31.0
|
|
|
89 |
pyasn1==0.6.1
|
90 |
pyasn1_modules==0.4.2
|
91 |
pyAudioAnalysis==0.3.14
|
@@ -137,6 +141,8 @@ uvicorn==0.34.2
|
|
137 |
websockets==15.0.1
|
138 |
wikipedia==1.4.0
|
139 |
Wikipedia-API==0.8.1
|
|
|
140 |
yarl==1.20.0
|
|
|
141 |
youtube-transcript-api==1.0.3
|
142 |
zstandard==0.23.0
|
|
|
7 |
arxiv==2.2.0
|
8 |
attrs==25.3.0
|
9 |
beautifulsoup4==4.13.4
|
10 |
+
bs4==0.0.2
|
11 |
cachetools==5.5.2
|
12 |
certifi==2025.4.26
|
13 |
charset-normalizer==3.4.2
|
|
|
19 |
dataclasses-json==0.6.7
|
20 |
defusedxml==0.7.1
|
21 |
deprecation==2.1.0
|
22 |
+
dill==0.3.8
|
23 |
duckduckgo_search==8.0.2
|
24 |
eyeD3==0.9.8
|
25 |
fastapi==0.115.12
|
|
|
30 |
flatbuffers==25.2.10
|
31 |
fonttools==4.58.0
|
32 |
frozenlist==1.6.0
|
33 |
+
fsspec==2025.3.0
|
34 |
google-api-core==2.24.2
|
35 |
google-api-python-client==2.169.0
|
36 |
google-auth==2.40.1
|
|
|
74 |
mdurl==0.1.2
|
75 |
mpmath==1.3.0
|
76 |
multidict==6.4.3
|
77 |
+
multiprocess==0.70.16
|
78 |
mypy_extensions==1.1.0
|
79 |
narwhals==1.39.1
|
80 |
numpy==2.2.5
|
|
|
89 |
propcache==0.3.1
|
90 |
proto-plus==1.26.1
|
91 |
protobuf==6.31.0
|
92 |
+
pyarrow==20.0.0
|
93 |
pyasn1==0.6.1
|
94 |
pyasn1_modules==0.4.2
|
95 |
pyAudioAnalysis==0.3.14
|
|
|
141 |
websockets==15.0.1
|
142 |
wikipedia==1.4.0
|
143 |
Wikipedia-API==0.8.1
|
144 |
+
xxhash==3.5.0
|
145 |
yarl==1.20.0
|
146 |
+
youtube-dl==2021.12.17
|
147 |
youtube-transcript-api==1.0.3
|
148 |
zstandard==0.23.0
|
vlm_tools.py
CHANGED
@@ -73,39 +73,26 @@ def post_processing(onnx_output, classes, original_shape, conf_threshold=0.5, nm
|
|
73 |
return detected_objects
|
74 |
|
75 |
@tool
|
76 |
-
def
|
77 |
"""
|
78 |
-
Extract frames from a video
|
79 |
Args:
|
80 |
video_path: The path to the video file
|
81 |
Returns:
|
82 |
-
A list of frames as numpy arrays
|
83 |
"""
|
84 |
cap = cv2.VideoCapture(video_path)
|
85 |
-
|
86 |
while cap.isOpened():
|
87 |
-
ret,
|
88 |
if not ret:
|
89 |
break
|
90 |
-
|
91 |
cap.release()
|
92 |
-
return
|
93 |
|
94 |
@tool
|
95 |
-
def
|
96 |
-
"""
|
97 |
-
Download an image from a url
|
98 |
-
Args:
|
99 |
-
image_url: The url of the image to download
|
100 |
-
Returns:
|
101 |
-
The image as a base64 string
|
102 |
-
"""
|
103 |
-
response = requests.get(image_url)
|
104 |
-
image = base64.b64encode(response.content).decode('utf-8')
|
105 |
-
return image
|
106 |
-
|
107 |
-
@tool
|
108 |
-
def get_image_from_file(file_path: str)->str:
|
109 |
"""
|
110 |
Get an image from a file
|
111 |
Args:
|
@@ -138,6 +125,89 @@ def get_image_from_file(file_path: str)->str:
|
|
138 |
image = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
139 |
return image
|
140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
@tool
|
142 |
def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
|
143 |
"""
|
@@ -166,13 +236,13 @@ names_path = "vlm_assets/obj.names"
|
|
166 |
class ObjectDetectionTool(Tool):
|
167 |
name = "object_detection"
|
168 |
description = """
|
169 |
-
Detect objects in a list of
|
170 |
-
It takes a list of
|
171 |
a list of detected objects with labels, confidence, and bounding boxes.
|
172 |
The output type will be List[List[str]]
|
173 |
"""
|
174 |
inputs = {
|
175 |
-
"
|
176 |
}
|
177 |
output_type = "any"
|
178 |
|
@@ -197,10 +267,10 @@ class ObjectDetectionTool(Tool):
|
|
197 |
'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
|
198 |
]
|
199 |
|
200 |
-
def forward(self,
|
201 |
detected_objects = []
|
202 |
-
for
|
203 |
-
img = pre_processing(
|
204 |
|
205 |
# Preprocess the image
|
206 |
blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
|
@@ -214,20 +284,20 @@ class ObjectDetectionTool(Tool):
|
|
214 |
class OCRTool(Tool):
|
215 |
description = """
|
216 |
Scan an image for text.
|
217 |
-
It takes a list of
|
218 |
a list of text in the images.
|
219 |
The output type will be List[List[str]]
|
220 |
"""
|
221 |
name = "ocr_scan"
|
222 |
inputs = {
|
223 |
-
"
|
224 |
}
|
225 |
output_type = "any"
|
226 |
|
227 |
-
def forward(self,
|
228 |
scanned_text = []
|
229 |
-
for
|
230 |
-
image_data = base64.b64decode(
|
231 |
img = Image.open(BytesIO(image_data))
|
232 |
scanned_text.append(pytesseract.image_to_string(img))
|
233 |
return scanned_text
|
|
|
73 |
return detected_objects
|
74 |
|
75 |
@tool
|
76 |
+
def extract_images_from_video(video_path: str) -> list:
    """
    Extract images (frames) from a video
    Args:
        video_path: The path to the video file
    Returns:
        A list of images (frames) as numpy arrays. The list is empty when the
        video could not be opened or contains no readable frame.
    """
    cap = cv2.VideoCapture(video_path)
    images = []
    try:
        # Read until the capture reports that no further frame is available.
        while cap.isOpened():
            ret, image = cap.read()
            if not ret:
                break
            images.append(image)
    finally:
        # Release the capture handle even if reading a frame raised.
        cap.release()
    return images
|
93 |
|
94 |
@tool
|
95 |
+
def get_image_from_file_path(file_path: str)->str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
"""
|
97 |
Get an image from a file
|
98 |
Args:
|
|
|
125 |
image = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
126 |
return image
|
127 |
|
128 |
+
@tool
|
129 |
+
def get_video_from_file_path(file_path: str)->str:
    """
    Read a video with cv2 and return its frames as one base64 string
    Args:
        file_path: The path to the file
    Returns:
        A base64 string of every frame, JPEG-encoded and concatenated in
        order. Note that this is a stream of JPEG images, not a playable
        video container.
    Raises:
        FileNotFoundError: If the video cannot be opened at the given path
            nor at the fallback path next to this module.
    """
    def _encode_frames(path: str) -> str:
        # Open the video, JPEG-encode each frame, and base64 the joined bytes.
        cap = cv2.VideoCapture(path)
        try:
            if not cap.isOpened():
                raise FileNotFoundError(f"Could not read video at {path}")

            jpeg_frames = []
            while cap.isOpened():
                ret, image = cap.read()
                if not ret:
                    break
                # Convert the frame to JPEG and keep the bytes in memory
                _, encoded = cv2.imencode('.jpg', image)
                jpeg_frames.append(encoded.tobytes())
        finally:
            # Always release the capture, including when opening or encoding failed.
            cap.release()

        return base64.b64encode(b"".join(jpeg_frames)).decode('utf-8')

    try:
        return _encode_frames(file_path)
    except Exception:
        # Best-effort fallback kept from the original: any failure on the
        # given path triggers one retry with the path resolved against this
        # module's directory, with "Final_Assignment_Template" stripped out.
        current_file_dir = os.path.dirname(os.path.abspath(__file__))
        fallback_path = os.path.join(
            current_file_dir, file_path.replace("Final_Assignment_Template", "")
        )
        return _encode_frames(fallback_path)
|
210 |
+
|
211 |
@tool
|
212 |
def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
|
213 |
"""
|
|
|
236 |
class ObjectDetectionTool(Tool):
|
237 |
name = "object_detection"
|
238 |
description = """
|
239 |
+
Detect objects in a list of images.
|
240 |
+
It takes a list of images as input and returns
|
241 |
a list of detected objects with labels, confidence, and bounding boxes.
|
242 |
The output type will be List[List[str]]
|
243 |
"""
|
244 |
inputs = {
|
245 |
+
"images": {"type": "any", "description": "The list of images to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
246 |
}
|
247 |
output_type = "any"
|
248 |
|
|
|
267 |
'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
|
268 |
]
|
269 |
|
270 |
+
def forward(self, images: any)->any:
|
271 |
detected_objects = []
|
272 |
+
for image in images:
|
273 |
+
img = pre_processing(image)
|
274 |
|
275 |
# Preprocess the image
|
276 |
blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
|
|
|
284 |
class OCRTool(Tool):
    # Text shown to the agent; it now states the actual return shape
    # (one recognised string per input image).
    description = """
    Scan an image for text.
    It takes a list of images as input and returns
    a list of text in the images.
    The output type will be List[str]
    """
    name = "ocr_scan"
    inputs = {
        "images": {"type": "any", "description": "The list of images to scan for text. Must be a List[str] or a List[np.ndarray]"}
    }
    output_type = "any"

    def forward(self, images: any)->any:
        """
        Run pytesseract over every image and collect the recognised text.

        Args:
            images: A list of images. String (or bytes) items are treated as
                base64-encoded image files; any other item is handed to
                ``Image.fromarray`` (assumes an array PIL can interpret, e.g.
                a uint8 numpy frame - TODO confirm against callers).
        Returns:
            A list with one recognised-text string per input image.
        """
        # A single bare string would otherwise be iterated character by character.
        if isinstance(images, (str, bytes)):
            images = [images]

        scanned_text = []
        for image in images:
            if isinstance(image, (str, bytes)):
                image_data = base64.b64decode(image)
                img = Image.open(BytesIO(image_data))
            else:
                # The declared input also allows arrays, which cannot be base64-decoded.
                img = Image.fromarray(image)
            scanned_text.append(pytesseract.image_to_string(img))
        return scanned_text
|