Spaces:
Sleeping
Sleeping
Remove usage of open() functions
Browse files
- audio_tools.py +10 -6
- community_tools.py +17 -2
- mini_agents.py +2 -2
- utils.py +0 -22
- vlm_tools.py +34 -9
audio_tools.py
CHANGED
@@ -115,12 +115,16 @@ def speaker_diarization(audio: str) -> list:
|
|
115 |
"""
|
116 |
# Decode the base64 audio
|
117 |
audio_data = base64.b64decode(audio)
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
124 |
|
125 |
# Process the output
|
126 |
speaker_segments = []
|
|
|
115 |
"""
|
116 |
# Decode the base64 audio
|
117 |
audio_data = base64.b64decode(audio)
|
118 |
+
audio_buffer = BytesIO(audio_data)
|
119 |
+
|
120 |
+
# Create a temporary BytesIO object for processing
|
121 |
+
temp_buffer = BytesIO()
|
122 |
+
audio_segment = AudioSegment.from_file(audio_buffer)
|
123 |
+
audio_segment.export(temp_buffer, format="wav")
|
124 |
+
temp_buffer.seek(0)
|
125 |
+
|
126 |
+
# Perform speaker diarization using the buffer
|
127 |
+
[flags, classes, centers] = aS.speakerDiarization(temp_buffer, 2) # Assuming 2 speakers
|
128 |
|
129 |
# Process the output
|
130 |
speaker_segments = []
|
community_tools.py
CHANGED
@@ -3,6 +3,8 @@ from langchain_community.agent_toolkits.load_tools import load_tools
|
|
3 |
from langchain_community.document_loaders import YoutubeLoader
|
4 |
from smolagents.tools import Tool, tool
|
5 |
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
|
6 |
|
7 |
google_map_tool = Tool.from_langchain(GooglePlacesTool())
|
8 |
|
@@ -18,7 +20,7 @@ community_tools = [google_map_tool, wikipedia_tool, *arxiv_tools]
|
|
18 |
@tool
|
19 |
def get_youtube_transcript_from_url(video_url: str)->str:
|
20 |
"""
|
21 |
-
Get the transcript of a YouTube video
|
22 |
Args:
|
23 |
video_url: The URL of the YouTube video (e.g. https://www.youtube.com/watch?v=dQw4w9WgXcQ)
|
24 |
Returns:
|
@@ -26,7 +28,20 @@ def get_youtube_transcript_from_url(video_url: str)->str:
|
|
26 |
"""
|
27 |
video_id = video_url.split("=")[1]
|
28 |
try:
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
fetched_transcript = ytt_api.fetch(video_id)
|
31 |
|
32 |
# is iterable
|
|
|
3 |
from langchain_community.document_loaders import YoutubeLoader
|
4 |
from smolagents.tools import Tool, tool
|
5 |
from youtube_transcript_api import YouTubeTranscriptApi
|
6 |
+
from youtube_transcript_api.proxies import WebshareProxyConfig
|
7 |
+
import os
|
8 |
|
9 |
google_map_tool = Tool.from_langchain(GooglePlacesTool())
|
10 |
|
|
|
20 |
@tool
|
21 |
def get_youtube_transcript_from_url(video_url: str)->str:
|
22 |
"""
|
23 |
+
Get the transcript of a YouTube video using proxy configuration
|
24 |
Args:
|
25 |
video_url: The URL of the YouTube video (e.g. https://www.youtube.com/watch?v=dQw4w9WgXcQ)
|
26 |
Returns:
|
|
|
28 |
"""
|
29 |
video_id = video_url.split("=")[1]
|
30 |
try:
|
31 |
+
# Get proxy credentials from environment variables
|
32 |
+
proxy_username = os.getenv("WEBSHARE_PROXY_USERNAME")
|
33 |
+
proxy_password = os.getenv("WEBSHARE_PROXY_PASSWORD")
|
34 |
+
|
35 |
+
# Configure proxy if credentials are available
|
36 |
+
if proxy_username and proxy_password:
|
37 |
+
proxy_config = WebshareProxyConfig(
|
38 |
+
proxy_username=proxy_username,
|
39 |
+
proxy_password=proxy_password,
|
40 |
+
)
|
41 |
+
ytt_api = YouTubeTranscriptApi(proxy_config=proxy_config)
|
42 |
+
else:
|
43 |
+
ytt_api = YouTubeTranscriptApi()
|
44 |
+
|
45 |
fetched_transcript = ytt_api.fetch(video_id)
|
46 |
|
47 |
# is iterable
|
mini_agents.py
CHANGED
@@ -41,7 +41,7 @@ audio_agent = CodeAgent(
|
|
41 |
tools=[transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
|
42 |
max_steps=6,
|
43 |
# prompt_templates=PROMPT_TEMPLATE["audio_agent"],
|
44 |
-
additional_authorized_imports=["pydub", "pyAudioAnalysis", "base64", "io", "sklearn", "scipy", "numpy", "pandas", "json", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
|
45 |
name="audio_agent",
|
46 |
description="This agent is responsible for rocessing audio, transcribing audio and extracting text from it."
|
47 |
)
|
@@ -56,7 +56,7 @@ vlm_agent = CodeAgent(
|
|
56 |
tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file],
|
57 |
max_steps=6,
|
58 |
# prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
|
59 |
-
additional_authorized_imports=["cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
|
60 |
name="vlm_agent",
|
61 |
description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
|
62 |
)
|
|
|
41 |
tools=[transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
|
42 |
max_steps=6,
|
43 |
# prompt_templates=PROMPT_TEMPLATE["audio_agent"],
|
44 |
+
additional_authorized_imports=["pytube", "pydub", "pyAudioAnalysis", "base64", "io", "sklearn", "scipy", "numpy", "pandas", "json", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
|
45 |
name="audio_agent",
|
46 |
description="This agent is responsible for rocessing audio, transcribing audio and extracting text from it."
|
47 |
)
|
|
|
56 |
tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file],
|
57 |
max_steps=6,
|
58 |
# prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
|
59 |
+
additional_authorized_imports=["cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'youtube_dl', 'bs4'],
|
60 |
name="vlm_agent",
|
61 |
description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
|
62 |
)
|
utils.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
from typing import Optional
|
4 |
-
from smolagents.tools import tool
|
5 |
|
6 |
def find_file_by_task_id(task_id: str, metadata_path: str = "Final_Assignment_Template/validation/metadata.jsonl") -> Optional[str]:
|
7 |
"""
|
@@ -58,24 +57,3 @@ def get_full_file_path(task_id: str, base_dir: str = "Final_Assignment_Template/
|
|
58 |
|
59 |
full_path = os.path.join(base_dir, filename)
|
60 |
return full_path if os.path.exists(full_path) else None
|
61 |
-
|
62 |
-
@tool
|
63 |
-
def load_file_from_task_id(task_id: str) -> str:
|
64 |
-
"""
|
65 |
-
Load a file related to a given task_id if it exists.
|
66 |
-
Args:
|
67 |
-
task_id: The task_id to load the file for
|
68 |
-
Returns:
|
69 |
-
The file content if found, None if not found
|
70 |
-
"""
|
71 |
-
file_path = get_full_file_path(task_id)
|
72 |
-
if not file_path:
|
73 |
-
return "File not found"
|
74 |
-
with open(file_path, 'r') as file:
|
75 |
-
try:
|
76 |
-
return file.read()
|
77 |
-
except Exception as e:
|
78 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
79 |
-
file_path = os.path.join(current_dir, file_path.replace("Final_Assignment_Template", ""))
|
80 |
-
with open(file_path, 'r') as file:
|
81 |
-
return file.read()
|
|
|
1 |
import json
|
2 |
import os
|
3 |
from typing import Optional
|
|
|
4 |
|
5 |
def find_file_by_task_id(task_id: str, metadata_path: str = "Final_Assignment_Template/validation/metadata.jsonl") -> Optional[str]:
|
6 |
"""
|
|
|
57 |
|
58 |
full_path = os.path.join(base_dir, filename)
|
59 |
return full_path if os.path.exists(full_path) else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vlm_tools.py
CHANGED
@@ -114,14 +114,28 @@ def get_image_from_file(file_path: str)->str:
|
|
114 |
The image as a base64 string
|
115 |
"""
|
116 |
try:
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
except Exception as e:
|
120 |
current_file_path = os.path.abspath(__file__)
|
121 |
current_file_dir = os.path.dirname(current_file_path)
|
122 |
file_path = os.path.join(current_file_dir, file_path.replace("Final_Assignment_Template", ""))
|
123 |
-
with
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
return image
|
126 |
|
127 |
@tool
|
@@ -167,12 +181,23 @@ class ObjectDetectionTool(Tool):
|
|
167 |
self.onnx_path = onnx_path
|
168 |
self.names_path = names_path
|
169 |
self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
def forward(self, frames: any)->any:
|
172 |
-
# Load class labels
|
173 |
-
with open(self.names_path, 'r') as f:
|
174 |
-
classes = [line.strip() for line in f.readlines()]
|
175 |
-
|
176 |
detected_objects = []
|
177 |
for frame in frames:
|
178 |
img = pre_processing(frame)
|
@@ -182,7 +207,7 @@ class ObjectDetectionTool(Tool):
|
|
182 |
onnx_input = {self.onnx_model.get_inputs()[0].name: blob}
|
183 |
onnx_output = self.onnx_model.run(None, onnx_input)
|
184 |
|
185 |
-
detected_objects.append(post_processing(onnx_output, classes, img.shape))
|
186 |
|
187 |
return detected_objects
|
188 |
|
|
|
114 |
The image as a base64 string
|
115 |
"""
|
116 |
try:
|
117 |
+
# Use BytesIO to read the file
|
118 |
+
with BytesIO() as buffer:
|
119 |
+
# Use cv2 to read the image
|
120 |
+
img = cv2.imread(file_path)
|
121 |
+
if img is None:
|
122 |
+
raise FileNotFoundError(f"Could not read image at {file_path}")
|
123 |
+
# Encode to jpg and write to buffer
|
124 |
+
_, buffer_data = cv2.imencode('.jpg', img)
|
125 |
+
buffer.write(buffer_data.tobytes())
|
126 |
+
image = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
127 |
except Exception as e:
|
128 |
current_file_path = os.path.abspath(__file__)
|
129 |
current_file_dir = os.path.dirname(current_file_path)
|
130 |
file_path = os.path.join(current_file_dir, file_path.replace("Final_Assignment_Template", ""))
|
131 |
+
# Try again with the new path
|
132 |
+
with BytesIO() as buffer:
|
133 |
+
img = cv2.imread(file_path)
|
134 |
+
if img is None:
|
135 |
+
raise FileNotFoundError(f"Could not read image at {file_path}")
|
136 |
+
_, buffer_data = cv2.imencode('.jpg', img)
|
137 |
+
buffer.write(buffer_data.tobytes())
|
138 |
+
image = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
139 |
return image
|
140 |
|
141 |
@tool
|
|
|
181 |
self.onnx_path = onnx_path
|
182 |
self.names_path = names_path
|
183 |
self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
|
184 |
+
|
185 |
+
# Load class labels - using a predefined list since we can't use open()
|
186 |
+
# These are the standard COCO dataset classes that YOLOv3 uses
|
187 |
+
self.classes = [
|
188 |
+
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
|
189 |
+
'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
|
190 |
+
'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
|
191 |
+
'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
|
192 |
+
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
|
193 |
+
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
|
194 |
+
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
|
195 |
+
'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
|
196 |
+
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
|
197 |
+
'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
|
198 |
+
]
|
199 |
|
200 |
def forward(self, frames: any)->any:
|
|
|
|
|
|
|
|
|
201 |
detected_objects = []
|
202 |
for frame in frames:
|
203 |
img = pre_processing(frame)
|
|
|
207 |
onnx_input = {self.onnx_model.get_inputs()[0].name: blob}
|
208 |
onnx_output = self.onnx_model.run(None, onnx_input)
|
209 |
|
210 |
+
detected_objects.append(post_processing(onnx_output, self.classes, img.shape))
|
211 |
|
212 |
return detected_objects
|
213 |
|