Spaces:
Sleeping
Sleeping
Fix tools
Browse files- audio_tools.py +40 -0
- tools.py +4 -4
- vlm_tools.py +5 -5
audio_tools.py
CHANGED
@@ -1,9 +1,49 @@
|
|
1 |
import base64
|
|
|
2 |
from langchain_core.tools import tool as langchain_tool
|
3 |
from smolagents.tools import Tool, tool
|
4 |
from pydub import AudioSegment
|
5 |
from pyAudioAnalysis import audioSegmentation as aS
|
6 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
@tool
|
9 |
def audio_to_base64(file_path: str) -> str:
|
|
|
1 |
import base64
|
2 |
+
import os
|
3 |
from langchain_core.tools import tool as langchain_tool
|
4 |
from smolagents.tools import Tool, tool
|
5 |
from pydub import AudioSegment
|
6 |
from pyAudioAnalysis import audioSegmentation as aS
|
7 |
from io import BytesIO
|
8 |
+
from huggingface_hub import InferenceClient
|
9 |
+
|
10 |
+
class TranscribeAudioTool(Tool):
    """smolagents Tool that transcribes base64-encoded audio using the
    Hugging Face Inference API (openai/whisper-large-v3).

    Requires the HUGGINGFACE_API_KEY environment variable to be set.
    """

    name = "transcribe_audio"
    description = "Transcribe an audio file"
    # smolagents expects a flat mapping of {param_name: {"type", "description"}}
    # whose keys match forward()'s parameters (see ObjectDetectionTool in
    # vlm_tools.py for the same convention). The previous JSON-Schema-style
    # {"type": "object", "properties": {...}} wrapper declared no "audio"
    # input and would fail smolagents' input validation.
    inputs = {
        "audio": {"type": "string", "description": "The audio file in base64 format"}
    }
    output_type = "string"

    def setup(self):
        # Create the inference client once, on first use.
        self.model = InferenceClient(
            model="openai/whisper-large-v3",
            token=os.getenv("HUGGINGFACE_API_KEY"),
        )

    def forward(self, audio: str) -> str:
        """Decode the base64 payload, normalize it to WAV, and return the transcript.

        Args:
            audio: The audio file content encoded as base64.

        Returns:
            The transcribed text.
        """
        audio_data = base64.b64decode(audio)
        # Decode whatever container the caller sent (mp3, ogg, wav, ...).
        audio_segment = AudioSegment.from_file(BytesIO(audio_data))
        # InferenceClient.automatic_speech_recognition() accepts a file path,
        # raw bytes, or a file-like object -- NOT a pydub AudioSegment.
        # Export to WAV bytes before sending.
        wav_buffer = BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        result = self.model.automatic_speech_recognition(wav_buffer.getvalue())
        return result["text"]
30 |
+
transcribe_audio_tool = TranscribeAudioTool()
|
31 |
+
|
32 |
+
@tool
def transcribe_audio(audio: str) -> str:
    """
    Transcribe an audio file

    Args:
        audio: The audio file in base64 format

    Returns:
        The transcribed text
    """
    # NOTE(review): a fresh client per call is simple but repeats setup work;
    # TranscribeAudioTool caches its client in setup() for repeated use.
    model = InferenceClient(
        model="openai/whisper-large-v3",
        token=os.getenv("HUGGINGFACE_API_KEY"),
    )
    audio_data = base64.b64decode(audio)
    # Decode whatever container the caller sent (mp3, ogg, wav, ...).
    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
    # The ASR endpoint takes a file path, raw bytes, or a file-like object --
    # not a pydub AudioSegment. Export to WAV bytes before sending.
    wav_buffer = BytesIO()
    audio_segment.export(wav_buffer, format="wav")
    result = model.automatic_speech_recognition(wav_buffer.getvalue())
    return result["text"]
|
46 |
+
|
47 |
|
48 |
@tool
|
49 |
def audio_to_base64(file_path: str) -> str:
|
tools.py
CHANGED
@@ -2,7 +2,8 @@ from langchain_core.tools import tool as langchain_tool
|
|
2 |
from smolagents.tools import Tool, tool
|
3 |
from datetime import datetime
|
4 |
from typing import Literal, List, Union
|
5 |
-
from smolagents import
|
|
|
6 |
import pandas as pd
|
7 |
|
8 |
@tool
|
@@ -58,10 +59,9 @@ def sort_list(my_list: List[int], order: Literal["asc", "desc", "alphabetize", "
|
|
58 |
return sorted(my_list, reverse=how[order] == "desc")
|
59 |
|
60 |
#smolagents tools
|
61 |
-
web_search_tool = WebSearchTool()
|
62 |
-
duckduckgo_search_tool = DuckDuckGoSearchTool()
|
63 |
visit_webpage_tool = VisitWebpageTool()
|
64 |
-
|
|
|
65 |
|
66 |
@tool
|
67 |
def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
|
|
|
2 |
from smolagents.tools import Tool, tool
|
3 |
from datetime import datetime
|
4 |
from typing import Literal, List, Union
|
5 |
+
from smolagents import VisitWebpageTool
|
6 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
7 |
import pandas as pd
|
8 |
|
9 |
@tool
|
|
|
59 |
return sorted(my_list, reverse=how[order] == "desc")
|
60 |
|
61 |
#smolagents tools
|
|
|
|
|
62 |
visit_webpage_tool = VisitWebpageTool()
|
63 |
+
tavily_search_tool = TavilySearchResults(k=3)
|
64 |
+
|
65 |
|
66 |
@tool
|
67 |
def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
|
vlm_tools.py
CHANGED
@@ -129,13 +129,13 @@ onnx_path = "vlm_assets/yolov3-8.onnx"
|
|
129 |
names_path = "vlm_assets/obj.names"
|
130 |
|
131 |
class ObjectDetectionTool(Tool):
|
|
|
132 |
description = """
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
"""
|
138 |
-
name = "object_detection"
|
139 |
inputs = {
|
140 |
"frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
141 |
}
|
|
|
129 |
names_path = "vlm_assets/obj.names"
|
130 |
|
131 |
class ObjectDetectionTool(Tool):
|
132 |
+
name = "object_detection"
|
133 |
description = """
|
134 |
+
Detect objects in a list of frames (images).
|
135 |
+
It takes a list of frames (images) as input and returns
|
136 |
+
a list of detected objects with labels, confidence, and bounding boxes.
|
137 |
+
The output type will be List[List[str]]
|
138 |
"""
|
|
|
139 |
inputs = {
|
140 |
"frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
141 |
}
|