huytofu92 commited on
Commit
55d6080
·
1 Parent(s): 82d0896
Files changed (3) hide show
  1. audio_tools.py +40 -0
  2. tools.py +4 -4
  3. vlm_tools.py +5 -5
audio_tools.py CHANGED
@@ -1,9 +1,49 @@
1
  import base64
 
2
  from langchain_core.tools import tool as langchain_tool
3
  from smolagents.tools import Tool, tool
4
  from pydub import AudioSegment
5
  from pyAudioAnalysis import audioSegmentation as aS
6
  from io import BytesIO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  @tool
9
  def audio_to_base64(file_path: str) -> str:
 
1
  import base64
2
+ import os
3
  from langchain_core.tools import tool as langchain_tool
4
  from smolagents.tools import Tool, tool
5
  from pydub import AudioSegment
6
  from pyAudioAnalysis import audioSegmentation as aS
7
  from io import BytesIO
8
+ from huggingface_hub import InferenceClient
9
+
10
class TranscribeAudioTool(Tool):
    """smolagents Tool: transcribe a base64-encoded audio file with Whisper
    via the Hugging Face Inference API."""

    name = "transcribe_audio"
    description = "Transcribe an audio file"
    # smolagents expects `inputs` to be a flat {arg_name: {type, description}}
    # mapping whose keys match forward()'s parameters -- the original
    # JSON-Schema-style {"type": "object", "properties": {...}} wrapper does
    # not match forward(audio=...) and breaks argument validation.
    inputs = {
        "audio": {"type": "string", "description": "The audio file in base64 format"}
    }
    output_type = "string"

    def setup(self):
        # Created once by smolagents before first use; reused across calls.
        self.model = InferenceClient(
            model="openai/whisper-large-v3",
            token=os.getenv("HUGGINGFACE_API_KEY"),
        )

    def forward(self, audio: str) -> str:
        """Decode the base64 payload and return the transcribed text."""
        audio_data = base64.b64decode(audio)
        audio_segment = AudioSegment.from_file(BytesIO(audio_data))
        # The Inference API expects raw audio bytes (or a path/file-like), not
        # a pydub AudioSegment object -- export the segment to an in-memory
        # WAV buffer and send the bytes.
        buffer = BytesIO()
        audio_segment.export(buffer, format="wav")
        result = self.model.automatic_speech_recognition(buffer.getvalue())
        return result["text"]


transcribe_audio_tool = TranscribeAudioTool()
31
+
32
@tool
def transcribe_audio(audio: str) -> str:
    """
    Transcribe an audio file

    Args:
        audio: The audio file in base64 format
    Returns:
        The transcribed text
    """
    model = InferenceClient(model="openai/whisper-large-v3", token=os.getenv("HUGGINGFACE_API_KEY"))
    audio_data = base64.b64decode(audio)
    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
    # The Inference API expects raw audio bytes (or a path/file-like), not a
    # pydub AudioSegment object -- re-export to an in-memory WAV buffer and
    # send the bytes.
    buffer = BytesIO()
    audio_segment.export(buffer, format="wav")
    result = model.automatic_speech_recognition(buffer.getvalue())
    return result["text"]
46
+
47
 
48
  @tool
49
  def audio_to_base64(file_path: str) -> str:
tools.py CHANGED
@@ -2,7 +2,8 @@ from langchain_core.tools import tool as langchain_tool
2
  from smolagents.tools import Tool, tool
3
  from datetime import datetime
4
  from typing import Literal, List, Union
5
- from smolagents import WebSearchTool, DuckDuckGoSearchTool, VisitWebpageTool, WikipediaSearchTool
 
6
  import pandas as pd
7
 
8
  @tool
@@ -58,10 +59,9 @@ def sort_list(my_list: List[int], order: Literal["asc", "desc", "alphabetize", "
58
  return sorted(my_list, reverse=how[order] == "desc")
59
 
60
  #smolagents tools
61
- web_search_tool = WebSearchTool()
62
- duckduckgo_search_tool = DuckDuckGoSearchTool()
63
  visit_webpage_tool = VisitWebpageTool()
64
- wikipedia_search_tool = WikipediaSearchTool()
 
65
 
66
  @tool
67
  def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
 
2
  from smolagents.tools import Tool, tool
3
  from datetime import datetime
4
  from typing import Literal, List, Union
5
+ from smolagents import VisitWebpageTool
6
+ from langchain_community.tools.tavily_search import TavilySearchResults
7
  import pandas as pd
8
 
9
  @tool
 
59
  return sorted(my_list, reverse=how[order] == "desc")
60
 
61
  #smolagents tools
 
 
62
# Instantiate the ready-made web tools used by the agent.
visit_webpage_tool = VisitWebpageTool()
# TavilySearchResults' result-count parameter is `max_results`, not `k`
# (`k` is a retriever-style parameter; the tool's pydantic model rejects
# unknown fields, so `k=3` would fail at construction time).
tavily_search_tool = TavilySearchResults(max_results=3)
64
+
65
 
66
  @tool
67
  def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
vlm_tools.py CHANGED
@@ -129,13 +129,13 @@ onnx_path = "vlm_assets/yolov3-8.onnx"
129
  names_path = "vlm_assets/obj.names"
130
 
131
  class ObjectDetectionTool(Tool):
 
132
  description = """
133
- Detect objects in a list of frames (images).
134
- It takes a list of frames (images) as input and returns
135
- a list of detected objects with labels, confidence, and bounding boxes.
136
- The output type will be List[List[str]]
137
  """
138
- name = "object_detection"
139
  inputs = {
140
  "frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
141
  }
 
129
  names_path = "vlm_assets/obj.names"
130
 
131
  class ObjectDetectionTool(Tool):
132
+ name = "object_detection"
133
  description = """
134
+ Detect objects in a list of frames (images).
135
+ It takes a list of frames (images) as input and returns
136
+ a list of detected objects with labels, confidence, and bounding boxes.
137
+ The output type will be List[List[str]]
138
  """
 
139
  inputs = {
140
  "frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
141
  }