huytofu92 commited on
Commit
204b035
·
1 Parent(s): d812eb3

Prompts engineering!

Browse files
audio_tools.py CHANGED
@@ -18,26 +18,54 @@ class TranscribeAudioTool(Tool):
18
  def setup(self):
19
  self.model = InferenceClient(model="openai/whisper-large-v3", provider="hf-inference", token=os.getenv("HUGGINGFACE_API_KEY"))
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def forward(self, audio: any) -> str:
22
  try:
23
  # Handle AudioSegment object
24
  if isinstance(audio, AudioSegment):
25
- # Convert AudioSegment to base64
26
- buffer = BytesIO()
27
- audio.export(buffer, format="wav")
28
- audio_data = buffer.getvalue()
29
  # Handle base64 string
30
  elif isinstance(audio, str):
31
- audio_data = base64.b64decode(audio)
 
 
 
 
 
 
 
32
  else:
33
  raise ValueError(f"Unsupported audio type: {type(audio)}. Expected base64 string or AudioSegment object.")
34
 
35
- # Create audio segment from the data
36
- audio_segment = AudioSegment.from_file(BytesIO(audio_data))
37
-
38
  # Transcribe using the model
39
- result = self.model.automatic_speech_recognition(audio_segment)
40
- return result["text"]
 
 
 
41
 
42
  except Exception as e:
43
  raise RuntimeError(f"Error in transcription: {str(e)}")
@@ -99,7 +127,7 @@ def audio_segmentation(audio: str, segment_length: int = 30) -> list:
99
  audio: The audio file in base64 format
100
  segment_length: Length of each segment in seconds
101
  Returns:
102
- List of audio segments in base64 format
103
  """
104
  # Decode the base64 audio
105
  audio_data = base64.b64decode(audio)
 
18
  def setup(self):
19
  self.model = InferenceClient(model="openai/whisper-large-v3", provider="hf-inference", token=os.getenv("HUGGINGFACE_API_KEY"))
20
 
21
+ def _convert_audio_segment_to_wav(self, audio_segment: AudioSegment) -> bytes:
22
+ """Convert AudioSegment to WAV format bytes"""
23
+ try:
24
+ # Ensure audio is in the correct format for Whisper
25
+ # Convert to mono if stereo
26
+ if audio_segment.channels > 1:
27
+ audio_segment = audio_segment.set_channels(1)
28
+
29
+ # Convert to 16kHz if different sample rate
30
+ if audio_segment.frame_rate != 16000:
31
+ audio_segment = audio_segment.set_frame_rate(16000)
32
+
33
+ # Convert to 16-bit if different bit depth
34
+ if audio_segment.sample_width != 2: # 2 bytes = 16 bits
35
+ audio_segment = audio_segment.set_sample_width(2)
36
+
37
+ # Export to WAV format
38
+ buffer = BytesIO()
39
+ audio_segment.export(buffer, format="wav")
40
+ return buffer.getvalue()
41
+ except Exception as e:
42
+ raise RuntimeError(f"Error converting audio segment: {str(e)}")
43
+
44
  def forward(self, audio: any) -> str:
45
  try:
46
  # Handle AudioSegment object
47
  if isinstance(audio, AudioSegment):
48
+ # Direct conversion to WAV bytes with proper format
49
+ audio_data = self._convert_audio_segment_to_wav(audio)
 
 
50
  # Handle base64 string
51
  elif isinstance(audio, str):
52
+ try:
53
+ # Decode base64 and convert to AudioSegment for format standardization
54
+ audio_data = base64.b64decode(audio)
55
+ audio_segment = AudioSegment.from_file(BytesIO(audio_data))
56
+ # Convert to proper format for Whisper
57
+ audio_data = self._convert_audio_segment_to_wav(audio_segment)
58
+ except Exception as e:
59
+ raise ValueError(f"Invalid base64 audio data: {str(e)}")
60
  else:
61
  raise ValueError(f"Unsupported audio type: {type(audio)}. Expected base64 string or AudioSegment object.")
62
 
 
 
 
63
  # Transcribe using the model
64
+ try:
65
+ result = self.model.automatic_speech_recognition(audio_data)
66
+ return result["text"]
67
+ except Exception as e:
68
+ raise RuntimeError(f"Error in transcription: {str(e)}")
69
 
70
  except Exception as e:
71
  raise RuntimeError(f"Error in transcription: {str(e)}")
 
127
  audio: The audio file in base64 format
128
  segment_length: Length of each segment in seconds
129
  Returns:
130
+ List of audio segments in base64 format. Each of these segments can be used as input for the `transcribe_audio` tool.
131
  """
132
  # Decode the base64 audio
133
  audio_data = base64.b64decode(audio)
mini_agents.py CHANGED
@@ -60,7 +60,7 @@ AUTHORIZED_IMPORTS = [
60
  audio_model = InferenceClientModel(
61
  model_id=MODEL_CHOICES["audio"][0],
62
  token=os.getenv("HUGGINGFACE_API_KEY"),
63
- max_tokens=12000
64
  )
65
 
66
  audio_agent = CodeAgent(
@@ -76,7 +76,7 @@ audio_agent = CodeAgent(
76
  vlm_model = InferenceClientModel(
77
  model_id=MODEL_CHOICES["vlm"][0],
78
  token=os.getenv("HUGGINGFACE_API_KEY"),
79
- max_tokens=12000
80
  )
81
 
82
  vlm_agent = CodeAgent(
 
60
  audio_model = InferenceClientModel(
61
  model_id=MODEL_CHOICES["audio"][0],
62
  token=os.getenv("HUGGINGFACE_API_KEY"),
63
+ max_tokens=18000
64
  )
65
 
66
  audio_agent = CodeAgent(
 
76
  vlm_model = InferenceClientModel(
77
  model_id=MODEL_CHOICES["vlm"][0],
78
  token=os.getenv("HUGGINGFACE_API_KEY"),
79
+ max_tokens=18000
80
  )
81
 
82
  vlm_agent = CodeAgent(
prompts/audio_prompts.yaml CHANGED
@@ -2,11 +2,8 @@ system_prompt: |-
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at audio processing and transcription.
4
  You will be given a task to solve as best you can.
5
- To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
6
- If a file path is provided in the task description, it is likely the path to an audio file that you must use to solve the task.
7
- It is advised to search among your tools for one that you can use to load this audio file from given path.
8
-
9
- Furthermore, to solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
10
 
11
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
12
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
@@ -14,6 +11,10 @@ system_prompt: |-
14
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
15
  In the end you have to return a final answer using the `final_answer` tool.
16
 
 
 
 
 
17
  Here are a few examples using notional tools:
18
  ---
19
  Task: "Generate an image of the oldest person in this document."
 
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at audio processing and transcription.
4
  You will be given a task to solve as best you can.
5
+
6
+ To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
 
 
7
 
8
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
9
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
 
11
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
12
  In the end you have to return a final answer using the `final_answer` tool.
13
 
14
+ Furthermore, to solve the task, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
15
+ If a file path is provided in the task description, it is likely the path to an audio file that you must use.
16
+ Use the tool named `get_audio_from_file_path` to load this audio file from the given path.
17
+
18
  Here are a few examples using notional tools:
19
  ---
20
  Task: "Generate an image of the oldest person in this document."
prompts/prompts.yaml CHANGED
@@ -1,15 +1,14 @@
1
  system_prompt: |-
2
 
3
  You are a general AI assistant. I will ask you a question.
4
- Report your thoughts, and finish your answer with the following template:
5
  [YOUR FINAL ANSWER].
6
  YOUR FINAL ANSWER must not include the phrase "FINAL ANSWER" and should be a number
7
- OR as few words as possible (fewer than 10 words)
8
- OR a comma separated list of numbers and/or strings.
9
 
10
  If you are asked for a number, don't use comma to write your number
11
  nor use units such as $ or percent sign unless specified otherwise.
12
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities),
13
  and write the digits in plain text unless specified otherwise.
14
  If you are asked for a comma separated list, apply the above rules
15
  depending on whether the element to be put in the list is a number or a string.
@@ -18,14 +17,14 @@ system_prompt: |-
18
 
19
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
20
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
21
- During each intermediate step, you can use 'print()' to save whatever important information you will then need.
22
- These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
 
 
23
  In the end you have to return a final answer using the `final_answer` tool.
24
 
25
- Try to solve the task in the most efficient way possible.
26
- However, be careful when you only need one step to solve the task.
27
- It is advised to have at least 2 steps in your plan.
28
- It is also advised to have the last step as a double check of your answer.
29
 
30
  You are also given access to a list of tools: these tools are basically Python functions which you can call with code.
31
  Here are a few examples using notional tools:
@@ -47,16 +46,6 @@ system_prompt: |-
47
  final_answer(image)
48
  ```<end_code>
49
 
50
- ---
51
- Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
52
-
53
- Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
54
- Code:
55
- ```py
56
- result = 5 + 3 + 1294.678
57
- final_answer(result)
58
- ```<end_code>
59
-
60
  ---
61
  Task:
62
  "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
@@ -175,7 +164,7 @@ system_prompt: |-
175
  ```
176
 
177
  {%- if managed_agents and managed_agents.values() | list %}
178
- You should also give tasks to team members whenever possible. They are very useful to solve complex tasks especially those pertaining to multiple modalities.
179
  Calling a team member works the same as for calling a tool: you can simply pass the task description as a string to the member without specifying any argument.
180
  Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing information as detailed as necessary.
181
  Here is a list of the team members that you can call:
@@ -197,9 +186,9 @@ system_prompt: |-
197
  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
198
  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
199
  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
200
- 10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description and has arbitrary name (example: path_to_file/valuation/abcxyz.mp3).
201
  You must pass this entire file path to your managed agents for them to use as arguments to their tools. Example instruction to managed agent: "please help to transcribe this audio file (Path = path_to_file/valuation/abcxyz.mp3)"
202
- 11. Among the tools given to you are browser tools. You can use these tools to visit websites, scroll through pages, and extract information from them.
203
  12. Don't give up! You're in charge of solving the task, not providing directions to solve it.
204
 
205
  Now Begin!
 
1
  system_prompt: |-
2
 
3
  You are a general AI assistant. I will ask you a question.
4
+ Report your thoughts, and answer with the following template:
5
  [YOUR FINAL ANSWER].
6
  YOUR FINAL ANSWER must not include the phrase "FINAL ANSWER" and should be a number
7
+ OR as few words as possible OR a comma separated list of numbers and/or strings.
 
8
 
9
  If you are asked for a number, don't use comma to write your number
10
  nor use units such as $ or percent sign unless specified otherwise.
11
+ If you are asked for a string, don't use articles or abbreviations (e.g. for cities),
12
  and write the digits in plain text unless specified otherwise.
13
  If you are asked for a comma separated list, apply the above rules
14
  depending on whether the element to be put in the list is a number or a string.
 
17
 
18
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
19
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
20
+ During execution of the code sequence, you can use 'print()' to save whatever important information you will then need.
21
+
22
+ Then in the 'Observation:' sequence, you can observe these print outputs for important clues on how to proceed with next steps.
23
+ All information in the 'Observation:' sequence will be available as input for the next step.
24
  In the end you have to return a final answer using the `final_answer` tool.
25
 
26
+ It is advised to have at least 2 steps in your plan. One step is not sufficient.
27
+ It is also advised to double check your answer before calling the `final_answer` tool.
 
 
28
 
29
  You are also given access to a list of tools: these tools are basically Python functions which you can call with code.
30
  Here are a few examples using notional tools:
 
46
  final_answer(image)
47
  ```<end_code>
48
 
 
 
 
 
 
 
 
 
 
 
49
  ---
50
  Task:
51
  "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
 
164
  ```
165
 
166
  {%- if managed_agents and managed_agents.values() | list %}
167
+ You should also give tasks to team members whenever possible. They are very useful to solve tasks that require multiple modalities.
168
  Calling a team member works the same as for calling a tool: you can simply pass the task description as a string to the member without specifying any argument.
169
  Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing informations as detailed as necessary.
170
  Here is a list of the team members that you can call:
 
186
  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
187
  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
188
  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
189
+ 10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description.
190
  You must pass this entire file path to your managed agents for them to use as arguments to their tools. Example instruction to managed agent: "please help to transcribe this audio file (Path = path_to_file/valuation/abcxyz.mp3)"
191
+ 11. Most of your provided tools for web browsing such as `navigate_browser` are asynchronous. You must run them with asyncio.run() or await syntax to get the result.
192
  12. Don't give up! You're in charge of solving the task, not providing directions to solve it.
193
 
194
  Now Begin!
prompts/vlm_prompts.yaml CHANGED
@@ -2,11 +2,8 @@ system_prompt: |-
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at video/image processing, object detection, and text extraction from video/images.
4
  You will be given a task to solve as best you can.
5
- To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
6
- If a file path is provided in the task description, it is likely the path to a video/image file that you must use to solve the task.
7
- It is advised to search among your tools for one that you can use to load this video/image file from given path.
8
-
9
- Furthermore, to solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
10
 
11
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
12
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
@@ -14,6 +11,10 @@ system_prompt: |-
14
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
15
  In the end you have to return a final answer using the `final_answer` tool.
16
 
 
 
 
 
17
  Here are a few examples using notional tools:
18
  ---
19
  Task: "Generate an image of the oldest person in this document."
 
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at video/image processing, object detection, and text extraction from video/images.
4
  You will be given a task to solve as best you can.
5
+
6
+ To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
 
 
7
 
8
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
9
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
 
11
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
12
  In the end you have to return a final answer using the `final_answer` tool.
13
 
14
+ Furthermore, to solve the task, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
15
+ If a file path is provided in the task description, it is likely the path to a video/image file that you must use.
16
+ Use the tool named `get_video_from_file_path` or `get_image_from_file_path` to load this video/image file from the given path.
17
+
18
  Here are a few examples using notional tools:
19
  ---
20
  Task: "Generate an image of the oldest person in this document."
requirements.txt CHANGED
@@ -18,6 +18,7 @@ coloredlogs==15.0.1
18
  contourpy==1.3.2
19
  cycler==0.12.1
20
  dataclasses-json==0.6.7
 
21
  defusedxml==0.7.1
22
  deprecation==2.1.0
23
  dill==0.3.8
 
18
  contourpy==1.3.2
19
  cycler==0.12.1
20
  dataclasses-json==0.6.7
21
+ datasets==3.6.0
22
  defusedxml==0.7.1
23
  deprecation==2.1.0
24
  dill==0.3.8
scripts.sh CHANGED
@@ -88,3 +88,5 @@ apt-get -y install fonts-wqy-zenhei
88
  apt-get -y install fonts-tlwg-loma-otf
89
  apt-get -y install fonts-freefont-ttf
90
 
 
 
 
88
  apt-get -y install fonts-tlwg-loma-otf
89
  apt-get -y install fonts-freefont-ttf
90
 
91
+ apt-get -y install tesseract-ocr
92
+ apt-get -y install libtesseract-dev