Prompts engineering!

- audio_tools.py +39 -11
- mini_agents.py +2 -2
- prompts/audio_prompts.yaml +6 -5
- prompts/prompts.yaml +12 -23
- prompts/vlm_prompts.yaml +6 -5
- requirements.txt +1 -0
- scripts.sh +2 -0
audio_tools.py
CHANGED
@@ -18,26 +18,54 @@ class TranscribeAudioTool(Tool):
     def setup(self):
         self.model = InferenceClient(model="openai/whisper-large-v3", provider="hf-inference", token=os.getenv("HUGGINGFACE_API_KEY"))
 
+    def _convert_audio_segment_to_wav(self, audio_segment: AudioSegment) -> bytes:
+        """Convert AudioSegment to WAV format bytes"""
+        try:
+            # Ensure audio is in the correct format for Whisper
+            # Convert to mono if stereo
+            if audio_segment.channels > 1:
+                audio_segment = audio_segment.set_channels(1)
+
+            # Convert to 16kHz if different sample rate
+            if audio_segment.frame_rate != 16000:
+                audio_segment = audio_segment.set_frame_rate(16000)
+
+            # Convert to 16-bit if different bit depth
+            if audio_segment.sample_width != 2:  # 2 bytes = 16 bits
+                audio_segment = audio_segment.set_sample_width(2)
+
+            # Export to WAV format
+            buffer = BytesIO()
+            audio_segment.export(buffer, format="wav")
+            return buffer.getvalue()
+        except Exception as e:
+            raise RuntimeError(f"Error converting audio segment: {str(e)}")
+
     def forward(self, audio: any) -> str:
         try:
             # Handle AudioSegment object
             if isinstance(audio, AudioSegment):
-                # Export the segment to WAV bytes
-                buffer = BytesIO()
-                audio.export(buffer, format="wav")
-                audio_data = buffer.getvalue()
+                # Direct conversion to WAV bytes with proper format
+                audio_data = self._convert_audio_segment_to_wav(audio)
             # Handle base64 string
             elif isinstance(audio, str):
-                audio_data = base64.b64decode(audio)
+                try:
+                    # Decode base64 and convert to AudioSegment for format standardization
+                    audio_data = base64.b64decode(audio)
+                    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
+                    # Convert to proper format for Whisper
+                    audio_data = self._convert_audio_segment_to_wav(audio_segment)
+                except Exception as e:
+                    raise ValueError(f"Invalid base64 audio data: {str(e)}")
             else:
                 raise ValueError(f"Unsupported audio type: {type(audio)}. Expected base64 string or AudioSegment object.")
 
-            # Create audio segment from the data
-            audio_segment = AudioSegment.from_file(BytesIO(audio_data))
-
             # Transcribe using the model
-            result = self.model.automatic_speech_recognition(audio_data)
-            return result["text"]
+            try:
+                result = self.model.automatic_speech_recognition(audio_data)
+                return result["text"]
+            except Exception as e:
+                raise RuntimeError(f"Error in transcription: {str(e)}")
 
         except Exception as e:
             raise RuntimeError(f"Error in transcription: {str(e)}")
@@ -99,7 +127,7 @@ def audio_segmentation(audio: str, segment_length: int = 30) -> list:
         audio: The audio file in base64 format
         segment_length: Length of each segment in seconds
     Returns:
-        List of audio segments in base64 format
+        List of audio segments in base64 format. Each of these segments can be used as input for the `transcribe_audio` tool.
     """
     # Decode the base64 audio
     audio_data = base64.b64decode(audio)
mini_agents.py
CHANGED
@@ -60,7 +60,7 @@ AUTHORIZED_IMPORTS = [
 audio_model = InferenceClientModel(
     model_id=MODEL_CHOICES["audio"][0],
     token=os.getenv("HUGGINGFACE_API_KEY"),
-    max_tokens=
+    max_tokens=18000
 )
 
 audio_agent = CodeAgent(
@@ -76,7 +76,7 @@ audio_agent = CodeAgent(
 vlm_model = InferenceClientModel(
     model_id=MODEL_CHOICES["vlm"][0],
     token=os.getenv("HUGGINGFACE_API_KEY"),
-    max_tokens=
+    max_tokens=18000
 )
 
 vlm_agent = CodeAgent(
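
For reference, roughly how these models plug into their agents, as a sketch assuming the smolagents API; the `MODEL_CHOICES` values and the empty tool list below are stand-ins, since the repo's actual definitions are not part of this diff:

```python
import os

from smolagents import CodeAgent, InferenceClientModel

# Stand-in for the repo's MODEL_CHOICES mapping (actual model ids not shown here).
MODEL_CHOICES = {"audio": ["Qwen/Qwen2.5-Coder-32B-Instruct"]}

audio_model = InferenceClientModel(
    model_id=MODEL_CHOICES["audio"][0],
    token=os.getenv("HUGGINGFACE_API_KEY"),
    # Raised to 18000 in this commit so long transcripts fit in one completion.
    max_tokens=18000,
)

audio_agent = CodeAgent(
    tools=[],  # the real agent registers the audio tools from audio_tools.py
    model=audio_model,
)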
prompts/audio_prompts.yaml
CHANGED
@@ -2,11 +2,8 @@ system_prompt: |-
     You are an expert assistant who can solve any task using code blobs.
     You are also an expert at audio processing and transcription.
     You will be given a task to solve as best you can.
-
-
-    It is advised to search among your tools for one that you can use to load this audio file from given path.
-
-    Furthermore, to solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+    To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
     At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
     Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
@@ -14,6 +11,10 @@ system_prompt: |-
     These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
     In the end you have to return a final answer using the `final_answer` tool.
 
+    Furthermore, to solve the task, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
+    If a file path is provided in the task description, it is likely the path to an audio file that you must use.
+    Use the tool named `get_audio_from_file_path` to load this audio file from the given path.
+
     Here are a few examples using notional tools:
     ---
     Task: "Generate an image of the oldest person in this document."
prompts/prompts.yaml
CHANGED
@@ -1,15 +1,14 @@
 system_prompt: |-
 
     You are a general AI assistant. I will ask you a question.
-    Report your thoughts, and
+    Report your thoughts, and answer with the following template:
     [YOUR FINAL ANSWER].
     YOUR FINAL ANSWER must not include the phrase "FINAL ANSWER" and should be a number
-    OR as few words as possible
-    OR a comma separated list of numbers and/or strings.
+    OR as few words as possible OR a comma separated list of numbers and/or strings.
 
     If you are asked for a number, don't use comma to write your number
     neither use units such as $ or percent sign unless specified otherwise.
-    If you are asked for a string, don't use articles
+    If you are asked for a string, don't use articles or abbreviations (e.g. for cities),
     and write the digits in plain text unless specified otherwise.
     If you are asked for a comma separated list, apply the above rules
     depending of whether the element to be put in the list is a number or a string.
@@ -18,14 +17,14 @@ system_prompt: |-
 
     At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
     Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
-    During
-
+    During execution of the code sequence, you can use 'print()' to save whatever important information you will then need.
+
+    Then in the 'Observation:' sequence, you can observe these print outputs for important clues on how to proceed with next steps.
+    All information in the 'Observation:' sequence will be available as input for the next step.
     In the end you have to return a final answer using the `final_answer` tool.
 
-
-
-    It is advised to have at least 2 steps in your plan.
-    It is also advised to have the last step as a double check of your answer.
+    It is advised to have at least 2 steps in your plan. One step is not sufficient.
+    It is also advised to double check your answer before calling the `final_answer` tool.
 
     You are also given access to a list of tools: these tools are basically Python functions which you can call with code.
     Here are a few examples using notional tools:
@@ -47,16 +46,6 @@ system_prompt: |-
     final_answer(image)
     ```<end_code>
 
-    ---
-    Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
-
-    Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
-    Code:
-    ```py
-    result = 5 + 3 + 1294.678
-    final_answer(result)
-    ```<end_code>
-
     ---
     Task:
     "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
@@ -175,7 +164,7 @@ system_prompt: |-
     ```
 
 {%- if managed_agents and managed_agents.values() | list %}
-    You should also give tasks to team members whenever possible. They are very useful to solve
+    You should also give tasks to team members whenever possible. They are very useful to solve tasks that require multiple modalities.
     Calling a team member works the same as for calling a tool: you can simply pass the task description as a string to the member without specifying any argument.
     Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing informations as detailed as necessary.
     Here is a list of the team members that you can call:
@@ -197,9 +186,9 @@ system_prompt: |-
     7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
     8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
     9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
-    10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description
+    10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description.
     You must pass this entire file path to your managed agents for them to use as arguments to their tools. Example instruction to managed agent: "please help to transcribe this audio file (Path = path_to_file/valuation/abcxyz.mp3)"
-    11.
+    11. Most of your provided tools for web browsing such as `navigate_browser` are asynchronous. You must run them with asyncio.run() or await syntax to get the result.
     12. Don't give up! You're in charge of solving the task, not providing directions to solve it.
 
     Now Begin!
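
Rule 11 above warns that the browsing tools such as `navigate_browser` are coroutines. A minimal sketch of the calling pattern it prescribes; the tool body below is a stand-in, not the real implementation:

```python
import asyncio

async def navigate_browser(url: str) -> str:
    """Stand-in for the async browsing tool named in rule 11."""
    await asyncio.sleep(0)  # placeholder for real browser I/O
    return f"Visited {url}"

# From synchronous agent-generated code, drive the coroutine to completion:
result = asyncio.run(navigate_browser("https://example.com"))
print(result)
```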
prompts/vlm_prompts.yaml
CHANGED
@@ -2,11 +2,8 @@ system_prompt: |-
     You are an expert assistant who can solve any task using code blobs.
     You are also an expert at video/image processing, object detection, and text extraction from video/images.
     You will be given a task to solve as best you can.
-
-
-    It is advised to search among your tools for one that you can use to load this video/image file from given path.
-
-    Furthermore, to solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+    To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
     At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
     Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
@@ -14,6 +11,10 @@ system_prompt: |-
     These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
     In the end you have to return a final answer using the `final_answer` tool.
 
+    Furthermore, to solve the task, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
+    If a file path is provided in the task description, it is likely the path to a video/image file that you must use.
+    Use the tool named `get_video_from_file_path` or `get_image_from_file_path` to load this video/image file from the given path.
+
     Here are a few examples using notional tools:
     ---
     Task: "Generate an image of the oldest person in this document."
requirements.txt
CHANGED
@@ -18,6 +18,7 @@ coloredlogs==15.0.1
 contourpy==1.3.2
 cycler==0.12.1
 dataclasses-json==0.6.7
+datasets==3.6.0
 defusedxml==0.7.1
 deprecation==2.1.0
 dill==0.3.8
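
The only change here is the new `datasets` pin. A one-line smoke test of the library, with an arbitrary public dataset id chosen for illustration:

```python
from datasets import load_dataset

# Any public dataset id works; "squad" is just an example.
ds = load_dataset("squad", split="validation[:5]")
print(ds[0]["question"])
```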
scripts.sh
CHANGED
@@ -88,3 +88,5 @@ apt-get -y install fonts-wqy-zenhei
 apt-get -y install fonts-tlwg-loma-otf
 apt-get -y install fonts-freefont-ttf
 
+apt-get -y install tesseract-ocr
+apt-get -y install libtesseract-dev