huytofu92 commited on
Commit
204b035
·
1 Parent(s): d812eb3

Prompts engineering!

Browse files
audio_tools.py CHANGED
@@ -18,26 +18,54 @@ class TranscribeAudioTool(Tool):
18
  def setup(self):
19
  self.model = InferenceClient(model="openai/whisper-large-v3", provider="hf-inference", token=os.getenv("HUGGINGFACE_API_KEY"))
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def forward(self, audio: any) -> str:
22
  try:
23
  # Handle AudioSegment object
24
  if isinstance(audio, AudioSegment):
25
- # Convert AudioSegment to base64
26
- buffer = BytesIO()
27
- audio.export(buffer, format="wav")
28
- audio_data = buffer.getvalue()
29
  # Handle base64 string
30
  elif isinstance(audio, str):
31
- audio_data = base64.b64decode(audio)
 
 
 
 
 
 
 
32
  else:
33
  raise ValueError(f"Unsupported audio type: {type(audio)}. Expected base64 string or AudioSegment object.")
34
 
35
- # Create audio segment from the data
36
- audio_segment = AudioSegment.from_file(BytesIO(audio_data))
37
-
38
  # Transcribe using the model
39
- result = self.model.automatic_speech_recognition(audio_segment)
40
- return result["text"]
 
 
 
41
 
42
  except Exception as e:
43
  raise RuntimeError(f"Error in transcription: {str(e)}")
@@ -99,7 +127,7 @@ def audio_segmentation(audio: str, segment_length: int = 30) -> list:
99
  audio: The audio file in base64 format
100
  segment_length: Length of each segment in seconds
101
  Returns:
102
- List of audio segments in base64 format
103
  """
104
  # Decode the base64 audio
105
  audio_data = base64.b64decode(audio)
 
18
  def setup(self):
19
  self.model = InferenceClient(model="openai/whisper-large-v3", provider="hf-inference", token=os.getenv("HUGGINGFACE_API_KEY"))
20
 
21
+ def _convert_audio_segment_to_wav(self, audio_segment: AudioSegment) -> bytes:
22
+ """Convert AudioSegment to WAV format bytes"""
23
+ try:
24
+ # Ensure audio is in the correct format for Whisper
25
+ # Convert to mono if stereo
26
+ if audio_segment.channels > 1:
27
+ audio_segment = audio_segment.set_channels(1)
28
+
29
+ # Convert to 16kHz if different sample rate
30
+ if audio_segment.frame_rate != 16000:
31
+ audio_segment = audio_segment.set_frame_rate(16000)
32
+
33
+ # Convert to 16-bit if different bit depth
34
+ if audio_segment.sample_width != 2: # 2 bytes = 16 bits
35
+ audio_segment = audio_segment.set_sample_width(2)
36
+
37
+ # Export to WAV format
38
+ buffer = BytesIO()
39
+ audio_segment.export(buffer, format="wav")
40
+ return buffer.getvalue()
41
+ except Exception as e:
42
+ raise RuntimeError(f"Error converting audio segment: {str(e)}")
43
+
44
  def forward(self, audio: any) -> str:
45
  try:
46
  # Handle AudioSegment object
47
  if isinstance(audio, AudioSegment):
48
+ # Direct conversion to WAV bytes with proper format
49
+ audio_data = self._convert_audio_segment_to_wav(audio)
 
 
50
  # Handle base64 string
51
  elif isinstance(audio, str):
52
+ try:
53
+ # Decode base64 and convert to AudioSegment for format standardization
54
+ audio_data = base64.b64decode(audio)
55
+ audio_segment = AudioSegment.from_file(BytesIO(audio_data))
56
+ # Convert to proper format for Whisper
57
+ audio_data = self._convert_audio_segment_to_wav(audio_segment)
58
+ except Exception as e:
59
+ raise ValueError(f"Invalid base64 audio data: {str(e)}")
60
  else:
61
  raise ValueError(f"Unsupported audio type: {type(audio)}. Expected base64 string or AudioSegment object.")
62
 
 
 
 
63
  # Transcribe using the model
64
+ try:
65
+ result = self.model.automatic_speech_recognition(audio_data)
66
+ return result["text"]
67
+ except Exception as e:
68
+ raise RuntimeError(f"Error in transcription: {str(e)}")
69
 
70
  except Exception as e:
71
  raise RuntimeError(f"Error in transcription: {str(e)}")
 
127
  audio: The audio file in base64 format
128
  segment_length: Length of each segment in seconds
129
  Returns:
130
+ List of audio segments in base64 format. Each of these segments can be used as input for the `transcribe_audio` tool.
131
  """
132
  # Decode the base64 audio
133
  audio_data = base64.b64decode(audio)
mini_agents.py CHANGED
@@ -60,7 +60,7 @@ AUTHORIZED_IMPORTS = [
60
  audio_model = InferenceClientModel(
61
  model_id=MODEL_CHOICES["audio"][0],
62
  token=os.getenv("HUGGINGFACE_API_KEY"),
63
- max_tokens=12000
64
  )
65
 
66
  audio_agent = CodeAgent(
@@ -76,7 +76,7 @@ audio_agent = CodeAgent(
76
  vlm_model = InferenceClientModel(
77
  model_id=MODEL_CHOICES["vlm"][0],
78
  token=os.getenv("HUGGINGFACE_API_KEY"),
79
- max_tokens=12000
80
  )
81
 
82
  vlm_agent = CodeAgent(
 
60
  audio_model = InferenceClientModel(
61
  model_id=MODEL_CHOICES["audio"][0],
62
  token=os.getenv("HUGGINGFACE_API_KEY"),
63
+ max_tokens=18000
64
  )
65
 
66
  audio_agent = CodeAgent(
 
76
  vlm_model = InferenceClientModel(
77
  model_id=MODEL_CHOICES["vlm"][0],
78
  token=os.getenv("HUGGINGFACE_API_KEY"),
79
+ max_tokens=18000
80
  )
81
 
82
  vlm_agent = CodeAgent(
prompts/audio_prompts.yaml CHANGED
@@ -2,11 +2,8 @@ system_prompt: |-
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at audio processing and transcription.
4
  You will be given a task to solve as best you can.
5
- To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
6
- If a file path is provided in the task description, it is likely the path to an audio file that you must use to solve the task.
7
- It is advised to search among your tools for one that you can use to load this audio file from given path.
8
-
9
- Furthermore, to solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
10
 
11
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
12
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
@@ -14,6 +11,10 @@ system_prompt: |-
14
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
15
  In the end you have to return a final answer using the `final_answer` tool.
16
 
 
 
 
 
17
  Here are a few examples using notional tools:
18
  ---
19
  Task: "Generate an image of the oldest person in this document."
 
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at audio processing and transcription.
4
  You will be given a task to solve as best you can.
5
+
6
+ To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
 
 
7
 
8
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
9
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
 
11
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
12
  In the end you have to return a final answer using the `final_answer` tool.
13
 
14
+ Furthermore, to solve the task, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
15
+ If a file path is provided in the task description, it is likely the path to an audio file that you must use.
16
+ Use the tool named `get_audio_from_file_path` to load this audio file from the given path.
17
+
18
  Here are a few examples using notional tools:
19
  ---
20
  Task: "Generate an image of the oldest person in this document."
prompts/prompts.yaml CHANGED
@@ -1,15 +1,14 @@
1
  system_prompt: |-
2
 
3
  You are a general AI assistant. I will ask you a question.
4
- Report your thoughts, and finish your answer with the following template:
5
  [YOUR FINAL ANSWER].
6
  YOUR FINAL ANSWER must not include the phrase "FINAL ANSWER" and should be a number
7
- OR as few words as possible (fewer than 10 words)
8
- OR a comma separated list of numbers and/or strings.
9
 
10
  If you are asked for a number, don't use comma to write your number
11
  nor use units such as $ or percent sign unless specified otherwise.
12
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities),
13
  and write the digits in plain text unless specified otherwise.
14
  If you are asked for a comma separated list, apply the above rules
15
  depending on whether the element to be put in the list is a number or a string.
@@ -18,14 +17,14 @@ system_prompt: |-
18
 
19
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
20
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
21
- During each intermediate step, you can use 'print()' to save whatever important information you will then need.
22
- These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
 
 
23
  In the end you have to return a final answer using the `final_answer` tool.
24
 
25
- Try to solve the task in the most efficient way possible.
26
- However, be careful when you only need one step to solve the task.
27
- It is advised to have at least 2 steps in your plan.
28
- It is also advised to have the last step as a double check of your answer.
29
 
30
  You are also given access to a list of tools: these tools are basically Python functions which you can call with code.
31
  Here are a few examples using notional tools:
@@ -47,16 +46,6 @@ system_prompt: |-
47
  final_answer(image)
48
  ```<end_code>
49
 
50
- ---
51
- Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
52
-
53
- Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
54
- Code:
55
- ```py
56
- result = 5 + 3 + 1294.678
57
- final_answer(result)
58
- ```<end_code>
59
-
60
  ---
61
  Task:
62
  "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
@@ -175,7 +164,7 @@ system_prompt: |-
175
  ```
176
 
177
  {%- if managed_agents and managed_agents.values() | list %}
178
- You should also give tasks to team members whenever possible. They are very useful to solve complex tasks especially those pertaining to multiple modalities.
179
  Calling a team member works the same as for calling a tool: you can simply pass the task description as a string to the member without specifying any argument.
180
  Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing information as detailed as necessary.
181
  Here is a list of the team members that you can call:
@@ -197,9 +186,9 @@ system_prompt: |-
197
  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
198
  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
199
  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
200
- 10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description and has arbitrary name (example: path_to_file/valuation/abcxyz.mp3).
201
  You must pass this entire file path to your managed agents for them to use as arguments to their tools. Example instruction to managed agent: "please help to transcribe this audio file (Path = path_to_file/valuation/abcxyz.mp3)"
202
- 11. Among the tools given to you are browser tools. You can use these tools to visit websites, scroll through pages, and extract information from them.
203
  12. Don't give up! You're in charge of solving the task, not providing directions to solve it.
204
 
205
  Now Begin!
 
1
  system_prompt: |-
2
 
3
  You are a general AI assistant. I will ask you a question.
4
+ Report your thoughts, and answer with the following template:
5
  [YOUR FINAL ANSWER].
6
  YOUR FINAL ANSWER must not include the phrase "FINAL ANSWER" and should be a number
7
+ OR as few words as possible OR a comma separated list of numbers and/or strings.
 
8
 
9
  If you are asked for a number, don't use comma to write your number
10
  nor use units such as $ or percent sign unless specified otherwise.
11
+ If you are asked for a string, don't use articles or abbreviations (e.g. for cities),
12
  and write the digits in plain text unless specified otherwise.
13
  If you are asked for a comma separated list, apply the above rules
14
  depending on whether the element to be put in the list is a number or a string.
 
17
 
18
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
19
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
20
+ During execution of the code sequence, you can use 'print()' to save whatever important information you will then need.
21
+
22
+ Then in the 'Observation:' sequence, you can observe these print outputs for important clues on how to proceed with next steps.
23
+ All information in the 'Observation:' sequence will be available as input for the next step.
24
  In the end you have to return a final answer using the `final_answer` tool.
25
 
26
+ It is advised to have at least 2 steps in your plan. One step is not sufficient.
27
+ It is also advised to double check your answer before calling the `final_answer` tool.
 
 
28
 
29
  You are also given access to a list of tools: these tools are basically Python functions which you can call with code.
30
  Here are a few examples using notional tools:
 
46
  final_answer(image)
47
  ```<end_code>
48
 
 
 
 
 
 
 
 
 
 
 
49
  ---
50
  Task:
51
  "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
 
164
  ```
165
 
166
  {%- if managed_agents and managed_agents.values() | list %}
167
+ You should also give tasks to team members whenever possible. They are very useful to solve tasks that require multiple modalities.
168
  Calling a team member works the same as for calling a tool: you can simply pass the task description as a string to the member without specifying any argument.
169
  Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing informations as detailed as necessary.
170
  Here is a list of the team members that you can call:
 
186
  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
187
  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
188
  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
189
+ 10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description.
190
  You must pass this entire file path to your managed agents for them to use as arguments to their tools. Example instruction to managed agent: "please help to transcribe this audio file (Path = path_to_file/valuation/abcxyz.mp3)"
191
+ 11. Most of your provided tools for web browsing such as `navigate_browser` are asynchronous. You must run them with asyncio.run() or await syntax to get the result.
192
  12. Don't give up! You're in charge of solving the task, not providing directions to solve it.
193
 
194
  Now Begin!
prompts/vlm_prompts.yaml CHANGED
@@ -2,11 +2,8 @@ system_prompt: |-
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at video/image processing, object detection, and text extraction from video/images.
4
  You will be given a task to solve as best you can.
5
- To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
6
- If a file path is provided in the task description, it is likely the path to a video/image file that you must use to solve the task.
7
- It is advised to search among your tools for one that you can use to load this video/image file from given path.
8
-
9
- Furthermore, to solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
10
 
11
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
12
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
@@ -14,6 +11,10 @@ system_prompt: |-
14
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
15
  In the end you have to return a final answer using the `final_answer` tool.
16
 
 
 
 
 
17
  Here are a few examples using notional tools:
18
  ---
19
  Task: "Generate an image of the oldest person in this document."
 
2
  You are an expert assistant who can solve any task using code blobs.
3
  You are also an expert at video/image processing, object detection, and text extraction from video/images.
4
  You will be given a task to solve as best you can.
5
+
6
+ To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
 
 
7
 
8
  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
9
  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
 
11
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
12
  In the end you have to return a final answer using the `final_answer` tool.
13
 
14
+ Furthermore, to solve the task, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
15
+ If a file path is provided in the task description, it is likely the path to a video/image file that you must use.
16
+ Use the tool named `get_video_from_file_path` or `get_image_from_file_path` to load this video/image file from the given path.
17
+
18
  Here are a few examples using notional tools:
19
  ---
20
  Task: "Generate an image of the oldest person in this document."
requirements.txt CHANGED
@@ -18,6 +18,7 @@ coloredlogs==15.0.1
18
  contourpy==1.3.2
19
  cycler==0.12.1
20
  dataclasses-json==0.6.7
 
21
  defusedxml==0.7.1
22
  deprecation==2.1.0
23
  dill==0.3.8
 
18
  contourpy==1.3.2
19
  cycler==0.12.1
20
  dataclasses-json==0.6.7
21
+ datasets==3.6.0
22
  defusedxml==0.7.1
23
  deprecation==2.1.0
24
  dill==0.3.8
scripts.sh CHANGED
@@ -88,3 +88,5 @@ apt-get -y install fonts-wqy-zenhei
88
  apt-get -y install fonts-tlwg-loma-otf
89
  apt-get -y install fonts-freefont-ttf
90
 
 
 
 
88
  apt-get -y install fonts-tlwg-loma-otf
89
  apt-get -y install fonts-freefont-ttf
90
 
91
+ apt-get -y install tesseract-ocr
92
+ apt-get -y install libtesseract-dev