huytofu92 committed on
Commit
915154c
·
1 Parent(s): 5b6dc13

Agents' guides and prompts

Browse files
Files changed (2) hide show
  1. mini_agents.py +6 -6
  2. prompts.yaml +3 -1
mini_agents.py CHANGED
@@ -2,7 +2,7 @@ from smolagents import CodeAgent, InferenceClientModel
2
  from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
3
  from tools import tavily_search_tool, visit_webpage_tool
4
  from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
5
- from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file
6
  from audio_tools import transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
7
  from community_tools import community_tools, get_youtube_transcript_from_url
8
  import os
@@ -41,9 +41,9 @@ audio_agent = CodeAgent(
41
  tools=[transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
42
  max_steps=6,
43
  # prompt_templates=PROMPT_TEMPLATE["audio_agent"],
44
- additional_authorized_imports=["pytube", "pydub", "pyAudioAnalysis", "base64", "io", "sklearn", "scipy", "numpy", "pandas", "json", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
45
  name="audio_agent",
46
- description="This agent is responsible for rocessing audio, transcribing audio and extracting text from it."
47
  )
48
 
49
  vlm_model = InferenceClientModel(
@@ -53,12 +53,12 @@ vlm_model = InferenceClientModel(
53
 
54
  vlm_agent = CodeAgent(
55
  model=vlm_model,
56
- tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file],
57
  max_steps=6,
58
  # prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
59
- additional_authorized_imports=["cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'youtube_dl', 'bs4'],
60
  name="vlm_agent",
61
- description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
62
  )
63
 
64
  arithmetic_model = InferenceClientModel(
 
2
  from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
3
  from tools import tavily_search_tool, visit_webpage_tool
4
  from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
5
+ from vlm_tools import image_processing, object_detection_tool, ocr_scan_tool, extract_images_from_video, get_image_from_file_path, get_video_from_file_path
6
  from audio_tools import transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
7
  from community_tools import community_tools, get_youtube_transcript_from_url
8
  import os
 
41
  tools=[transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
42
  max_steps=6,
43
  # prompt_templates=PROMPT_TEMPLATE["audio_agent"],
44
+ additional_authorized_imports=["pytube", "pytube3", "youtube_dl", "pydub", "pyAudioAnalysis", "base64", "io", "sklearn", "scipy", "numpy", "pandas", "json", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
45
  name="audio_agent",
46
+ description="This agent is responsible for processing audio, transcribing audio and extracting text from it. It cannot process videos."
47
  )
48
 
49
  vlm_model = InferenceClientModel(
 
53
 
54
  vlm_agent = CodeAgent(
55
  model=vlm_model,
56
+ tools=[image_processing, object_detection_tool, ocr_scan_tool, extract_images_from_video, get_image_from_file_path, get_video_from_file_path],
57
  max_steps=6,
58
  # prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
59
+ additional_authorized_imports=["pytube", "pytube3", "youtube_dl", "cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'youtube_dl', 'bs4'],
60
  name="vlm_agent",
61
+ description="This agent is responsible for downloading images or videos, processing images or videos, detecting objects in them and extracting text from them. It cannot process audios."
62
  )
63
 
64
  arithmetic_model = InferenceClientModel(
prompts.yaml CHANGED
@@ -192,7 +192,8 @@ system_prompt: |-
192
  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
193
  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
194
  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
195
- 10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
 
196
 
197
  Now Begin!
198
  planning:
@@ -320,6 +321,7 @@ managed_agent:
320
  {{task}}
321
  ---
322
  You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
 
323
 
324
  Your final_answer WILL HAVE to contain these parts:
325
  ### 1. Task outcome (short version):
 
192
  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
193
  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
194
  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
195
+ 10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description (example: abcxyz.mp3). You must pass this file path to your managed agents for them to use as arguments to their tools.
196
+ 11. Don't give up! You're in charge of solving the task, not providing directions to solve it.
197
 
198
  Now Begin!
199
  planning:
 
321
  {{task}}
322
  ---
323
  You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
324
+ Your manager may pass you a file path to an audio or video file. You must use this file path as an argument to your tools.
325
 
326
  Your final_answer WILL HAVE to contain these parts:
327
  ### 1. Task outcome (short version):