huytofu92 committed on
Commit
30ffa0e
·
1 Parent(s): 280d2e0

Add file support and youtube transcript 2nd choice

Browse files
Files changed (9) hide show
  1. .gitattributes +4 -0
  2. app.py +5 -0
  3. audio_tools.py +8 -2
  4. community_tools.py +15 -4
  5. load_data.py +8 -0
  6. mini_agents.py +4 -4
  7. requirements.txt +1 -0
  8. utils.py +71 -0
  9. vlm_tools.py +21 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
37
+ *.jpg filter=lfs diff=lfs merge=lfs -text
38
+ *.png filter=lfs diff=lfs merge=lfs -text
39
+ *.pptx filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  import requests
4
  import pandas as pd
5
  from mini_agents import master_agent
 
6
 
7
  # (Keep Constants as is)
8
  # --- Constants ---
@@ -77,10 +78,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None, mock_submission: bool =
77
  for item in questions_data:
78
  task_id = item.get("task_id")
79
  question_text = item.get("question")
 
80
  if not task_id or question_text is None:
81
  print(f"Skipping item with missing task_id or question: {item}")
82
  continue
83
  try:
 
 
 
84
  submitted_answer = agent(question_text)
85
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
86
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
3
  import requests
4
  import pandas as pd
5
  from mini_agents import master_agent
6
+ from utils import get_full_file_path
7
 
8
  # (Keep Constants as is)
9
  # --- Constants ---
 
78
  for item in questions_data:
79
  task_id = item.get("task_id")
80
  question_text = item.get("question")
81
+ file_path = get_full_file_path(task_id)
82
  if not task_id or question_text is None:
83
  print(f"Skipping item with missing task_id or question: {item}")
84
  continue
85
  try:
86
+ question_text = question_text + "\n\nHere is the task_id for this question: " + task_id
87
+ if file_path:
88
+ question_text = question_text + f"\n\nHere is also the path to the file for the task (file name matches with task ID and is not in plain English): {file_path}"
89
  submitted_answer = agent(question_text)
90
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
91
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
audio_tools.py CHANGED
@@ -43,8 +43,14 @@ def audio_to_base64(file_path_or_key: str, state: dict) -> str:
43
  file_path = file_path_or_key
44
 
45
  # Load the audio file
46
- audio = AudioSegment.from_file(file_path)
47
-
 
 
 
 
 
 
48
  # Export the audio to a BytesIO object
49
  buffer = BytesIO()
50
  audio.export(buffer, format="wav") # You can change the format if needed
 
43
  file_path = file_path_or_key
44
 
45
  # Load the audio file
46
+ try:
47
+ audio = AudioSegment.from_file(file_path)
48
+ except Exception as e:
49
+ current_file_path = os.path.abspath(__file__)
50
+ current_file_dir = os.path.dirname(current_file_path)
51
+ file_path = os.path.join(current_file_dir, file_path.replace("Final_Assignment_Template", ""))
52
+ audio = AudioSegment.from_file(file_path)
53
+
54
  # Export the audio to a BytesIO object
55
  buffer = BytesIO()
56
  audio.export(buffer, format="wav") # You can change the format if needed
community_tools.py CHANGED
@@ -2,6 +2,7 @@ from langchain_google_community import GooglePlacesTool
2
  from langchain_community.agent_toolkits.load_tools import load_tools
3
  from langchain_community.document_loaders import YoutubeLoader
4
  from smolagents.tools import Tool, tool
 
5
 
6
  google_map_tool = Tool.from_langchain(GooglePlacesTool())
7
 
@@ -24,7 +25,17 @@ def get_youtube_transcript_from_url(video_url: str)->str:
24
  The transcript of the YouTube video as a string
25
  """
26
  video_id = video_url.split("=")[1]
27
- youtube_loader = YoutubeLoader(video_id=video_id)
28
- docs = youtube_loader.load()
29
- transcript = docs[0].page_content
30
- return transcript
 
 
 
 
 
 
 
 
 
 
 
2
  from langchain_community.agent_toolkits.load_tools import load_tools
3
  from langchain_community.document_loaders import YoutubeLoader
4
  from smolagents.tools import Tool, tool
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
 
7
  google_map_tool = Tool.from_langchain(GooglePlacesTool())
8
 
 
25
  The transcript of the YouTube video as a string
26
  """
27
  video_id = video_url.split("=")[1]
28
+ try:
29
+ ytt_api = YouTubeTranscriptApi()
30
+ fetched_transcript = ytt_api.fetch(video_id)
31
+
32
+ # is iterable
33
+ transcript = ""
34
+ for snippet in fetched_transcript:
35
+ transcript += f"{snippet['text']}\n"
36
+ return transcript
37
+ except Exception as e:
38
+ youtube_loader = YoutubeLoader(video_id=video_id)
39
+ docs = youtube_loader.load()
40
+ transcript = docs[0].page_content
41
+ return transcript
load_data.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+
3
+ def download_dataset(dataset_name: str, name: str):
4
+ dataset = datasets.load_dataset(dataset_name, name, trust_remote_code=True)
5
+ return dataset
6
+
7
+ dataset = download_dataset("gaia-benchmark/GAIA", "2023_all")
8
+ dataset.save_to_disk("GAIA_2023_all")
mini_agents.py CHANGED
@@ -2,7 +2,7 @@ from smolagents import CodeAgent, InferenceClientModel
2
  from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
3
  from tools import tavily_search_tool, visit_webpage_tool
4
  from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
5
- from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video
6
  from audio_tools import transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
7
  from community_tools import community_tools, get_youtube_transcript_from_url
8
  import os
@@ -53,10 +53,10 @@ vlm_model = InferenceClientModel(
53
 
54
  vlm_agent = CodeAgent(
55
  model=vlm_model,
56
- tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video],
57
  max_steps=6,
58
  # prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
59
- additional_authorized_imports=["cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io"],
60
  name="vlm_agent",
61
  description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
62
  )
@@ -127,7 +127,7 @@ master_agent = CodeAgent(
127
  tools=[sort_list, get_youtube_transcript_from_url, *community_tools, tavily_search_tool, visit_webpage_tool],
128
  add_base_tools=True,
129
  max_steps=20,
130
- additional_authorized_imports=["math", "pandas", "json", "numpy", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'sklearn', 'scipy', 'datetime', 'typing'],
131
  verbosity_level=logging.INFO,
132
  planning_interval=4,
133
  prompt_templates=PROMPT_TEMPLATE["master_agent"],
 
2
  from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
3
  from tools import tavily_search_tool, visit_webpage_tool
4
  from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
5
+ from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file
6
  from audio_tools import transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
7
  from community_tools import community_tools, get_youtube_transcript_from_url
8
  import os
 
53
 
54
  vlm_agent = CodeAgent(
55
  model=vlm_model,
56
+ tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file],
57
  max_steps=6,
58
  # prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
59
+ additional_authorized_imports=["cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
60
  name="vlm_agent",
61
  description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
62
  )
 
127
  tools=[sort_list, get_youtube_transcript_from_url, *community_tools, tavily_search_tool, visit_webpage_tool],
128
  add_base_tools=True,
129
  max_steps=20,
130
+ additional_authorized_imports=["math", "pandas", "json", "numpy", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'sklearn', 'scipy', 'datetime', 'typing', 'markdownify', 'requests', 'json'],
131
  verbosity_level=logging.INFO,
132
  planning_interval=4,
133
  prompt_templates=PROMPT_TEMPLATE["master_agent"],
requirements.txt CHANGED
@@ -65,6 +65,7 @@ langchain-text-splitters==0.3.8
65
  langsmith==0.3.42
66
  lxml==5.4.0
67
  markdown-it-py==3.0.0
 
68
  MarkupSafe==3.0.2
69
  marshmallow==3.26.1
70
  matplotlib==3.10.3
 
65
  langsmith==0.3.42
66
  lxml==5.4.0
67
  markdown-it-py==3.0.0
68
+ markdownify==1.1.0
69
  MarkupSafe==3.0.2
70
  marshmallow==3.26.1
71
  matplotlib==3.10.3
utils.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import json
import os
from typing import Optional

from smolagents.tools import tool
5
+
6
def find_file_by_task_id(task_id: str, metadata_path: str = "Final_Assignment_Template/validation/metadata.jsonl") -> Optional[str]:
    """
    Look up the attachment filename recorded for *task_id* in a metadata.jsonl file.

    Args:
        task_id (str): The task_id to search for.
        metadata_path (str): Path to the metadata.jsonl file. Defaults to the
            validation directory path.

    Returns:
        Optional[str]: The filename if found, None if not found or if the
        task_id has no associated file.

    Raises:
        FileNotFoundError: If *metadata_path* does not exist.

    Example:
        >>> find_file_by_task_id("32102e3e-d12a-4209-9163-7b3a104efe5d")
        "32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx"
    """
    if not os.path.exists(metadata_path):
        raise FileNotFoundError(f"Metadata file not found at {metadata_path}")

    with open(metadata_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                record = json.loads(raw_line.strip())
            except json.JSONDecodeError:
                # Tolerate malformed lines; later lines may still be valid.
                continue
            if record.get('task_id') != task_id:
                continue
            # An empty "file_name" means the task has no attachment.
            return record.get('file_name') or None

    return None
35
+
36
def get_full_file_path(task_id: str, base_dir: str = "Final_Assignment_Template/validation") -> Optional[str]:
    """
    Get the full file path for a given task_id if it exists.

    The metadata file is looked up at ``<base_dir>/metadata.jsonl`` so a
    non-default *base_dir* is honoured consistently (previously the metadata
    path was hard-coded and silently ignored *base_dir*; the default behavior
    is unchanged since the defaults coincide).

    Args:
        task_id (str): The task_id to search for.
        base_dir (str): Base directory where files and metadata.jsonl are
            stored. Defaults to the validation directory.

    Returns:
        Optional[str]: Full path to the file if found, None if not found.

    Example:
        >>> get_full_file_path("32102e3e-d12a-4209-9163-7b3a104efe5d")
        "Final_Assignment_Template/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx"
    """
    metadata_path = os.path.join(base_dir, "metadata.jsonl")
    filename = find_file_by_task_id(task_id, metadata_path)
    if not filename:
        return None

    full_path = os.path.join(base_dir, filename)
    # Only report paths that actually exist on disk.
    return full_path if os.path.exists(full_path) else None
57
+
58
@tool
def load_file_from_task_id(task_id: str) -> str:
    """
    Load a file related to a given task_id if it exists.

    Attachments may be binary (.xlsx, .mp3, .png are LFS-tracked in this
    repo), so the file is read as bytes: text-mode reading crashed with
    UnicodeDecodeError on such files. UTF-8 content is returned as text;
    anything else is returned base64-encoded so the return type stays str.

    Args:
        task_id: The task_id to load the file for
    Returns:
        The file content (text, or base64 for binary files), or the string
        "File not found" if no file exists for this task.
    """
    file_path = get_full_file_path(task_id)
    if not file_path:
        return "File not found"
    with open(file_path, 'rb') as file:
        raw = file.read()
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Binary attachment: keep the str contract via base64.
        return base64.b64encode(raw).decode('ascii')
vlm_tools.py CHANGED
@@ -4,6 +4,7 @@ import pytesseract
4
  import requests
5
  import base64
6
  import onnxruntime
 
7
  from io import BytesIO
8
  from PIL import Image
9
  from langchain_core.tools import tool as langchain_tool
@@ -103,6 +104,26 @@ def download_image(image_url: str)->str:
103
  image = base64.b64encode(response.content).decode('utf-8')
104
  return image
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  @tool
107
  def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
108
  """
 
4
  import requests
5
  import base64
6
  import onnxruntime
7
+ import os
8
  from io import BytesIO
9
  from PIL import Image
10
  from langchain_core.tools import tool as langchain_tool
 
104
  image = base64.b64encode(response.content).decode('utf-8')
105
  return image
106
 
107
@tool
def get_image_from_file(file_path: str) -> str:
    """
    Get an image from a file.

    If the path cannot be opened as given (e.g. it is relative to the
    repository root rather than to this module), retry with the path
    re-anchored next to this file.

    Args:
        file_path: The path to the file
    Returns:
        The image as a base64 string
    """
    try:
        with open(file_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except OSError:
        current_file_dir = os.path.dirname(os.path.abspath(__file__))
        # Strip the repo prefix AND any leading separator it leaves behind:
        # os.path.join discards current_file_dir entirely when its second
        # argument is absolute (e.g. "/validation/x.png"), which made the
        # original fallback point at the filesystem root.
        relative = file_path.replace("Final_Assignment_Template", "").lstrip("/\\")
        fallback_path = os.path.join(current_file_dir, relative)
        with open(fallback_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
126
+
127
  @tool
128
  def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
129
  """