Spaces:
Sleeping
Sleeping
agent with search, file read, youtube
Browse files- .DS_Store +0 -0
- .gitignore +4 -0
- __init__.py +0 -0
- agent.py +69 -0
- app.py +39 -14
- prompt.py +72 -0
- tool.py +81 -0
- tools/__init__.py +0 -0
- tools/web_search.py +47 -0
- utils/__init__.py +0 -0
- utils/fetch_file.py +38 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
gaia_files
|
__init__.py
ADDED
File without changes
|
agent.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
from smolagents import CodeAgent
|
3 |
+
from smolagents import OpenAIServerModel
|
4 |
+
from tool import fetch_webpage, read_file_tool, get_youtube_transcript
|
5 |
+
|
6 |
+
from smolagents import VisitWebpageTool, WikipediaSearchTool, PythonInterpreterTool, DuckDuckGoSearchTool, WebSearchTool, SpeechToTextTool
|
7 |
+
|
8 |
+
from prompt import gaia_prompt
|
9 |
+
|
10 |
+
# Load variables from a local .env file into the process environment before
# the model client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

# Model wrapper shared by the agent below. The commented-out line is an
# alternative model id kept for quick switching.
openai_nano_model = OpenAIServerModel(
    model_id="gpt-4.1-nano-2025-04-14",
    # model_id="o3-mini-2025-01-31",
)

# Single module-level agent instance; GAIA_Agent below reuses it for every
# question. It is given the custom tools from tool.py plus the built-in
# search and Python interpreter tools.
gaia_agent = CodeAgent(
    model=openai_nano_model,
    tools=[fetch_webpage, DuckDuckGoSearchTool(), PythonInterpreterTool(), read_file_tool, get_youtube_transcript], # WikipediaSearchTool(), VisitWebpageTool(max_output_length=60000)
    max_steps=5,
    verbosity_level=2,
    # Extra modules the agent-generated code is allowed to import.
    additional_authorized_imports=["requests", "bs4", "pandas", "numpy", "markdownify"]
)
|
24 |
+
|
25 |
+
class GAIA_Agent:
    """Callable wrapper that runs the shared agent on a GAIA question.

    The instance stores the GAIA system prompt and the module-level agent;
    calling the instance prepends the prompt to the question and returns the
    agent's answer as text.
    """

    def __init__(self):
        # Both objects are module-level singletons defined above/imported.
        self.system_prompt = gaia_prompt
        self.agent = gaia_agent

    def __call__(self, question: str) -> str:
        """Answer a single question.

        Args:
            question: The question text (optionally with a note about a
                downloaded file appended by the caller).

        Returns:
            The agent's final answer converted to ``str``. If anything raises
            while building the context or running the agent, the error
            message is printed and returned instead of being re-raised.
        """
        try:
            full_context = self.system_prompt + "\nTHE QUESTION:\n" + question
            final_answer = self.agent.run(full_context)
            # The agent may hand back a non-string object (e.g. a number or a
            # list); coerce it so the declared ``-> str`` contract holds.
            return str(final_answer)
        except Exception as e:
            error = f"An error occurred while processing the question: {e}"
            print(error)
            return error
|
42 |
+
|
43 |
+
# build context + append instructions and all
|
44 |
+
|
45 |
+
# clean answer function
|
46 |
+
|
47 |
+
if __name__ == "__main__":
|
48 |
+
pass
|
49 |
+
# gaia_agent.run("What is the weather in Mumbai?")
|
50 |
+
# answer = gaia_agent.run(
|
51 |
+
# f"""
|
52 |
+
# You are a general AI assistant. I will ask you a question. You can answer with the following template:[YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. Remember: GAIA requires exact answer matching. Just provide the factual answer.
|
53 |
+
|
54 |
+
# How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
|
55 |
+
# """
|
56 |
+
# )
|
57 |
+
|
58 |
+
# print(gaia_prompt)
|
59 |
+
# answer = gaia_agent.run("""
|
60 |
+
# You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
61 |
+
# You can search for results and then visit a webpage to get more information. Break down the problem into smaller sub-problems and solve them one by one.
|
62 |
+
# Think like a human.
|
63 |
+
|
64 |
+
# What is the final numeric output from the attached Python code?
|
65 |
+
|
66 |
+
# ----
|
67 |
+
|
68 |
+
# """)
|
69 |
+
# print(f"this is the final answer the gaia agent gave ---> {answer}")
|
app.py
CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
|
|
3 |
import requests
|
4 |
import inspect
|
5 |
import pandas as pd
|
|
|
|
|
6 |
|
7 |
# (Keep Constants as is)
|
8 |
# --- Constants ---
|
@@ -37,10 +39,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
37 |
api_url = DEFAULT_API_URL
|
38 |
questions_url = f"{api_url}/questions"
|
39 |
submit_url = f"{api_url}/submit"
|
|
|
40 |
|
41 |
# 1. Instantiate Agent ( modify this part to create your agent)
|
42 |
try:
|
43 |
-
agent = BasicAgent()
|
|
|
44 |
except Exception as e:
|
45 |
print(f"Error instantiating agent: {e}")
|
46 |
return f"Error initializing agent: {e}", None
|
@@ -73,19 +77,39 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
73 |
results_log = []
|
74 |
answers_payload = []
|
75 |
print(f"Running agent on {len(questions_data)} questions...")
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
if
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
if not answers_payload:
|
91 |
print("Agent did not produce any answers to submit.")
|
@@ -95,6 +119,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
95 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
96 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
97 |
print(status_update)
|
|
|
98 |
|
99 |
# 5. Submit
|
100 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
|
|
3 |
import requests
|
4 |
import inspect
|
5 |
import pandas as pd
|
6 |
+
from utils.fetch_file import download_file_if_any
|
7 |
+
from agent import GAIA_Agent
|
8 |
|
9 |
# (Keep Constants as is)
|
10 |
# --- Constants ---
|
|
|
39 |
api_url = DEFAULT_API_URL
|
40 |
questions_url = f"{api_url}/questions"
|
41 |
submit_url = f"{api_url}/submit"
|
42 |
+
file_path = f"{api_url}/files"
|
43 |
|
44 |
# 1. Instantiate Agent ( modify this part to create your agent)
|
45 |
try:
|
46 |
+
# agent = BasicAgent()
|
47 |
+
agent = GAIA_Agent()
|
48 |
except Exception as e:
|
49 |
print(f"Error instantiating agent: {e}")
|
50 |
return f"Error initializing agent: {e}", None
|
|
|
77 |
results_log = []
|
78 |
answers_payload = []
|
79 |
print(f"Running agent on {len(questions_data)} questions...")
|
80 |
+
to_answer = [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
|
81 |
+
for index, item in enumerate(questions_data):
|
82 |
+
|
83 |
+
if index + 1 in to_answer:
|
84 |
+
task_id = item.get("task_id")
|
85 |
+
question_text = item.get("question")
|
86 |
+
file_name = item.get('file_name')
|
87 |
+
file_path = None
|
88 |
+
|
89 |
+
if file_name:
|
90 |
+
try:
|
91 |
+
file_path = download_file_if_any(task_id)
|
92 |
+
except Exception as e:
|
93 |
+
file_path = None
|
94 |
+
|
95 |
+
if not task_id or question_text is None:
|
96 |
+
print(f"Skipping item with missing task_id or question: {item}")
|
97 |
+
continue
|
98 |
+
try:
|
99 |
+
agent_question = question_text
|
100 |
+
if file_path:
|
101 |
+
agent_question += f"\n\nA file was downloaded for this task and saved locally at:\n {file_path}\n"
|
102 |
+
|
103 |
+
submitted_answer = agent(agent_question)
|
104 |
+
|
105 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
106 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
107 |
+
except Exception as e:
|
108 |
+
print(f"Error running agent on task {task_id}: {e}")
|
109 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
110 |
+
|
111 |
+
else:
|
112 |
+
print("Do not answer")
|
113 |
|
114 |
if not answers_payload:
|
115 |
print("Agent did not produce any answers to submit.")
|
|
|
119 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
120 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
121 |
print(status_update)
|
122 |
+
print(submission_data)
|
123 |
|
124 |
# 5. Submit
|
125 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
prompt.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
final_answer = """When answering, provide ONLY the precise answer requested.
|
4 |
+
Do not include explanations, steps, reasoning, or additional text.
|
5 |
+
Be direct and specific. GAIA benchmark requires exact matching answers.
|
6 |
+
"""
|
7 |
+
|
8 |
+
final_answer1 = prompt = """
|
9 |
+
Here is a user-given task and the agent steps: {agent_memory.get_succinct_steps()}. Now here is the FINAL ANSWER that was given:
|
10 |
+
{final_answer}
|
11 |
+
Ensure the FINAL ANSWER is in the right format as asked for by the task. Here are the instructions that you need to evaluate:
|
12 |
+
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
13 |
+
If you are asked for a number, don't use commas to write your number. Don't use units such as $ or percent sign unless specified otherwise. Write your number in Arabic numbers (such as 9 or 3 or 1093) unless specified otherwise.
|
14 |
+
If you are asked for a currency in your answer, use the symbol for that currency. For example, if you are asked for the answers in USD, an example answer would be $40.00
|
15 |
+
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
16 |
+
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
17 |
+
If you are asked for a comma separated list, ensure you only return the content of that list, and NOT the brackets '[]'
|
18 |
+
First list reasons why it is/is not in the correct format and then write your final decision: PASS in caps lock if it is satisfactory, FAIL if it is not.
|
19 |
+
"""
|
20 |
+
|
21 |
+
|
22 |
+
# System prompt for a coordinating ("top") agent that delegates to a
# web_agent and an audiovideo_agent. Spelling and grammar of the
# reasoning instruction were corrected; the rest of the text is unchanged.
sys_prompt = """You are the top agent of a multi-agent system that can answer questions by coordinating the work of other agents.
You will receive a question and you will decide which agent to use to answer it.
You can use the web_agent to search the web for information and for fetching the content of a web page, or the audiovideo_agent to extract information from video or audio files.
You can also use your own knowledge to answer the question.
You need to respect the output format that is given to you.
Finding the correct answer to the question needs reasoning and planning: read the question carefully, think step by step and do not skip any steps.
"""
|
29 |
+
|
30 |
+
sys_prompt1 = """
|
31 |
+
You are a helpful assistant tasked with answering questions using a set of tools.
|
32 |
+
|
33 |
+
Your final answer must strictly follow this format:
|
34 |
+
FINAL ANSWER: [ANSWER]
|
35 |
+
|
36 |
+
Only write the answer in that exact format. Do not explain anything. Do not include any other text.
|
37 |
+
|
38 |
+
If you are provided with a similar question and its final answer, and the current question is **exactly the same**, then simply return the same final answer without using any tools.
|
39 |
+
|
40 |
+
Only use tools if the current question is different from the similar one.
|
41 |
+
|
42 |
+
Examples:
|
43 |
+
- FINAL ANSWER: FunkMonk
|
44 |
+
- FINAL ANSWER: Paris
|
45 |
+
- FINAL ANSWER: 128
|
46 |
+
|
47 |
+
If you do not follow this format exactly, your response will be considered incorrect."""
|
48 |
+
|
49 |
+
|
50 |
+
# Prompt prepended to every GAIA question before it is handed to the agent.
# Misspellings and broken grammar in the instructions were corrected
# ("plannig", "carrefully", "you can be use it"); the answer-format rules
# are left word-for-word as they were.
# NOTE(review): the prompt mentions audio transcription and image analysis —
# confirm the agent is actually given tools for those.
gaia_prompt = """
You are a helpful general AI assistant. You are tasked with answering questions from GAIA benchmark.
You can answer questions using a set of tools or you can also use your own knowledge to answer them.

Finding the correct answer to the question needs reasoning and planning: read the question carefully, think step by step and do not skip any steps.
You have access to various tools, including, but not limited to, web search, visiting webpage, executing python code, reading files, transcribing audio, analysing images.
Break down the problem into smaller sub-problems and solve them one by one.

If the question has an associated file, you can use it to answer the question.

Your final answer must strictly follow this format:
[FINAL ANSWER]

YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
Reminder! When giving final answer, provide ONLY the precise answer requested. Do not include explanations, steps, reasoning, or additional text. Be direct and specific.
GAIA benchmark requires exact matching answers. If you do not follow this format exactly, your response will be considered incorrect.

BEFORE GIVING THE FINAL ANSWER DOUBLE CHECK THE EXACT FORMAT IN WHICH THE ANSWER IS NEEDED.

"""
|
tool.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from smolagents import Tool, tool
|
2 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
3 |
+
|
4 |
+
@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visit a website / url and fetch the content of the webpage.
    If markdown conversion is enabled, it will remove script and style and return the text content as markdown else return raw unfiltered HTML

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

    Returns:
        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML of the URL.
    """
    # Imports are kept inside the function, as in the original tool definition.
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script and style tags before conversion.
    for element in soup(["script", "style"]):
        element.extract()

    # For wikipedia only keep the main content when it can be located.
    # Otherwise fall back to the cleaned document (previously this fallback
    # converted the raw response text, bypassing the clean-up above).
    root = soup
    if "wikipedia.org" in url:
        main_content = soup.find("main", {"id": "content"})
        if main_content:
            root = main_content

    return md(str(root), strip=['script', 'style'], heading_style="ATX").strip()
|
46 |
+
|
47 |
+
|
48 |
+
@tool
def read_file_tool(file_path: str) -> str:
    """
    Tool to read a text file and return its content.

    Args:
        file_path (str): Path to the file to read.

    Returns:
        str: Content of the file or error message.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        # Errors are reported back to the agent as text rather than raised,
        # so a missing or non-text file does not abort the agent step.
        return f"Error reading file: {str(e)}"
|
64 |
+
|
65 |
+
|
66 |
+
@tool
def get_youtube_transcript(video_id: str) -> str:
    """
    Fetches the transcript of a YouTube video given its video ID.

    Args:
        video_id (str): The ID of the YouTube video. Pass in the video ID, NOT the video URL. For a video with the URL https://www.youtube.com/watch?v=12345 the ID is 12345.

    Returns:
        str: The transcript of the YouTube video. as a single string with each line separated by a newline character.
    """
    # to_raw_data() yields a list of dicts such as
    # {'text': 'Hey there', 'start': 0.0, 'duration': 1.54}; only the 'text'
    # field of each entry is kept, one entry per output line.
    entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
    text_lines = []
    for entry in entries:
        text_lines.append(entry['text'])
    return "\n".join(text_lines)
|
tools/__init__.py
ADDED
File without changes
|
tools/web_search.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from smolagents import tool
|
2 |
+
|
3 |
+
@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visits a website and fetches the content of a given URL / webpage.
    If markdown conversion is enabled, it will remove script and style and return the text content as markdown else return raw unfiltered HTML

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

    Returns:
        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML of the URL.
    """
    # Imports are kept inside the function, as in the original tool definition.
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)

    # The former `response.text is not None` check with its "not none" debug
    # print was removed: it only produced stray console output.
    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script and style tags before conversion.
    for element in soup(["script", "style"]):
        element.extract()

    # For wikipedia only keep the main content when it can be located.
    # Otherwise fall back to the cleaned document (previously this fallback
    # converted the raw response text, bypassing the clean-up above).
    root = soup
    if "wikipedia.org" in url:
        main_content = soup.find("main", {"id": "content"})
        if main_content:
            root = main_content

    return md(str(root), strip=['script', 'style'], heading_style="ATX").strip()
|
utils/__init__.py
ADDED
File without changes
|
utils/fetch_file.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Base URL of the scoring service; task files are requested from
# "<DEFAULT_API_URL>/files/<task_id>" below.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Folder (relative to the current working directory) where downloaded task
# files are stored.
files_folder = "gaia_files"
# Runs at import time so the folder exists before any download is attempted.
os.makedirs(files_folder, exist_ok=True)
|
7 |
+
|
8 |
+
def download_file_if_any(task_id: str) -> "str | None":
    """Download the file attached to a task, if the service provides one.

    Requests ``<DEFAULT_API_URL>/files/<task_id>`` and, when the response
    carries a ``content-disposition`` header with a ``filename=`` entry,
    saves the body under ``files_folder`` using that name.

    Args:
        task_id: Identifier of the task whose file should be fetched.

    Returns:
        The path of the saved file (relative to the working directory), or
        ``None`` when the request fails, no filename is advertised, or the
        filename cannot be used.
    """
    url = f"{DEFAULT_API_URL}/files/{task_id}"
    try:
        # A timeout keeps one unresponsive download from stalling the run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None

    content_disposition = response.headers.get("content-disposition")
    if not content_disposition or "filename=" not in content_disposition:
        return None

    # Take the value after "filename=", dropping any further header
    # parameters and surrounding quotes.
    raw_name = content_disposition.split("filename=")[1].split(";")[0].strip().strip('"')
    # The name comes from the server: keep only its final path component so
    # it cannot point outside files_folder.
    filename = os.path.basename(raw_name.replace("\\", "/"))
    if not filename:
        return None

    # path relative from app.py
    file_path = os.path.join(files_folder, filename)
    try:
        with open(file_path, "wb") as file:
            file.write(response.content)
    except ValueError as e:
        # e.g. a filename containing an embedded null byte
        print(f"Error saving downloaded file: {e}")
        return None

    return str(file_path)
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
# Manual smoke test: when run as a script, download the file for one
# hard-coded task id and print the resulting path (or None).
if __name__ == "__main__":
    print(download_file_if_any("f918266a-b3e0-4914-865d-4faa564f1aef"))
|