Spaces:
Runtime error
Runtime error
Omachoko
Enhanced GAIA agent: full API integration, advanced reasoning, expanded tools, and UI overhaul for 30%+ benchmark compliance
b56f671
#!/usr/bin/env python3
"""
🚀 Enhanced GAIA Agent Interface - Full API Integration
Complete Gradio interface for GAIA benchmark with API connectivity and scoring
"""
import json
import os
from datetime import datetime

import gradio as gr

from gaia_agent import GAIAAgent
class GAIAInterface:
    """Session state and UI callbacks for the GAIA benchmark interface.

    Wraps a ``GAIAAgent`` and keeps three lists for the lifetime of the
    object: the questions last fetched from the API, the answers produced so
    far (at most one per task id), and the scores returned by earlier
    submissions. Every public method returns display-ready strings and
    reports failures as text instead of raising.
    """

    # Score (in percent) at which the progress report switches to the
    # congratulation message.
    TARGET_SCORE = 30

    def __init__(self):
        """Create the agent and start with empty session state."""
        self.agent = GAIAAgent()
        self.current_questions = []
        self.answered_questions = []
        self.score_history = []

    def fetch_questions(self):
        """Fetch the question list through the agent and cache it.

        Returns:
            A status message; the cache is only replaced when the agent
            returns a non-empty result.
        """
        try:
            questions = self.agent.get_questions()
            if questions:
                self.current_questions = questions
                return f"โ Fetched {len(questions)} questions from GAIA API"
            return "โ Failed to fetch questions from GAIA API"
        except Exception as e:
            return f"โ Error fetching questions: {str(e)}"

    def get_random_question(self):
        """Ask the agent for one random question and format it for display.

        Returns:
            A ``(info_markdown, task_id, question)`` tuple. On failure the
            first element carries the error text and the other two are empty
            strings.
        """
        try:
            question_data = self.agent.get_random_question()
            if not question_data:
                return "โ Failed to fetch random question", "", ""

            task_id = question_data.get('task_id', 'unknown')
            question = question_data.get('Question', 'No question found')
            level = question_data.get('Level', 'Unknown')
            files = question_data.get('file_name', None)

            info = f"๐ **Task ID:** {task_id}\n"
            info += f"๐ฏ **Level:** {level}\n"
            if files:
                info += f"๐ **Associated Files:** {files}\n"
            info += f"โ **Question:** {question}"
            return info, task_id, question
        except Exception as e:
            return f"โ Error: {str(e)}", "", ""

    def _record_answer(self, task_id, question, clean_answer):
        """Store the answer for ``task_id``, replacing any earlier attempt."""
        entry = {
            "task_id": task_id,
            "question": question,
            "submitted_answer": clean_answer,
            "timestamp": datetime.now().isoformat(),
        }
        # One entry per task: answering the same task again must not produce
        # duplicate task ids in the submission payload.
        for index, existing in enumerate(self.answered_questions):
            if existing["task_id"] == task_id:
                self.answered_questions[index] = entry
                return
        self.answered_questions.append(entry)

    def process_question_with_files(self, question, task_id=None):
        """Run the agent on ``question`` and return the formatted answer.

        Args:
            question: Question text; ``None``, empty or whitespace-only input
                is rejected with a prompt message.
            task_id: Optional task id. It is forwarded to the agent, and when
                truthy the cleaned answer is remembered for later submission.

        Returns:
            The cleaned answer followed by the last five entries of the
            agent's ``reasoning_memory``, or an error message.
        """
        if not question or not question.strip():
            return "Please enter a question or fetch one from GAIA API."
        try:
            answer = self.agent.query(question, task_id=task_id, max_steps=15)
            clean_answer = self.agent.clean_for_api_submission(answer)
            # Only answers tied to a task id can be submitted for scoring.
            if task_id:
                self._record_answer(task_id, question, clean_answer)
            recent_steps = self.agent.reasoning_memory[-5:]
            return f"โ **Answer:** {clean_answer}\n\n๐ง **Reasoning Memory:**\n" + "\n".join(map(str, recent_steps))
        except Exception as e:
            return f"โ Error: {str(e)}"

    def submit_answers_for_scoring(self, username, agent_code_url):
        """Submit every stored answer through the agent and record the score.

        Args:
            username: Hugging Face username; surrounding whitespace is
                stripped before it is sent.
            agent_code_url: URL of the agent code; stripped the same way.

        Returns:
            A success report including the raw result as JSON, or a failure
            message. A score is appended to ``score_history`` only when the
            result is a dict without an ``"error"`` key.
        """
        if not username or not username.strip():
            return "โ Please provide your Hugging Face username"
        if not agent_code_url or not agent_code_url.strip():
            return "โ Please provide your agent code URL (Hugging Face Space)"
        if not self.answered_questions:
            return "โ No answered questions to submit. Please answer some questions first."
        try:
            # The API payload only needs the task id and the answer text.
            answers = [
                {
                    "task_id": item["task_id"],
                    "submitted_answer": item["submitted_answer"],
                }
                for item in self.answered_questions
            ]
            result = self.agent.submit_answer(
                username.strip(), agent_code_url.strip(), answers
            )
            if not isinstance(result, dict):
                # Nothing usable came back; report it as a failed submission
                # rather than surfacing a TypeError from the membership test.
                return "โ **Submission Failed:** Unknown error"
            if "error" in result:
                return f"โ **Submission Failed:** {result.get('error', 'Unknown error')}"

            score = result.get("score", 0)
            if score is None:
                # Keep the history numeric so max() and comparisons keep working.
                score = 0
            self.score_history.append({
                "score": score,
                "questions_answered": len(answers),
                "timestamp": datetime.now().isoformat(),
            })
            return f"โ **Submission Successful!**\n\n๐ **Score:** {score}%\n๐ฏ **Questions Answered:** {len(answers)}\n\n๐ **Result Details:**\n{json.dumps(result, indent=2)}"
        except Exception as e:
            return f"โ Error submitting answers: {str(e)}"

    def get_progress_stats(self):
        """Summarise fetched questions, stored answers and score history.

        Returns:
            A markdown report. Latest and best score are 0 when nothing has
            been submitted yet; the closing line depends on whether the
            latest score reaches ``TARGET_SCORE``.
        """
        total_questions = len(self.current_questions)
        answered_count = len(self.answered_questions)
        scores = [item["score"] for item in self.score_history]
        latest_score = scores[-1] if scores else 0
        best_score = max(scores, default=0)

        stats = f"๐ **Progress Statistics**\n\n"
        stats += f"๐ฏ **Questions Available:** {total_questions}\n"
        stats += f"โ **Questions Answered:** {answered_count}\n"
        stats += f"๐ **Latest Score:** {latest_score}%\n"
        stats += f"๐ **Best Score:** {best_score}%\n"
        stats += f"๐๏ธ **Target:** {self.TARGET_SCORE}% (for certification)\n\n"
        if latest_score >= self.TARGET_SCORE:
            stats += "๐ **Congratulations! You've achieved the target score for certification!**"
        else:
            # Round so float scores do not print binary noise such as
            # 17.699999999999996; integer scores still print as integers.
            remaining = round(self.TARGET_SCORE - latest_score, 2)
            stats += f"๐ **{remaining}% more needed for certification**"
        return stats

    def clear_session(self):
        """Forget the stored answers; fetched questions and scores are kept."""
        self.answered_questions = []
        return "โ Session cleared. Ready for new questions."
# Single shared interface object; every Gradio callback below is one of its
# bound methods.
interface = GAIAInterface()

# Static markdown shown above the tabs.
_INTRO_MARKDOWN = """
# ๐ Enhanced GAIA Agent - Complete GAIA Benchmark Implementation
**๐ฏ Target: 30%+ Performance for Course Certification**
## ๐ Key Features:
- **๐ Full GAIA API Integration** - Fetch real questions and submit for scoring
- **๐ File Processing** - Automatic download and analysis of task files
- **๐ง Enhanced Multi-Step Reasoning** - Advanced tool orchestration
- **๐ Real-time Progress Tracking** - Monitor your performance
- **๐ Leaderboard Submission** - Submit scores to student leaderboard
"""

# Static markdown for the "Agent Details" tab.
_AGENT_DETAILS_MARKDOWN = """
### ๐ง Enhanced Agent Capabilities
#### ๐ง **Tool Arsenal** (9 Enhanced Tools):
1. **๐งฎ Enhanced Calculator** - Complex mathematical operations and multi-step calculations
2. **๐ Enhanced Web Search** - Expanded knowledge base with 20+ countries, astronomy, history
3. **๐ผ๏ธ Image Analyzer** - Simulated visual content processing and spatial reasoning
4. **๐ Document Reader** - File content extraction and analysis
5. **๐ File Processor** - Download and process GAIA task files (TXT, JSON, CSV)
6. **๐ Date Calculator** - Temporal reasoning and age calculations
7. **๐ Unit Converter** - Length, temperature, and weight conversions
8. **๐ Text Analyzer** - Content analysis and pattern extraction
9. **๐ง Reasoning Chain** - Multi-step logical synthesis
#### ๐ฏ **GAIA Compliance Features**:
- **Level 1**: Basic questions (<5 steps) โ
- **Level 2**: Multi-step reasoning (5-10 steps) โ
- **Level 3**: Complex long-term planning โ
- **File Processing**: Automatic download and analysis โ
- **API Integration**: Full GAIA benchmark connectivity โ
- **Clean Formatting**: Exact match answer preparation โ
#### ๐ **Performance Targets**:
- **Minimum Required**: 30% accuracy for certification
- **Current Baseline**: GPT-4 with plugins ~15%
- **Enhanced Target**: 35-45% with optimized knowledge base
- **Human Performance**: ~92% (reference point)
#### ๐ง **Enhanced Knowledge Base**:
- **Geography**: 20+ countries and capitals
- **Astronomy**: Solar system facts, planet classifications
- **History**: Key events with dates and figures
- **Mathematics**: Constants and conversion factors
- **Arts**: Famous paintings and artists
"""

# Sample prompts offered on the manual-input tab.
_EXAMPLE_QUESTIONS = [
    "What is 25 + 37?",
    "What is the capital of Germany?",
    "If there are 8 planets and 4 are gas giants, how many are not gas giants?",
    "Who was the US president when the Berlin Wall fell?",
    "List the fruits in the painting in clockwise order starting from 12 o'clock",
    "Convert 100 degrees Celsius to Fahrenheit",
]

# Enhanced Gradio Interface
# NOTE(review): Row/Column nesting below is inferred from statement order --
# confirm against the rendered layout.
with gr.Blocks(title="๐ Enhanced GAIA Agent - Full API Integration", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Tabs():
        # Tab 1: GAIA Question Processing
        with gr.TabItem("๐ฏ GAIA Questions"):
            gr.Markdown("### Fetch and Process Real GAIA Benchmark Questions")
            with gr.Row():
                with gr.Column(scale=1):
                    fetch_btn = gr.Button("๐ Fetch Questions from API", variant="secondary")
                    random_question_btn = gr.Button("๐ฒ Get Random Question", variant="primary")
                    fetch_status = gr.Textbox(label="๐ก API Status", interactive=False)
                with gr.Column(scale=2):
                    question_info = gr.Markdown("Click 'Get Random Question' to fetch a GAIA question")
            with gr.Row():
                current_task_id = gr.Textbox(label="๐ Task ID", interactive=False)
                question_input = gr.Textbox(
                    label="โ GAIA Question",
                    placeholder="Question will appear here when fetched from API",
                    lines=3,
                )
            with gr.Row():
                process_btn = gr.Button("๐ค Process with Enhanced Agent", variant="primary", size="lg")
            with gr.Row():
                answer_output = gr.Textbox(
                    label="๐ง Agent Response (with Enhanced Reasoning)",
                    lines=10,
                    interactive=False,
                )

        # Tab 2: Manual Question Input
        with gr.TabItem("โ๏ธ Manual Input"):
            gr.Markdown("### Test Agent with Custom Questions")
            manual_question = gr.Textbox(
                label="โ Your Question",
                placeholder="Enter any question to test the agent...",
                lines=3,
            )
            manual_process_btn = gr.Button("๐ค Process Question", variant="primary")
            manual_output = gr.Textbox(
                label="๐ง Agent Response",
                lines=8,
                interactive=False,
            )
            # Example questions
            gr.Examples(
                examples=_EXAMPLE_QUESTIONS,
                inputs=[manual_question],
                label="๐ฏ Example Questions (Different Complexity Levels)",
            )

        # Tab 3: Submission & Scoring
        with gr.TabItem("๐ Submission & Scoring"):
            gr.Markdown("### Submit Answers for Official GAIA Scoring")
            with gr.Row():
                username_input = gr.Textbox(
                    label="๐ค Hugging Face Username",
                    placeholder="Your HF username for leaderboard",
                )
                agent_code_input = gr.Textbox(
                    label="๐ Agent Code URL",
                    placeholder="https://huggingface.co/spaces/your-username/your-space/tree/main",
                )
            submit_btn = gr.Button("๐ Submit for Official Scoring", variant="primary", size="lg")
            submission_result = gr.Textbox(
                label="๐ Submission Results",
                lines=8,
                interactive=False,
            )
            with gr.Row():
                progress_btn = gr.Button("๐ View Progress", variant="secondary")
                clear_btn = gr.Button("๐๏ธ Clear Session", variant="secondary")
            progress_display = gr.Markdown("Click 'View Progress' to see your statistics")

        # Tab 4: Agent Capabilities
        with gr.TabItem("๐ ๏ธ Agent Details"):
            gr.Markdown(_AGENT_DETAILS_MARKDOWN)

    # Event handlers. Gradio passes the listed inputs positionally, so the
    # bound methods are wired in directly.
    fetch_btn.click(fn=interface.fetch_questions, outputs=[fetch_status])
    random_question_btn.click(
        fn=interface.get_random_question,
        outputs=[question_info, current_task_id, question_input],
    )
    # GAIA tab: question plus task id, so the answer is kept for submission.
    process_btn.click(
        fn=interface.process_question_with_files,
        inputs=[question_input, current_task_id],
        outputs=[answer_output],
    )
    # Manual tab: question only; task_id falls back to its default of None.
    manual_process_btn.click(
        fn=interface.process_question_with_files,
        inputs=[manual_question],
        outputs=[manual_output],
    )
    submit_btn.click(
        fn=interface.submit_answers_for_scoring,
        inputs=[username_input, agent_code_input],
        outputs=[submission_result],
    )
    progress_btn.click(fn=interface.get_progress_stats, outputs=[progress_display])
    # The clear confirmation is shown in the submission results box.
    clear_btn.click(fn=interface.clear_session, outputs=[submission_result])
if __name__ == "__main__":
    # Serve on all interfaces at port 7860 and request a public share link.
    # NOTE(review): share links are reportedly ignored when running on
    # Hugging Face Spaces -- confirm whether share=True is still wanted.
    launch_options = {
        "debug": False,
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)