Omachoko
Enhanced GAIA agent: full API integration, advanced reasoning, expanded tools, and UI overhaul for 30%+ benchmark compliance
b56f671
raw
history blame
14 kB
#!/usr/bin/env python3
"""
🚀 Enhanced GAIA Agent Interface - Full API Integration
Complete Gradio interface for GAIA benchmark with API connectivity and scoring
"""
import os
import gradio as gr
import json
from datetime import datetime
from gaia_agent import GAIAAgent
class GAIAInterface:
    """🎯 Enhanced GAIA Interface with Full API Integration.

    Holds the session state used by the Gradio callbacks: the wrapped
    ``GAIAAgent``, the questions fetched from the API, the answers produced
    so far and the scores returned by earlier submissions.  Methods that
    talk to the agent catch exceptions and return an error message, so a
    failure is shown in the UI instead of propagating to the caller.
    """

    # Score (in percent) the progress report treats as the certification target.
    TARGET_SCORE = 30

    def __init__(self):
        self.agent = GAIAAgent()
        self.current_questions = []   # questions from the last successful fetch
        self.answered_questions = []  # one record per task_id, ready to submit
        self.score_history = []       # one record per successful submission

    def fetch_questions(self):
        """Fetch questions from the GAIA API and cache them on the instance.

        Returns:
            A status message describing success (with the number of
            questions) or failure.
        """
        try:
            questions = self.agent.get_questions()
            if questions:
                self.current_questions = questions
                return f"✅ Fetched {len(questions)} questions from GAIA API"
            return "❌ Failed to fetch questions from GAIA API"
        except Exception as e:
            return f"❌ Error fetching questions: {str(e)}"

    def get_random_question(self):
        """Get a random question from the GAIA API.

        Returns:
            A ``(info_markdown, task_id, question)`` tuple.  On failure the
            first element carries the error message and the other two are
            empty strings.
        """
        try:
            question_data = self.agent.get_random_question()
            if not question_data:
                return "❌ Failed to fetch random question", "", ""

            task_id = question_data.get('task_id', 'unknown')
            question = question_data.get('Question', 'No question found')
            level = question_data.get('Level', 'Unknown')
            files = question_data.get('file_name', None)

            info = f"📋 **Task ID:** {task_id}\n"
            info += f"🎯 **Level:** {level}\n"
            if files:
                info += f"📁 **Associated Files:** {files}\n"
            info += f"❓ **Question:** {question}"
            return info, task_id, question
        except Exception as e:
            return f"❌ Error: {str(e)}", "", ""

    def _record_answer(self, task_id, question, answer):
        """Store the answer for ``task_id``, replacing any earlier one."""
        record = {
            "task_id": task_id,
            "question": question,
            "submitted_answer": answer,
            "timestamp": datetime.now().isoformat(),
        }
        # Keep a single record per task: re-processing a question must not
        # produce duplicate task_ids in the submission payload.
        for index, existing in enumerate(self.answered_questions):
            if existing.get("task_id") == task_id:
                self.answered_questions[index] = record
                return
        self.answered_questions.append(record)

    def process_question_with_files(self, question, task_id=None):
        """Run the agent on ``question`` and format its answer for display.

        Args:
            question: The question text; ``None`` or blank input is rejected
                with a prompt message.
            task_id: Optional GAIA task id.  It is forwarded to the agent and,
                when truthy, the cleaned answer is stored for submission.

        Returns:
            Markdown text with the cleaned answer followed by the last five
            entries of the agent's reasoning memory, or an error message.
        """
        if not question or not question.strip():
            return "Please enter a question or fetch one from GAIA API."
        try:
            # Use enhanced agent with task_id for file downloading
            answer = self.agent.query(question, task_id=task_id, max_steps=15)
            clean_answer = self.agent.clean_for_api_submission(answer)

            # Store the answer for potential submission
            if task_id:
                self._record_answer(task_id, question, clean_answer)

            # str() keeps the join from failing on non-string memory entries.
            recent_steps = "\n".join(
                str(step) for step in self.agent.reasoning_memory[-5:]
            )
            return (
                f"✅ **Answer:** {clean_answer}\n\n"
                f"🧠 **Reasoning Memory:**\n" + recent_steps
            )
        except Exception as e:
            return f"❌ Error: {str(e)}"

    def submit_answers_for_scoring(self, username, agent_code_url):
        """Submit all stored answers to the GAIA API for scoring.

        Args:
            username: Hugging Face username; must be non-blank.
            agent_code_url: URL of the agent code; must be non-blank.

        Returns:
            A message with the score and the raw result on success, or a
            message describing why nothing was submitted / what failed.
        """
        if not username or not username.strip():
            return "❌ Please provide your Hugging Face username"
        if not agent_code_url or not agent_code_url.strip():
            return "❌ Please provide your agent code URL (Hugging Face Space)"
        if not self.answered_questions:
            return "❌ No answered questions to submit. Please answer some questions first."

        try:
            # Prepare answers for submission
            answers = [
                {
                    "task_id": item["task_id"],
                    "submitted_answer": item["submitted_answer"],
                }
                for item in self.answered_questions
            ]

            # Submit to GAIA API
            result = self.agent.submit_answer(username, agent_code_url, answers)
            if "error" in result:
                return f"❌ **Submission Failed:** {result.get('error', 'Unknown error')}"

            # A missing or null score is recorded as 0 so the progress
            # report can always compare and subtract scores.
            score = result.get("score") or 0
            self.score_history.append({
                "score": score,
                "questions_answered": len(answers),
                "timestamp": datetime.now().isoformat(),
            })
            return (
                f"✅ **Submission Successful!**\n\n"
                f"📊 **Score:** {score}%\n"
                f"🎯 **Questions Answered:** {len(answers)}\n\n"
                f"📈 **Result Details:**\n{json.dumps(result, indent=2)}"
            )
        except Exception as e:
            return f"❌ Error submitting answers: {str(e)}"

    def get_progress_stats(self):
        """Build a markdown summary of the current session.

        Returns:
            Markdown text with the number of fetched and answered questions,
            the latest and best recorded scores (0 when nothing has been
            submitted) and the distance to ``TARGET_SCORE``.
        """
        total_questions = len(self.current_questions)
        answered_count = len(self.answered_questions)

        if self.score_history:
            latest_score = self.score_history[-1]["score"]
            best_score = max(item["score"] for item in self.score_history)
        else:
            latest_score = 0
            best_score = 0

        target = self.TARGET_SCORE
        stats = "📊 **Progress Statistics**\n\n"
        stats += f"🎯 **Questions Available:** {total_questions}\n"
        stats += f"✅ **Questions Answered:** {answered_count}\n"
        stats += f"📈 **Latest Score:** {latest_score}%\n"
        stats += f"🏆 **Best Score:** {best_score}%\n"
        stats += f"🎖️ **Target:** {target}% (for certification)\n\n"

        if latest_score >= target:
            stats += "🎉 **Congratulations! You've achieved the target score for certification!**"
        else:
            # Rounding hides float artefacts such as 4.699999999999999.
            remaining = round(target - latest_score, 2)
            stats += f"📈 **{remaining}% more needed for certification**"
        return stats

    def clear_session(self):
        """Discard the stored answers and return a confirmation message.

        Fetched questions and the score history are left untouched.
        """
        self.answered_questions = []
        return "✅ Session cleared. Ready for new questions."
# Initialize interface: one shared instance holds the session state used by
# every callback wired up below.
interface = GAIAInterface()

# Enhanced Gradio Interface
# Markdown blocks are written indented; gr.Markdown strips the common leading
# whitespace before rendering.
with gr.Blocks(title="🚀 Enhanced GAIA Agent - Full API Integration", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 Enhanced GAIA Agent - Complete GAIA Benchmark Implementation
    **🎯 Target: 30%+ Performance for Course Certification**
    ## 🌟 Key Features:
    - **🔗 Full GAIA API Integration** - Fetch real questions and submit for scoring
    - **📁 File Processing** - Automatic download and analysis of task files
    - **🧠 Enhanced Multi-Step Reasoning** - Advanced tool orchestration
    - **📊 Real-time Progress Tracking** - Monitor your performance
    - **🏆 Leaderboard Submission** - Submit scores to student leaderboard
    """)

    with gr.Tabs():
        # Tab 1: GAIA Question Processing
        with gr.TabItem("🎯 GAIA Questions"):
            gr.Markdown("### Fetch and Process Real GAIA Benchmark Questions")

            with gr.Row():
                with gr.Column(scale=1):
                    fetch_btn = gr.Button("🔄 Fetch Questions from API", variant="secondary")
                    random_question_btn = gr.Button("🎲 Get Random Question", variant="primary")
                    fetch_status = gr.Textbox(label="📡 API Status", interactive=False)
                with gr.Column(scale=2):
                    question_info = gr.Markdown("Click 'Get Random Question' to fetch a GAIA question")

            with gr.Row():
                current_task_id = gr.Textbox(label="🆔 Task ID", interactive=False)

            question_input = gr.Textbox(
                label="❓ GAIA Question",
                placeholder="Question will appear here when fetched from API",
                lines=3
            )

            with gr.Row():
                process_btn = gr.Button("🤖 Process with Enhanced Agent", variant="primary", size="lg")

            with gr.Row():
                answer_output = gr.Textbox(
                    label="🧠 Agent Response (with Enhanced Reasoning)",
                    lines=10,
                    interactive=False
                )

        # Tab 2: Manual Question Input
        with gr.TabItem("✍️ Manual Input"):
            gr.Markdown("### Test Agent with Custom Questions")

            manual_question = gr.Textbox(
                label="❓ Your Question",
                placeholder="Enter any question to test the agent...",
                lines=3
            )
            manual_process_btn = gr.Button("🤖 Process Question", variant="primary")
            manual_output = gr.Textbox(
                label="🧠 Agent Response",
                lines=8,
                interactive=False
            )

            # Example questions
            gr.Examples(
                examples=[
                    "What is 25 + 37?",
                    "What is the capital of Germany?",
                    "If there are 8 planets and 4 are gas giants, how many are not gas giants?",
                    "Who was the US president when the Berlin Wall fell?",
                    "List the fruits in the painting in clockwise order starting from 12 o'clock",
                    "Convert 100 degrees Celsius to Fahrenheit"
                ],
                inputs=[manual_question],
                label="🎯 Example Questions (Different Complexity Levels)"
            )

        # Tab 3: Submission & Scoring
        with gr.TabItem("📊 Submission & Scoring"):
            gr.Markdown("### Submit Answers for Official GAIA Scoring")

            with gr.Row():
                username_input = gr.Textbox(
                    label="👤 Hugging Face Username",
                    placeholder="Your HF username for leaderboard"
                )
                agent_code_input = gr.Textbox(
                    label="🔗 Agent Code URL",
                    placeholder="https://huggingface.co/spaces/your-username/your-space/tree/main"
                )

            submit_btn = gr.Button("🚀 Submit for Official Scoring", variant="primary", size="lg")
            submission_result = gr.Textbox(
                label="📊 Submission Results",
                lines=8,
                interactive=False
            )

            with gr.Row():
                progress_btn = gr.Button("📈 View Progress", variant="secondary")
                clear_btn = gr.Button("🗑️ Clear Session", variant="secondary")

            progress_display = gr.Markdown("Click 'View Progress' to see your statistics")

        # Tab 4: Agent Capabilities
        with gr.TabItem("🛠️ Agent Details"):
            gr.Markdown("""
            ### 🧠 Enhanced Agent Capabilities
            #### 🔧 **Tool Arsenal** (9 Enhanced Tools):
            1. **🧮 Enhanced Calculator** - Complex mathematical operations and multi-step calculations
            2. **🌐 Enhanced Web Search** - Expanded knowledge base with 20+ countries, astronomy, history
            3. **🖼️ Image Analyzer** - Simulated visual content processing and spatial reasoning
            4. **📄 Document Reader** - File content extraction and analysis
            5. **📁 File Processor** - Download and process GAIA task files (TXT, JSON, CSV)
            6. **📅 Date Calculator** - Temporal reasoning and age calculations
            7. **🔄 Unit Converter** - Length, temperature, and weight conversions
            8. **📝 Text Analyzer** - Content analysis and pattern extraction
            9. **🧠 Reasoning Chain** - Multi-step logical synthesis
            #### 🎯 **GAIA Compliance Features**:
            - **Level 1**: Basic questions (<5 steps) ✅
            - **Level 2**: Multi-step reasoning (5-10 steps) ✅
            - **Level 3**: Complex long-term planning ✅
            - **File Processing**: Automatic download and analysis ✅
            - **API Integration**: Full GAIA benchmark connectivity ✅
            - **Clean Formatting**: Exact match answer preparation ✅
            #### 📊 **Performance Targets**:
            - **Minimum Required**: 30% accuracy for certification
            - **Current Baseline**: GPT-4 with plugins ~15%
            - **Enhanced Target**: 35-45% with optimized knowledge base
            - **Human Performance**: ~92% (reference point)
            #### 🧠 **Enhanced Knowledge Base**:
            - **Geography**: 20+ countries and capitals
            - **Astronomy**: Solar system facts, planet classifications
            - **History**: Key events with dates and figures
            - **Mathematics**: Constants and conversion factors
            - **Arts**: Famous paintings and artists
            """)

    # Event handlers (registered inside the Blocks context so they attach to
    # this app).
    fetch_btn.click(
        fn=interface.fetch_questions,
        outputs=[fetch_status]
    )
    random_question_btn.click(
        fn=interface.get_random_question,
        outputs=[question_info, current_task_id, question_input]
    )
    # GAIA tab: the task id travels with the question so the answer is stored
    # for submission.
    process_btn.click(
        fn=interface.process_question_with_files,
        inputs=[question_input, current_task_id],
        outputs=[answer_output]
    )
    # Manual tab: no task id is supplied, so the answer is shown but not
    # queued for submission.
    manual_process_btn.click(
        fn=interface.process_question_with_files,
        inputs=[manual_question],
        outputs=[manual_output]
    )
    submit_btn.click(
        fn=interface.submit_answers_for_scoring,
        inputs=[username_input, agent_code_input],
        outputs=[submission_result]
    )
    progress_btn.click(
        fn=interface.get_progress_stats,
        outputs=[progress_display]
    )
    # The confirmation message replaces whatever the results box was showing.
    clear_btn.click(
        fn=interface.clear_session,
        outputs=[submission_result]
    )
if __name__ == "__main__":
    # Serve on all network interfaces at port 7860 and request a public
    # share link; debug mode stays off.
    demo.launch(
        debug=False,
        share=True,
        server_name="0.0.0.0",
        server_port=7860
    )