Spaces:

schoolkithub
/

multi-agent-gaia-system

Runtime error

Omachoko

Enhanced GAIA agent: full API integration, advanced reasoning, expanded tools, and UI overhaul for 30%+ benchmark compliance

b56f671 13 days ago

raw

history blame

14 kB

	#!/usr/bin/env python3
	"""
	🚀 Enhanced GAIA Agent Interface - Full API Integration
	Complete Gradio interface for GAIA benchmark with API connectivity and scoring
	"""

	import os
	import gradio as gr
	import json
	from datetime import datetime
	from gaia_agent import GAIAAgent

	class GAIAInterface:
	    """🎯 Enhanced GAIA Interface with Full API Integration.

	    Wraps a ``GAIAAgent`` and holds the state the Gradio callbacks share:

	    * ``current_questions``  - questions last fetched from the GAIA API.
	    * ``answered_questions`` - one record per task id, ready for submission.
	    * ``score_history``      - one record per successful submission.

	    Every public method returns display-ready text (or a tuple of text
	    values) and reports failures as a "❌ ..." message instead of raising,
	    so it can be wired directly to a Gradio event handler.
	    """

	    # Score (in percent) used as the certification target in the statistics.
	    TARGET_SCORE = 30
	    # Step budget handed to ``GAIAAgent.query`` for every question.
	    MAX_STEPS = 15
	    # Number of trailing reasoning-memory entries echoed back to the user.
	    MEMORY_TAIL = 5

	    def __init__(self):
	        """Create the agent and start with empty session state."""
	        self.agent = GAIAAgent()
	        self.current_questions = []
	        self.answered_questions = []
	        self.score_history = []

	    def fetch_questions(self):
	        """Fetch questions from GAIA API.

	        Returns:
	            A status message. On success the fetched questions replace
	            ``self.current_questions``; on failure the previous list is kept.
	        """
	        try:
	            questions = self.agent.get_questions()
	        except Exception as e:
	            return f"❌ Error fetching questions: {str(e)}"

	        if not questions:
	            return "❌ Failed to fetch questions from GAIA API"

	        self.current_questions = questions
	        return f"✅ Fetched {len(questions)} questions from GAIA API"

	    def get_random_question(self):
	        """Get a random question from GAIA API.

	        Returns:
	            A ``(info, task_id, question)`` tuple of strings. On failure
	            ``info`` carries the error message and the other two are empty.
	        """
	        try:
	            question_data = self.agent.get_random_question()
	            if not question_data:
	                return "❌ Failed to fetch random question", "", ""

	            task_id = question_data.get('task_id', 'unknown')
	            question = question_data.get('Question', 'No question found')
	            level = question_data.get('Level', 'Unknown')
	            files = question_data.get('file_name', None)

	            info_lines = [f"📋 Task ID: {task_id}", f"🎯 Level: {level}"]
	            if files:
	                info_lines.append(f"📁 Associated Files: {files}")
	            info_lines.append(f"❓ Question: {question}")

	            return "\n".join(info_lines), task_id, question
	        except Exception as e:
	            return f"❌ Error: {str(e)}", "", ""

	    def _record_answer(self, task_id, question, clean_answer):
	        """Store the answer for ``task_id``, replacing any earlier one.

	        Keeping a single record per task id means re-running a question
	        updates its answer instead of adding a duplicate to the submission.
	        """
	        record = {
	            "task_id": task_id,
	            "question": question,
	            "submitted_answer": clean_answer,
	            "timestamp": datetime.now().isoformat(),
	        }
	        for index, existing in enumerate(self.answered_questions):
	            if existing.get("task_id") == task_id:
	                self.answered_questions[index] = record
	                return
	        self.answered_questions.append(record)

	    def process_question_with_files(self, question, task_id=None):
	        """Process question with enhanced agent and file handling.

	        Args:
	            question: Question text; ``None`` or blank yields a prompt message.
	            task_id: Optional GAIA task id. When truthy it is forwarded to the
	                agent and the cleaned answer is recorded for later submission.

	        Returns:
	            The cleaned answer followed by the last ``MEMORY_TAIL`` entries of
	            the agent's reasoning memory, or an error message.
	        """
	        if not (question or "").strip():
	            return "Please enter a question or fetch one from GAIA API."

	        try:
	            # Use enhanced agent with task_id for file downloading
	            answer = self.agent.query(question, task_id=task_id, max_steps=self.MAX_STEPS)
	            clean_answer = self.agent.clean_for_api_submission(answer)

	            # Store the answer for potential submission
	            if task_id:
	                self._record_answer(task_id, question, clean_answer)

	            # Tolerate a missing memory list or non-string entries so that an
	            # answer that was already computed and stored is still shown.
	            memory = getattr(self.agent, "reasoning_memory", None) or []
	            trail = "\n".join(str(step) for step in memory[-self.MEMORY_TAIL:])
	            return f"✅ Answer: {clean_answer}\n\n🧠 Reasoning Memory:\n" + trail
	        except Exception as e:
	            return f"❌ Error: {str(e)}"

	    def submit_answers_for_scoring(self, username, agent_code_url):
	        """Submit answers to GAIA API for scoring.

	        Args:
	            username: Hugging Face username; required.
	            agent_code_url: URL of the agent's code; required.

	        Returns:
	            A success message with the score and the raw result as JSON, or a
	            "❌ ..." message when validation or the submission fails. A
	            successful submission appends a record to ``self.score_history``.
	        """
	        if not (username or "").strip():
	            return "❌ Please provide your Hugging Face username"

	        if not (agent_code_url or "").strip():
	            return "❌ Please provide your agent code URL (Hugging Face Space)"

	        if not self.answered_questions:
	            return "❌ No answered questions to submit. Please answer some questions first."

	        try:
	            # Prepare answers for submission
	            answers = [
	                {
	                    "task_id": item["task_id"],
	                    "submitted_answer": item["submitted_answer"],
	                }
	                for item in self.answered_questions
	            ]

	            # Submit to GAIA API
	            result = self.agent.submit_answer(username, agent_code_url, answers)

	            # Anything other than a mapping cannot carry a score or an error key.
	            if not isinstance(result, dict):
	                return "❌ Submission Failed: unexpected response from GAIA API"

	            if "error" in result:
	                return f"❌ Submission Failed: {result.get('error', 'Unknown error')}"

	            score = result.get("score", 0)
	            self.score_history.append({
	                "score": score,
	                "questions_answered": len(answers),
	                "timestamp": datetime.now().isoformat(),
	            })

	            return (
	                f"✅ Submission Successful!\n\n📊 Score: {score}%\n"
	                f"🎯 Questions Answered: {len(answers)}\n\n"
	                f"📈 Result Details:\n{json.dumps(result, indent=2)}"
	            )

	        except Exception as e:
	            return f"❌ Error submitting answers: {str(e)}"

	    def get_progress_stats(self):
	        """Get current progress statistics.

	        Returns:
	            A multi-line summary of fetched/answered question counts, the
	            latest and best recorded scores (0 when nothing was submitted yet)
	            and the distance of the latest score from ``TARGET_SCORE``.
	        """
	        total_questions = len(self.current_questions)
	        answered_count = len(self.answered_questions)

	        if self.score_history:
	            latest_score = self.score_history[-1]["score"]
	            best_score = max(item["score"] for item in self.score_history)
	        else:
	            latest_score = 0
	            best_score = 0

	        stats = "📊 Progress Statistics\n\n"
	        stats += f"🎯 Questions Available: {total_questions}\n"
	        stats += f"✅ Questions Answered: {answered_count}\n"
	        stats += f"📈 Latest Score: {latest_score}%\n"
	        stats += f"🏆 Best Score: {best_score}%\n"
	        stats += f"🎖️ Target: {self.TARGET_SCORE}% (for certification)\n\n"

	        if latest_score >= self.TARGET_SCORE:
	            stats += "🎉 Congratulations! You've achieved the target score for certification!"
	        else:
	            remaining = self.TARGET_SCORE - latest_score
	            stats += f"📈 {remaining}% more needed for certification"

	        return stats

	    def clear_session(self):
	        """Clear current session data.

	        Only the recorded answers are dropped; fetched questions and the score
	        history are kept.
	        """
	        self.answered_questions = []
	        return "✅ Session cleared. Ready for new questions."

	# Initialize interface
	# One module-level GAIAInterface; every event handler below is bound to it.
	# NOTE(review): being module-level, its state is presumably shared by all
	# browser sessions of the app - confirm this is intended.
	interface = GAIAInterface()

	# Enhanced Gradio Interface
	# Components are created in display order; the nesting of the `with` blocks
	# (Tabs -> TabItem -> Row/Column) determines the layout.
	with gr.Blocks(title="🚀 Enhanced GAIA Agent - Full API Integration", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("""
	# 🚀 Enhanced GAIA Agent - Complete GAIA Benchmark Implementation

	🎯 Target: 30%+ Performance for Course Certification

	## 🌟 Key Features:
	- 🔗 Full GAIA API Integration - Fetch real questions and submit for scoring
	- 📁 File Processing - Automatic download and analysis of task files
	- 🧠 Enhanced Multi-Step Reasoning - Advanced tool orchestration
	- 📊 Real-time Progress Tracking - Monitor your performance
	- 🏆 Leaderboard Submission - Submit scores to student leaderboard
	""")

	    with gr.Tabs():
	        # Tab 1: GAIA Question Processing
	        with gr.TabItem("🎯 GAIA Questions"):
	            gr.Markdown("### Fetch and Process Real GAIA Benchmark Questions")

	            with gr.Row():
	                with gr.Column(scale=1):
	                    fetch_btn = gr.Button("🔄 Fetch Questions from API", variant="secondary")
	                    random_question_btn = gr.Button("🎲 Get Random Question", variant="primary")
	                    fetch_status = gr.Textbox(label="📡 API Status", interactive=False)

	                with gr.Column(scale=2):
	                    question_info = gr.Markdown("Click 'Get Random Question' to fetch a GAIA question")

	            with gr.Row():
	                # Read-only: filled by the random-question handler and passed
	                # back to the agent when the question is processed.
	                current_task_id = gr.Textbox(label="🆔 Task ID", interactive=False)
	                question_input = gr.Textbox(
	                    label="❓ GAIA Question",
	                    placeholder="Question will appear here when fetched from API",
	                    lines=3
	                )

	            with gr.Row():
	                process_btn = gr.Button("🤖 Process with Enhanced Agent", variant="primary", size="lg")

	            with gr.Row():
	                answer_output = gr.Textbox(
	                    label="🧠 Agent Response (with Enhanced Reasoning)",
	                    lines=10,
	                    interactive=False
	                )

	        # Tab 2: Manual Question Input
	        with gr.TabItem("✏️ Manual Input"):
	            gr.Markdown("### Test Agent with Custom Questions")

	            manual_question = gr.Textbox(
	                label="❓ Your Question",
	                placeholder="Enter any question to test the agent...",
	                lines=3
	            )

	            manual_process_btn = gr.Button("🤖 Process Question", variant="primary")
	            manual_output = gr.Textbox(
	                label="🧠 Agent Response",
	                lines=8,
	                interactive=False
	            )

	            # Example questions
	            gr.Examples(
	                examples=[
	                    "What is 25 + 37?",
	                    "What is the capital of Germany?",
	                    "If there are 8 planets and 4 are gas giants, how many are not gas giants?",
	                    "Who was the US president when the Berlin Wall fell?",
	                    "List the fruits in the painting in clockwise order starting from 12 o'clock",
	                    "Convert 100 degrees Celsius to Fahrenheit"
	                ],
	                inputs=[manual_question],
	                label="🎯 Example Questions (Different Complexity Levels)"
	            )

	        # Tab 3: Submission & Scoring
	        with gr.TabItem("📊 Submission & Scoring"):
	            gr.Markdown("### Submit Answers for Official GAIA Scoring")

	            with gr.Row():
	                username_input = gr.Textbox(
	                    label="👤 Hugging Face Username",
	                    placeholder="Your HF username for leaderboard"
	                )
	                agent_code_input = gr.Textbox(
	                    label="🔗 Agent Code URL",
	                    placeholder="https://huggingface.co/spaces/your-username/your-space/tree/main"
	                )

	            submit_btn = gr.Button("🚀 Submit for Official Scoring", variant="primary", size="lg")
	            submission_result = gr.Textbox(
	                label="📊 Submission Results",
	                lines=8,
	                interactive=False
	            )

	            with gr.Row():
	                progress_btn = gr.Button("📈 View Progress", variant="secondary")
	                clear_btn = gr.Button("🗑️ Clear Session", variant="secondary")

	            progress_display = gr.Markdown("Click 'View Progress' to see your statistics")

	        # Tab 4: Agent Capabilities
	        # Static description only; nothing in this tab is wired to a handler.
	        with gr.TabItem("🛠️ Agent Details"):
	            gr.Markdown("""
	### 🧠 Enhanced Agent Capabilities

	#### 🔧 Tool Arsenal (9 Enhanced Tools):
	1. 🧮 Enhanced Calculator - Complex mathematical operations and multi-step calculations
	2. 🌐 Enhanced Web Search - Expanded knowledge base with 20+ countries, astronomy, history
	3. 🖼️ Image Analyzer - Simulated visual content processing and spatial reasoning
	4. 📄 Document Reader - File content extraction and analysis
	5. 📁 File Processor - Download and process GAIA task files (TXT, JSON, CSV)
	6. 📅 Date Calculator - Temporal reasoning and age calculations
	7. 🔄 Unit Converter - Length, temperature, and weight conversions
	8. 📝 Text Analyzer - Content analysis and pattern extraction
	9. 🧠 Reasoning Chain - Multi-step logical synthesis

	#### 🎯 GAIA Compliance Features:
	- Level 1: Basic questions (<5 steps) ✅
	- Level 2: Multi-step reasoning (5-10 steps) ✅
	- Level 3: Complex long-term planning ✅
	- File Processing: Automatic download and analysis ✅
	- API Integration: Full GAIA benchmark connectivity ✅
	- Clean Formatting: Exact match answer preparation ✅

	#### 📊 Performance Targets:
	- Minimum Required: 30% accuracy for certification
	- Current Baseline: GPT-4 with plugins ~15%
	- Enhanced Target: 35-45% with optimized knowledge base
	- Human Performance: ~92% (reference point)

	#### 🧠 Enhanced Knowledge Base:
	- Geography: 20+ countries and capitals
	- Astronomy: Solar system facts, planet classifications
	- History: Key events with dates and figures
	- Mathematics: Constants and conversion factors
	- Arts: Famous paintings and artists
	""")

	    # Event handlers
	    # Registered inside the Blocks context so they attach to `demo`.
	    fetch_btn.click(
	        fn=interface.fetch_questions,
	        outputs=[fetch_status]
	    )

	    # The handler's (info, task_id, question) tuple maps onto the three outputs in order.
	    random_question_btn.click(
	        fn=interface.get_random_question,
	        outputs=[question_info, current_task_id, question_input]
	    )

	    # GAIA tab: the task id is passed along with the question text.
	    process_btn.click(
	        fn=lambda q, t: interface.process_question_with_files(q, t),
	        inputs=[question_input, current_task_id],
	        outputs=[answer_output]
	    )

	    # Manual tab: no task id is passed, so the handler's task_id stays None.
	    manual_process_btn.click(
	        fn=lambda q: interface.process_question_with_files(q),
	        inputs=[manual_question],
	        outputs=[manual_output]
	    )

	    submit_btn.click(
	        fn=interface.submit_answers_for_scoring,
	        inputs=[username_input, agent_code_input],
	        outputs=[submission_result]
	    )

	    progress_btn.click(
	        fn=interface.get_progress_stats,
	        outputs=[progress_display]
	    )

	    # The confirmation text is written to the submission results box.
	    clear_btn.click(
	        fn=interface.clear_session,
	        outputs=[submission_result]
	    )

	if __name__ == "__main__":
	    # Launch settings gathered in one place and handed to Gradio as keywords:
	    # port 7860 on host 0.0.0.0, share enabled, debug mode off.
	    launch_options = {
	        "debug": False,
	        "share": True,
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	    }
	    demo.launch(**launch_options)