GAIA_AGE / app.py
ghost
Updated GAIA agent for submission
945d0d0
raw
history blame
4.18 kB
import gradio as gr
import json
import os
from datetime import datetime
from agent import GAIAAgent
from evaluate import evaluate_agent, create_sample_dataset
import traceback
def run_evaluation():
    """Run the GAIA agent evaluation and return a Markdown report plus score.

    Steps, in order: construct a ``GAIAAgent``, call its ``test_grok()``
    connection check, call ``evaluate_agent(dataset_path=None, max_tasks=10)``,
    then read a short preview of ``submission.jsonl`` (relative to the current
    working directory) if that file exists.

    Returns:
        tuple: ``(report_markdown, score)``. ``score`` is whatever
        ``evaluate_agent`` returned. If any step raises, the exception is not
        propagated: the report becomes an error page containing the message
        and traceback, and the score is ``0.0``, so the UI callback always
        receives two values.
    """
    try:
        print("Starting GAIA Agent Evaluation...")
        print("=" * 50)

        agent = GAIAAgent()

        # Check the API connection up front; the response is echoed in the
        # report under "API Connection Status".
        print("Testing xAI API connection...")
        test_response = agent.test_grok()
        print(f"API Test Response: {test_response}")

        # dataset_path=None: per the original author's note this runs the
        # sample tasks because the full GAIA dataset is not available here.
        print("\nRunning evaluation on sample tasks...")
        score = evaluate_agent(dataset_path=None, max_tasks=10)

        preview = _read_submission_preview("submission.jsonl", 500)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return _format_report(score, test_response, preview, timestamp), score
    except Exception as e:  # top-level UI boundary: report instead of crashing
        trace = traceback.format_exc()
        # The error page tells the user to check the logs, so make sure the
        # failure actually reaches them.
        print(f"Evaluation failed: {e}")
        print(trace)
        error_msg = "\n".join(
            [
                "",
                "# Evaluation Error",
                f"**Error:** {e}",
                "**Traceback:**",
                "```",
                trace,
                "```",
                "Please check the logs and fix any issues before retrying.",
                "",
            ]
        )
        return error_msg, 0.0


def _read_submission_preview(path, limit):
    """Return the first *limit* characters of *path*, or "" if it is missing.

    "..." is appended when the file holds more than *limit* characters. Only
    ``limit + 1`` characters are read, so a large submission file is never
    loaded in full just to render a preview.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        # errors="replace" keeps a stray bad byte from failing a mere preview.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            head = f.read(limit + 1)
    except FileNotFoundError:
        return ""
    if len(head) > limit:
        return head[:limit] + "..."
    return head


def _format_report(score, test_response, preview, timestamp):
    """Render the success report as Markdown (leading and trailing newline)."""
    passed = score >= 30
    if passed:
        status = "✅ ACHIEVED (≥30%)"
        next_steps = (
            "🎉 Congratulations! You can now claim your Certificate of Excellence!"
        )
    else:
        status = "❌ NOT ACHIEVED (<30%)"
        next_steps = "💪 Keep improving your agent to reach the 30% threshold."
    return "\n".join(
        [
            "",
            "# GAIA Agent Evaluation Results",
            f"**Timestamp:** {timestamp}",
            f"**Final Score:** {score:.2f}%",
            f"**Certificate Status:** {status}",
            "## API Connection Status",
            f"{test_response}",
            "## Submission File Preview",
            "```json",
            preview,
            "```",
            "## Next Steps",
            next_steps,
            "",
        ]
    )
def create_interface():
    """Build (but do not launch) the Gradio UI for the GAIA evaluation.

    Layout, top to bottom: an introductory Markdown panel, a primary button,
    a two-column row with a Markdown report on the left and a non-interactive
    numeric score on the right, and a footer with agent notes and
    troubleshooting tips. Clicking the button calls ``run_evaluation`` with no
    inputs and routes its two return values to the report and score
    components, in that order.

    Returns:
        The ``gr.Blocks`` instance created here.
    """
    # Emoji and the "≥" sign are written as real Unicode characters; the
    # previous text contained mis-decoded UTF-8 sequences.
    intro_md = (
        "\n"
        "# 🤖 GAIA Agent Evaluation\n"
        "This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence.\n"
        "**Goal:** Achieve ≥30% score on GAIA benchmark tasks\n"
        "Click the button below to run the evaluation and submit your answers.\n"
        "⚠️ **Note:** This may take several minutes to complete. Please be patient.\n"
    )
    footer_md = (
        "\n"
        "---\n"
        "## About This Agent\n"
        "- **API:** xAI Grok for reasoning\n"
        "- **Tools:** Web search, file handling, math calculations\n"
        "- **Fallbacks:** Local knowledge for common questions\n"
        "- **Target:** 30% accuracy for certificate eligibility\n"
        "## Troubleshooting\n"
        "If you encounter issues:\n"
        '1. Check the container logs in the "Logs" tab\n'
        "2. Verify API credentials and internet connectivity\n"
        "3. Ensure all dependencies are installed\n"
        "**Good luck! 🍀**\n"
    )

    with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
        gr.Markdown(intro_md)

        with gr.Row():
            run_btn = gr.Button(
                "🚀 Run Evaluation & Submit All Answers",
                variant="primary",
                size="lg",
            )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Run Status / Submission Result")
                results_output = gr.Markdown(
                    "Click the button above to start evaluation..."
                )
            with gr.Column():
                gr.Markdown("## Score")
                score_output = gr.Number(
                    label="Final Score (%)", value=0.0, interactive=False
                )

        # Output order must match run_evaluation's (report, score) return.
        run_btn.click(
            fn=run_evaluation,
            inputs=[],
            outputs=[results_output, score_output],
            show_progress=True,
        )

        gr.Markdown(footer_md)

    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces (0.0.0.0) at port 7860 with the launch flags below.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "show_api": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)