sirine1712 committed on
Commit
012ef3f
·
verified ·
1 Parent(s): 30b3077

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -468
app.py CHANGED
@@ -5,518 +5,178 @@ import pandas as pd
5
  from smolagents import ToolCallingAgent, tool
6
  from duckduckgo_search import DDGS
7
  import math
8
- import openai
9
  import re
10
- import json
11
- from datetime import datetime, timedelta
12
- import time
13
 
14
  # --- Enhanced Tools ---
15
  @tool
16
def duck_search(query: str) -> str:
    """
    Run a DuckDuckGo text search and list the hits as numbered entries.

    Args:
        query: The search query string.

    Returns:
        One entry per hit with its title, body text and URL, each closed by a
        "---" line; "No results found." when the search yields nothing; or a
        "Search error: ..." message when the lookup raises.
    """
    try:
        with DDGS() as client:
            hits = client.text(query, max_results=5)
            if not hits:
                return "No results found."

            entries = [
                f"Result {rank}:\nTitle: {hit['title']}\nContent: {hit['body']}\nURL: {hit['href']}\n---"
                for rank, hit in enumerate(hits, 1)
            ]
            return "\n".join(entries)
    except Exception as err:
        return f"Search error: {err}"
44
 
45
  @tool
46
def focused_search(query: str, topic: str = "") -> str:
    """
    Search DuckDuckGo for the query combined with an optional topic hint.

    Args:
        query: The main search query
        topic: Additional topic context to improve search accuracy

    Returns:
        Up to three hits formatted as a bold title, body text and source URL,
        separated by blank lines; a "no results" message when both the combined
        and the plain query come back empty; or a "Focused search error: ..."
        message when the lookup raises.
    """
    try:
        combined_query = f"{query} {topic}".strip()

        with DDGS() as client:
            # Retry with the bare query only when the combined one finds nothing.
            hits = client.text(combined_query, max_results=3) or client.text(query, max_results=3)

            if hits:
                return "\n\n".join(
                    f"**{hit['title']}**\n{hit['body']}\nSource: {hit['href']}"
                    for hit in hits
                )
            return "No results found for focused search."
    except Exception as err:
        return f"Focused search error: {err}"
77
 
78
  @tool
79
def advanced_calculator(expression: str) -> str:
    """
    Evaluate a mathematical expression written in Python syntax.

    The public names of the ``math`` module plus ``abs``, ``round``, ``min``,
    ``max``, ``sum`` and ``pow`` are available. If the expression fails to
    evaluate and contains ``%``, it is retried with every ``%`` replaced by
    ``/100`` so inputs such as "50%" work.

    Args:
        expression: A mathematical expression or calculation

    Returns:
        The result as text (whole-valued floats are printed as integers, other
        floats with up to 10 significant digits), or a "Calculation error: ..."
        message when the expression cannot be evaluated.
    """
    # NOTE(security): eval() with empty builtins is NOT a sandbox. The
    # expression comes from model output, so treat it as untrusted; consider an
    # ast-based evaluator. Dunder entries of the math module (__loader__,
    # __spec__, ...) are deliberately kept out of the namespace.
    namespace = {
        name: value for name, value in vars(math).items() if not name.startswith("_")
    }
    namespace.update(abs=abs, round=round, min=min, max=max, sum=sum, pow=pow)
    namespace["__builtins__"] = {}

    def render(value):
        # Same formatting for the normal path and the percentage fallback.
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            return f"{value:.10g}"  # Remove trailing zeros
        return str(value)

    try:
        expression = expression.strip()
        return render(eval(expression, namespace))
    except Exception as e:
        failure = f"Calculation error: {e}. Please check the mathematical expression."
        # Percent retry only makes sense for string input containing '%'.
        if isinstance(expression, str) and "%" in expression:
            try:
                return render(eval(expression.replace("%", "/100"), namespace))
            except Exception:
                return failure
        return failure
129
 
130
  @tool
131
def date_calculator(date_expression: str) -> str:
    """
    Resolve simple relative-date phrases against the current local date.

    Recognised phrases are "<N> day(s)/week(s) ago" and
    "<N> day(s)/week(s) from now", in singular or plural form. Expressions
    mentioning "today" or "current date" return today's date. Anything else
    returns today's date prefixed with "Current date: ".

    Args:
        date_expression: A date calculation or query

    Returns:
        The resolved date formatted as "YYYY-MM-DD (Weekday)", or a
        "Date calculation error: ..." message when processing raises.
    """
    stamp = "%Y-%m-%d (%A)"
    try:
        now = datetime.now()
        text = date_expression.lower()

        # One pattern covers singular/plural units and both directions, so
        # "1 day ago" is handled the same way as "3 days ago".
        relative = re.search(r'(\d+)\s*(day|week)s?\s*(ago|from\s*now)', text)
        if relative:
            amount = int(relative.group(1))
            if relative.group(2) == "day":
                span = timedelta(days=amount)
            else:
                span = timedelta(weeks=amount)
            target = now - span if relative.group(3) == "ago" else now + span
            return target.strftime(stamp)

        # Current date info
        if "today" in text or "current date" in text:
            return now.strftime(stamp)

        return f"Current date: {now.strftime(stamp)}"

    except Exception as e:
        return f"Date calculation error: {e}"
174
-
175
- @tool
176
def text_analyzer(text: str) -> str:
    """
    Report basic statistics and pattern matches found in a piece of text.

    Args:
        text: The text to analyze

    Returns:
        A multi-line report with word, character and sentence counts, followed
        by the numbers (first ten), dates and e-mail addresses found, when
        any; "No text provided for analysis." for empty input; or a
        "Text analysis error: ..." message when processing raises.
    """
    try:
        if not text:
            return "No text provided for analysis."

        # Basic statistics; sentences are approximated by splitting on '.'.
        word_count = len(text.split())
        char_count = len(text)
        sentence_count = len([s for s in text.split('.') if s.strip()])

        numbers = re.findall(r'-?\d+(?:\.\d+)?', text)
        date_patterns = re.findall(
            r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b', text
        )
        # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
        # literal '|' inside the top-level domain.
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

        report = [
            "Text Analysis:",
            f"- Words: {word_count}",
            f"- Characters: {char_count}",
            f"- Sentences: {sentence_count}",
        ]
        if numbers:
            overflow = '...' if len(numbers) > 10 else ''
            report.append(f"- Numbers found: {', '.join(numbers[:10])}{overflow}")
        if date_patterns:
            report.append(f"- Dates found: {', '.join(date_patterns)}")
        if emails:
            report.append(f"- Emails found: {', '.join(emails)}")

        # Every report line, including the last, ends with a newline.
        return "\n".join(report) + "\n"

    except Exception as e:
        return f"Text analysis error: {e}"
222
-
223
- # --- Enhanced Agent ---
224
class ImprovedWebSearchAgent:
    """Callable wrapper around a smolagents ``ToolCallingAgent``.

    Calling an instance with a question wraps the question in step-by-step
    instructions, runs the agent, tidies the reply, and turns any exception
    into a descriptive fallback message instead of raising.
    """

    # Model ids tried in order; the second is the fallback.
    _MODEL_IDS = ("gpt-4o-mini", "gpt-3.5-turbo")

    # NOTE(review): this text is passed as the agent's ``description``; confirm
    # that is where the installed smolagents version expects prompt text.
    _SYSTEM_PROMPT = "\n".join((
        "You are an advanced AI assistant designed to solve complex problems by breaking them down systematically.",
        "",
        "Key capabilities:",
        "1. **Multi-step Reasoning**: Break complex problems into smaller, manageable steps",
        "2. **Information Synthesis**: Combine information from multiple sources",
        "3. **Verification**: Double-check calculations and facts",
        "4. **Context Awareness**: Understand the broader context of questions",
        "",
        "Problem-solving approach:",
        "1. Analyze the question carefully to understand what's being asked",
        "2. Identify what information you need to find",
        "3. Use available tools strategically (search, calculate, analyze)",
        "4. Verify your findings and reasoning",
        "5. Provide a clear, accurate answer",
        "",
        "When using tools:",
        "- Use focused_search for specific factual information",
        "- Use duck_search for broader context",
        "- Use advanced_calculator for any mathematical operations",
        "- Use date_calculator for time-related queries",
        "- Use text_analyzer when you need to extract information from text",
        "",
        "Always think step-by-step and explain your reasoning process.",
    ))

    def __init__(self):
        """Build the tool-calling agent, falling back to the second model id.

        Raises:
            Exception: whatever the last construction attempt raised when no
                model id could be initialised.
        """
        # The agent needs a model object rather than a bare model-name string.
        # Imported locally so the file's import block is left untouched.
        from smolagents import OpenAIServerModel

        last_error = None
        for position, model_id in enumerate(self._MODEL_IDS):
            try:
                self.agent = ToolCallingAgent(
                    name="ImprovedGAIAAgent",
                    description=self._SYSTEM_PROMPT,
                    tools=[duck_search, focused_search, advanced_calculator, date_calculator, text_analyzer],
                    model=OpenAIServerModel(model_id=model_id),
                    planning_interval=3,  # More frequent planning
                )
            except Exception as e:
                last_error = e
                if position + 1 < len(self._MODEL_IDS):
                    print(f"⚠️ Error initializing with {model_id}, trying fallback...")
                continue
            print(f"✅ Enhanced agent initialized with {model_id}")
            return

        print(f" Agent initialization failed: {last_error}")
        raise last_error

    def __call__(self, question: str) -> str:
        """
        Answer a question, never raising to the caller.

        Args:
            question: The question to answer

        Returns:
            The post-processed agent reply, or a fallback message describing
            the error when the agent run fails.
        """
        print(f"🔍 Processing question: {question}")

        try:
            enhanced_question = self._enhance_question(question)

            # Timing is informational only; no timeout is enforced here.
            start_time = time.time()
            result = self.agent.run(enhanced_question)
            elapsed_time = time.time() - start_time
            print(f"⏱️ Question processed in {elapsed_time:.1f} seconds")

            return self._post_process_answer(result, question)

        except Exception as e:
            print(f"Agent error: {e}")
            return self._fallback_answer(question, str(e))

    def _enhance_question(self, question: str) -> str:
        """Wrap the question in step-by-step solving instructions."""
        return (
            "Please solve this step by step:\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Instructions:\n"
            "1. Read the question carefully and identify what type of answer is needed\n"
            "2. Break down complex problems into steps\n"
            "3. Use the available tools to gather information or perform calculations\n"
            "4. Verify your answer makes sense\n"
            "5. Provide a clear, concise final answer\n"
            "\n"
            "If this is a factual question, search for current information.\n"
            "If this involves calculations, show your work.\n"
            "If this requires multiple steps, explain each step clearly."
        )

    def _post_process_answer(self, result: str, original_question: str) -> str:
        """Normalise the agent reply to stripped text.

        Short replies such as "42" or "Paris" are kept as they are. Only a
        missing or blank reply produces the "need more information" message.
        Non-string results (for example numbers) are converted with ``str``.
        """
        text = "" if result is None else str(result).strip()
        if not text:
            return f"I need more information to properly answer: {original_question}"

        # When the reply carries no explicit answer marker, repeat its longest
        # line as the final answer, provided that line is reasonably long.
        lowered = text.lower()
        if "final answer" not in lowered and "answer:" not in lowered:
            longest_line = max(text.split('\n'), key=len)
            if len(longest_line) > 20:
                text = f"{text}\n\nFinal Answer: {longest_line}"

        return text

    def _fallback_answer(self, question: str, error: str) -> str:
        """Build a keyword-based explanation for a failed agent run."""
        question_lower = str(question).lower()

        def mentions(*needles):
            # Plain substring test, so '-' or 'how' match inside longer words.
            return any(needle in question_lower for needle in needles)

        if mentions('calculate', 'math', '+', '-', '*', '/', 'equals'):
            return f"This appears to be a mathematical question. Error occurred: {error}. Please verify the calculation manually."
        if mentions('when', 'date', 'year', 'time'):
            return f"This appears to be a date/time related question. Error occurred: {error}. Please search for current information."
        if mentions('who', 'what', 'where', 'how'):
            return f"This appears to be a factual question. Error occurred: {error}. Please search for current information."
        return f"I encountered an error while processing your question: {error}. Please try rephrasing your question."
376
-
377
- # --- Constants ---
378
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
379
 
380
- # --- Evaluation & Submission ---
381
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch the evaluation questions, answer each one and submit the answers.

    Args:
        profile: The logged-in Hugging Face profile, or None when nobody is
            logged in.

    Returns:
        A ``(status_message, results)`` pair. ``results`` is None when the run
        stops before any question is processed, otherwise a DataFrame with one
        row per processed question.
    """

    def clip(text, limit):
        # Shorten long strings for the results table only.
        return text[:limit] + "..." if len(text) > limit else text

    space_id = os.getenv("SPACE_ID")
    if not profile:
        return "Please login to Hugging Face.", None
    username = profile.username
    print(f"👤 User: {username}")

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    questions_url = f"{DEFAULT_API_URL}/questions"
    submit_url = f"{DEFAULT_API_URL}/submit"

    try:
        agent = ImprovedWebSearchAgent()
    except Exception as init_err:
        return f"Agent initialization error: {init_err}", None

    try:
        reply = requests.get(questions_url, timeout=15)
        reply.raise_for_status()
        questions = reply.json()
        if not questions:
            return "No questions received.", None

        print(f"📝 Received {len(questions)} questions")

    except Exception as fetch_err:
        return f"Failed to fetch questions: {fetch_err}", None

    results_log, answers_payload = [], []

    for index, item in enumerate(questions, 1):
        task_id, question = item.get("task_id"), item.get("question")
        if not (task_id and question):
            continue

        print(f"\n📋 Processing question {index}/{len(questions)}: {task_id}")

        try:
            answer = agent(question)

            # Ensure answer is not empty
            if not answer or len(answer.strip()) < 2:
                answer = "Unable to determine answer from available information."

            results_log.append({
                "Task ID": task_id,
                "Question": clip(question, 100),
                "Submitted Answer": clip(answer, 200),
            })
            answers_payload.append({"task_id": task_id, "submitted_answer": answer})

            print(f"✅ Answer generated for {task_id}")

        except Exception as run_err:
            error_msg = f"Agent error: {str(run_err)[:100]}"
            print(f"❌ Error for {task_id}: {error_msg}")

            results_log.append({
                "Task ID": task_id,
                "Question": clip(question, 100),
                "Submitted Answer": error_msg,
            })
            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": "Error processing question",
            })

    if not answers_payload:
        return "No answers were generated.", pd.DataFrame(results_log)

    print(f"\n🚀 Submitting {len(answers_payload)} answers...")

    try:
        submission = {
            "username": username.strip(),
            "agent_code": agent_code,
            "answers": answers_payload,
        }
        reply = requests.post(submit_url, json=submission, timeout=120)  # Increased timeout
        reply.raise_for_status()
        result = reply.json()

        score = result.get('score', 0)
        correct_count = result.get('correct_count', 0)
        total_attempted = result.get('total_attempted', len(answers_payload))

        status = "".join((
            f"✅ Submission Successful!\n",
            f"User: {result.get('username')}\n",
            f"Score: {score}% ({correct_count}/{total_attempted} correct)\n",
            f"Message: {result.get('message', 'No message')}\n",
            f"Total questions processed: {len(questions)}",
        ))

        print(f"🎯 Final Score: {score}%")

        return status, pd.DataFrame(results_log)

    except Exception as submit_err:
        error_msg = f"Submission failed: {submit_err}"
        print(error_msg)
        return error_msg, pd.DataFrame(results_log)
492
 
493
- # --- UI ---
494
# Page layout: heading, feature list, login button, run button, then the
# status box and the answers table that the run button fills in.
with gr.Blocks(title="Enhanced GAIA Agent") as demo:
    gr.Markdown("# 🤖 Enhanced GAIA Agent with Advanced Reasoning")
    gr.Markdown(
        "\n"
        "**Improvements in this version:**\n"
        "- 🧠 Enhanced multi-step reasoning capabilities\n"
        "- 🔍 Multiple specialized search tools\n"
        "- 🧮 Advanced calculator with better math support\n"
        "- 📅 Date and time calculation tools\n"
        "- 📝 Text analysis capabilities\n"
        "- ⚡ Better error handling and fallback mechanisms\n"
        "- 🎯 Optimized for GAIA benchmark performance\n"
    )

    gr.LoginButton()

    with gr.Row():
        run_btn = gr.Button(
            "🚀 Run Enhanced Evaluation & Submit",
            variant="primary",
            scale=2,
        )

    status_box = gr.Textbox(label="📊 Status & Results", lines=8, interactive=False)
    result_table = gr.DataFrame(label="📋 Agent Answers Log", interactive=False)

    # The click handler returns (status text, results table) in this order.
    click_outputs = [status_box, result_table]
    run_btn.click(fn=run_and_submit_all, outputs=click_outputs, show_progress=True)

if __name__ == "__main__":
    demo.launch(debug=True, share=False)
 
5
  from smolagents import ToolCallingAgent, tool
6
  from duckduckgo_search import DDGS
7
  import math
8
+ from datetime import datetime
9
  import re
 
 
 
10
 
11
  # --- Enhanced Tools ---
12
  @tool
13
def enhanced_search(query: str, num_results: int = 3) -> str:
    """Search the web with DuckDuckGo and return the hits that pass a quality filter.

    A hit is kept when its body is longer than 30 characters and its title does
    not contain one of the words "advertisement", "sponsored", "ad" or "buy" as
    a whole word.

    Args:
        query: The search query string.
        num_results: Maximum number of results to request from the search.

    Returns:
        The kept hits as "## title", body and "URL: ..." sections separated by
        blank lines, "No quality results found." when nothing is kept, or a
        "Search error: ..." message when the lookup raises.
    """
    # Whole-word match: a plain substring test on "ad" would also reject
    # titles such as "Read", "Canada" or "Download".
    promo_title = re.compile(r"\b(?:advertisement|sponsored|ad|buy)\b", re.IGNORECASE)

    try:
        with DDGS() as ddgs:
            hits = ddgs.text(query, max_results=num_results) or []

        sections = []
        for hit in hits:
            # .get() keeps one malformed hit from discarding the whole search.
            title = hit.get("title") or ""
            body = hit.get("body") or ""
            if len(body) <= 30 or promo_title.search(title):
                continue
            sections.append(f"## {title}\n{body}\nURL: {hit.get('href', '')}")

        return "\n\n".join(sections) if sections else "No quality results found."
    except Exception as e:
        return f"Search error: {e}"
29
 
30
  @tool
31
def scientific_calculator(expression: str) -> str:
    """Evaluate a mathematical expression using names from the ``math`` module.

    Besides the public ``math`` functions and constants, ``abs``, ``round``,
    ``min``, ``max`` and ``sum`` are available.

    Args:
        expression: A Python-syntax arithmetic expression, for example "sqrt(16) + 2 * pi".

    Returns:
        The result as text (floats rounded to 6 decimal places), or a
        "Calculation error: ..." message when evaluation fails.
    """
    # NOTE(security): eval() with empty builtins is NOT a sandbox. The
    # expression comes from model output, so treat it as untrusted; consider an
    # ast-based evaluator if this tool is exposed more widely.
    allowed_names = {k: v for k, v in vars(math).items() if not k.startswith("_")}
    # Builtins are stripped below, so re-expose the harmless numeric helpers.
    allowed_names.update({"abs": abs, "round": round, "min": min, "max": max, "sum": sum})

    try:
        result = eval(str(expression).strip(), {"__builtins__": {}}, allowed_names)
        return str(round(result, 6)) if isinstance(result, float) else str(result)
    except Exception as e:
        return f"Calculation error: {e}"
39
 
40
  @tool
41
def get_current_date() -> str:
    """Return the current local date and time formatted as YYYY-MM-DD HH:MM:SS."""
    now = datetime.now()
    return now.strftime("%Y-%m-%d %H:%M:%S")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  @tool
46
def unit_converter(amount: float, from_unit: str, to_unit: str) -> str:
    """Convert a quantity between a small set of common units.

    Supported pairs, in either direction: miles/kilometers, pounds/kilograms
    and fahrenheit/celsius. Unit names are matched case-insensitively.

    Args:
        amount: The numeric quantity to convert.
        from_unit: The unit the amount is expressed in, for example "miles".
        to_unit: The unit to convert the amount into, for example "kilometers".

    Returns:
        The converted value rounded to 4 decimal places followed by ``to_unit``,
        "Invalid amount" when the amount is not numeric, or an
        "Unsupported conversion: ..." message for unknown unit pairs.
    """
    conversions = {
        ('miles', 'kilometers'): lambda x: x * 1.60934,
        ('kilometers', 'miles'): lambda x: x / 1.60934,
        ('pounds', 'kilograms'): lambda x: x * 0.453592,
        ('kilograms', 'pounds'): lambda x: x / 0.453592,
        ('fahrenheit', 'celsius'): lambda x: (x - 32) * 5 / 9,
        ('celsius', 'fahrenheit'): lambda x: x * 9 / 5 + 32,
    }

    key = (str(from_unit).strip().lower(), str(to_unit).strip().lower())
    convert = conversions.get(key)
    if convert is None:
        return f"Unsupported conversion: {from_unit} → {to_unit}"

    try:
        value = float(amount)
    except (TypeError, ValueError):
        # Only a non-numeric amount is reported here; other errors surface.
        return "Invalid amount"
    return f"{round(convert(value), 4)} {to_unit}"
61
+
62
+ # --- Agent Core ---
63
class GAIAAgent:
    """Callable wrapper that cleans a question, runs the agent and trims the reply."""

    def __init__(self):
        """Create the tool-calling agent and an empty session history."""
        # The agent needs a model object rather than a bare model-name string.
        # Imported locally so the file's import block is left untouched.
        from smolagents import OpenAIServerModel

        self.agent = ToolCallingAgent(
            # Underscores instead of hyphens: smolagents expects the agent
            # name to be a valid Python identifier.
            name="GAIA_HF_Agent",
            description="Specialized agent for GAIA tasks",
            tools=[enhanced_search, scientific_calculator, get_current_date, unit_converter],
            model=OpenAIServerModel(model_id="gpt-4-turbo"),  # or "gpt-3.5-turbo" if unavailable
            planning_interval=5,
            # smolagents names the step limit ``max_steps``; ``max_iterations``
            # is not an accepted keyword.
            max_steps=10,
        )
        # (original question, processed answer) pairs, appended by __call__.
        self.session_history = []

    def preprocess_question(self, question: str) -> str:
        """Strip bracketed numeric citations and the literal markers "(a)" / "(b)"."""
        question = re.sub(r'\[\d+\]', '', question)  # Remove citations
        question = question.replace("(a)", "").replace("(b)", "")  # Remove options
        return question.strip()

    def postprocess_answer(self, answer: str) -> str:
        """Reduce an agent reply to its most specific value.

        Returns the last ISO date (YYYY-MM-DD) in the reply when there is one,
        otherwise the last number, otherwise the first 500 characters of the
        reply. Non-string replies are converted with ``str`` first.
        """
        text = "" if answer is None else str(answer)

        dates = re.findall(r'\d{4}-\d{2}-\d{2}', text)
        if dates:
            return dates[-1]

        # A decimal point only counts when digits follow it, so the period
        # ending "The answer is 42." is not captured as part of the number.
        numbers = re.findall(r'\d+(?:\.\d+)?', text)
        if numbers:
            return numbers[-1]

        return text[:500]  # Limit length

    def __call__(self, question: str) -> str:
        """Answer one question; failures come back as an "Agent error: ..." string."""
        try:
            clean_q = self.preprocess_question(question)
            print(f"Processing: {clean_q}")

            answer = self.agent.run(clean_q)
            processed = self.postprocess_answer(answer)
            self.session_history.append((question, processed))
            return processed
        except Exception as e:
            return f"Agent error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # --- HF Space Integration ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
106
 
107
def run_and_submit(profile: gr.OAuthProfile | None):
    """Fetch the evaluation questions, answer them and submit the answers.

    Args:
        profile: The logged-in Hugging Face profile, or None when nobody is
            logged in.

    Returns:
        A ``(status_message, results)`` pair. ``results`` is None when the run
        stops before any question is processed, otherwise a DataFrame with one
        row per processed question.
    """
    if not profile:
        return "Please log in to submit", None

    space_id = os.getenv("SPACE_ID")

    # Agent construction can fail (missing credentials, bad configuration);
    # report that in the status box instead of crashing the click handler.
    try:
        agent = GAIAAgent()
    except Exception as e:
        return f"Agent initialization error: {e}", None

    # Fetch questions
    try:
        response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
        response.raise_for_status()  # surface HTTP errors before parsing JSON
        questions = response.json()
    except Exception as e:
        return f"Failed to get questions: {e}", None
    if not questions:
        return "No questions received", None

    # Process questions
    results = []
    answers = []
    for item in questions[:20]:  # Limit to 20 for testing
        task_id = item.get("task_id")
        question = item.get("question")
        if not task_id or not question:
            continue

        answer = agent(question)
        results.append({
            "Task ID": task_id,
            "Question": question,
            "Answer": answer
        })
        answers.append({
            "task_id": task_id,
            "submitted_answer": answer
        })

    if not answers:
        return "No answers were generated", pd.DataFrame(results)

    # Submit answers
    try:
        response = requests.post(
            f"{DEFAULT_API_URL}/submit",
            json={
                "username": profile.username,
                "agent_code": f"https://huggingface.co/spaces/{space_id}",
                "answers": answers
            },
            timeout=60
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        return f"Submission failed: {e}", pd.DataFrame(results)

    # The original return tuple was never closed; it is built explicitly here.
    status = (
        f"✅ Submitted {len(answers)} answers\n"
        f"Score: {data.get('score', 'N/A')}%\n"
        f"Correct: {data.get('correct_count', '?')}/{data.get('total_attempted', '?')}\n"
        f"Message: {data.get('message', '')}"
    )
    return status, pd.DataFrame(results)
 
 
163
 
164
+ # --- Gradio UI ---
165
# Page layout: heading, instructions, login button, run button, then the
# status box and the answers table that the run button fills in.
with gr.Blocks(title="GAIA Agent") as demo:
    gr.Markdown("## 🚀 GAIA Task Agent")
    gr.Markdown("Login and click submit to run evaluation")

    login = gr.LoginButton()
    submit_btn = gr.Button("Run & Submit Answers", variant="primary")

    status = gr.Textbox(label="Submission Status", interactive=False)
    results = gr.DataFrame(label="Processed Answers")

    # The click handler returns (status text, results table) in this order.
    click_outputs = [status, results]
    submit_btn.click(fn=run_and_submit, inputs=None, outputs=click_outputs)

if __name__ == "__main__":
    demo.launch(debug=True)