sirine1712 committed on
Commit
a3a13ef
·
verified ·
1 Parent(s): 4a0cb0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -126
app.py CHANGED
@@ -4,84 +4,172 @@ import requests
4
  import pandas as pd
5
  import json
6
  import time
 
7
  from typing import Dict, List, Any, Optional
8
 
9
  # Config
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
- MODEL_NAME = "microsoft/DialoGPT-medium" # Better conversational model
12
  SPACE_ID = os.getenv("SPACE_ID", "sirine1712/Final_Assignment_Template")
13
  HF_TOKEN = os.getenv("HF_TOKEN")
14
 
15
- class HuggingFaceAPIAgent:
16
- """Enhanced Hugging Face Inference Agent with better question processing"""
17
 
18
  def __init__(self, model: str = MODEL_NAME):
19
  self.model = model
20
  self.api_url = f"https://api-inference.huggingface.co/models/{model}"
21
- self.headers = {"Authorization": f"Bearer {HF_TOKEN}"}
22
-
23
def preprocess_question(self, question: str) -> str:
    """Wrap a question in a type-specific prompt for the model.

    The lower-cased question is scanned for trigger substrings, one rule at
    a time in the order listed below; the first rule that matches decides
    the label put in front of the question and the instruction appended
    after it.  When no rule matches, the question is only prefixed with
    ``"Question: "``.

    Args:
        question: Raw question text; surrounding whitespace is stripped
            before it is embedded in the prompt.

    Returns:
        The prompt string to send to the model.
    """
    body = question.strip()
    haystack = question.lower()

    # (trigger substrings, label, trailing instruction) -- order matters,
    # the first rule with any trigger present wins.
    rules = (
        (('calculate', 'compute', 'math', 'number'),
         "Math problem", "Please provide the numerical answer."),
        (('when', 'what year', 'date'),
         "Factual question about time", "Please provide the specific date or year."),
        (('who', 'person', 'people'),
         "Question about people", "Please provide the name(s)."),
        (('where', 'location', 'place'),
         "Location question", "Please provide the specific location."),
        (('how many', 'count', 'quantity'),
         "Counting question", "Please provide the exact number."),
    )

    for triggers, label, instruction in rules:
        if any(trigger in haystack for trigger in triggers):
            return f"{label}: {body} {instruction}"

    return f"Question: {body}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
def postprocess_answer(self, raw_answer: str, question: str) -> str:
    """Clean and shorten the raw text returned by the model.

    Steps, in order:
      1. Empty or whitespace-only output yields a fixed fallback message.
      2. Known prompt/answer labels ("Answer:", "Question:", ...) are
         stripped from the front, repeatedly, so stacked labels such as
         "Answer: Question: ..." are removed completely.
      3. For questions containing 'calculate', 'compute' or 'math', the last
         number in the text is returned on its own.
      4. Anything longer than 200 characters is reduced to its first
         sentence and hard-capped at 200 characters.

    Args:
        raw_answer: Text generated by the model (may be empty or None).
        question: The original question, used only to detect math questions.

    Returns:
        The cleaned answer, or "Unable to generate answer" when nothing
        usable remains.
    """
    import re  # local import: this version of the file has no module-level `re`

    fallback = "Unable to generate answer"
    if not raw_answer or not raw_answer.strip():
        return fallback

    answer = raw_answer.strip()
    prefixes_to_remove = [
        "Question:", "Answer:", "Response:", "Output:",
        "The answer is:", "Based on the question:",
        "Math problem:", "Factual question about time:",
        "Question about people:", "Location question:",
        "Counting question:"
    ]

    # Keep stripping until a full pass removes nothing; every strip shortens
    # the text, so the loop always terminates.
    stripped_any = True
    while stripped_any:
        stripped_any = False
        for prefix in prefixes_to_remove:
            if answer[:len(prefix)].lower() == prefix.lower():
                answer = answer[len(prefix):].strip()
                stripped_any = True

    if not answer:
        return fallback

    if any(word in question.lower() for word in ['calculate', 'compute', 'math']):
        # The fractional part is only taken when digits follow the dot, so a
        # sentence-ending period ("... is 42.") is not glued to the number.
        numbers = re.findall(r'-?\d+(?:\.\d+)?', answer)
        if numbers:
            return numbers[-1]  # the last number is taken as the final result

    if len(answer) > 200:
        first_sentence, period, _ = answer.partition('.')
        if period:
            answer = first_sentence + '.'
        # Cap the length even when there is no period or the first sentence
        # is itself too long.
        answer = answer[:200]

    return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def __call__(self, question: str) -> str:
77
  """Main method to process questions"""
78
- print(f" Processing question: {question[:80]}...")
 
 
 
 
79
 
80
  try:
81
- # Preprocess the question
82
- processed_question = self.preprocess_question(question)
 
 
 
83
 
84
- # Make API call with retry logic
85
  max_retries = 3
86
  for attempt in range(max_retries):
87
  try:
@@ -89,66 +177,101 @@ class HuggingFaceAPIAgent:
89
  self.api_url,
90
  headers=self.headers,
91
  json={
92
- "inputs": processed_question,
93
  "parameters": {
94
- "max_length": 150,
95
- "temperature": 0.3, # Lower temperature for more focused answers
96
- "do_sample": True,
97
- "top_p": 0.9
98
  }
99
  },
100
- timeout=15
101
  )
102
 
103
- if response.status_code == 503: # Model loading
104
- print(f" Model loading, waiting... (attempt {attempt + 1})")
105
- time.sleep(10)
 
 
 
 
 
 
 
 
 
 
106
  continue
107
 
108
  response.raise_for_status()
109
- output = response.json()
110
 
111
- # Extract generated text
112
- if isinstance(output, list) and len(output) > 0:
113
- raw_answer = output[0].get("generated_text", "")
114
- elif isinstance(output, dict):
115
- raw_answer = output.get("generated_text", "")
 
 
 
116
  else:
117
- raw_answer = str(output)
118
 
119
- # Postprocess the answer
120
- final_answer = self.postprocess_answer(raw_answer, question)
121
- print(f"✅ Generated answer: {final_answer[:60]}...")
122
  return final_answer
123
 
124
  except requests.exceptions.RequestException as e:
125
  if attempt == max_retries - 1:
126
- raise e
127
  print(f"⚠️ Request failed (attempt {attempt + 1}), retrying...")
128
- time.sleep(2)
129
 
130
  except Exception as e:
131
- error_msg = f"Error processing question: {str(e)}"
132
  print(f"❌ {error_msg}")
133
  return error_msg
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def run_and_submit_all(profile: gr.OAuthProfile | None):
136
  """Main function to run agent on all questions and submit results"""
137
  if not profile:
138
  return "❌ Please log in with your Hugging Face account first.", None
139
 
 
 
 
 
 
140
  username = profile.username or "anonymous"
141
  agent_code = f"https://huggingface.co/spaces/{SPACE_ID}/tree/main"
142
 
143
- print(f"🚀 Starting agent run for user: {username}")
 
144
 
145
  # Initialize the agent
146
- agent = HuggingFaceAPIAgent()
147
 
148
  # Fetch questions from GAIA API
149
  try:
150
  print("📥 Fetching questions from GAIA API...")
151
- questions_response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
152
  questions_response.raise_for_status()
153
  questions = questions_response.json()
154
  print(f"✅ Retrieved {len(questions)} questions")
@@ -160,31 +283,47 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
160
  # Process each question
161
  answers = []
162
  log_entries = []
 
163
 
164
  for i, q in enumerate(questions, 1):
165
- print(f"\n🔄 Processing question {i}/{len(questions)}")
 
166
  print(f"Task ID: {q.get('task_id', 'Unknown')}")
 
167
 
168
  try:
169
  # Get answer from agent
170
  answer = agent(q["question"])
 
 
 
 
 
 
 
171
  except Exception as e:
172
  answer = f"Error: {str(e)}"
173
- print(f"❌ Error processing question: {e}")
 
174
 
175
  # Prepare submission format
176
  answers.append({
177
  "task_id": q["task_id"],
178
- "submitted_answer": answer
179
  })
180
 
181
  # Log for display
182
  log_entries.append({
183
  "Task ID": q["task_id"],
184
- "Question": q["question"][:100] + "..." if len(q["question"]) > 100 else q["question"],
185
- "Submitted Answer": answer[:100] + "..." if len(str(answer)) > 100 else str(answer),
186
- "Status": "✅ Completed" if "Error:" not in str(answer) else "❌ Failed"
187
  })
 
 
 
 
 
188
 
189
  # Submit answers to GAIA scoring API
190
  try:
@@ -198,35 +337,40 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
198
  submit_response = requests.post(
199
  f"{DEFAULT_API_URL}/submit",
200
  json=submission_data,
201
- timeout=30
202
  )
203
  submit_response.raise_for_status()
204
  result = submit_response.json()
205
 
206
  print(f"✅ Submission successful!")
207
- print(f"Score: {result.get('score', 'N/A')}%")
208
 
209
  except Exception as e:
210
  error_msg = f"❌ Submission failed: {str(e)}"
211
  print(error_msg)
212
  return error_msg, pd.DataFrame(log_entries)
213
 
214
- # Format success message
215
  score = result.get('score', 'N/A')
216
  correct_count = result.get('correct_count', 'N/A')
217
  total_attempted = result.get('total_attempted', 'N/A')
218
  message = result.get('message', 'No additional message')
219
 
220
- success_message = f"""✅ **Submission Complete!**
221
 
222
  **📊 Results:**
223
  - **Score:** {score}%
224
  - **Correct Answers:** {correct_count}/{total_attempted}
225
- - **Total Questions:** {len(questions)}
 
226
 
227
- **📝 Message:** {message}
228
 
229
- **🎯 Target:** 30% ({"✅ ACHIEVED!" if isinstance(score, (int, float)) and score >= 30 else "Keep trying!"})
 
 
 
 
 
230
  """
231
 
232
  print(success_message)
@@ -236,62 +380,85 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
236
  def create_interface():
237
  """Create the Gradio interface"""
238
  with gr.Blocks(
239
- title="🤖 GAIA Challenge Agent",
240
- theme=gr.themes.Soft()
 
 
 
 
 
 
 
241
  ) as demo:
242
 
243
  gr.Markdown("""
244
- # 🤖 GAIA Challenge Agent
245
 
246
- An AI agent built to tackle the GAIA benchmark questions using Hugging Face models.
247
 
248
- **Target:** Achieve 30% accuracy on GAIA evaluation questions.
249
 
250
- **Instructions:**
251
- 1. Log in with your Hugging Face account
252
- 2. Click "🚀 Run Agent & Submit" to start the evaluation
253
- 3. Wait for the agent to process all questions and submit results
254
  """)
255
 
256
- # Login section
 
 
 
 
 
 
 
 
 
257
  gr.Markdown("### 🔐 Authentication")
258
- gr.LoginButton(value="Login with Hugging Face")
259
 
260
- # Control section
261
- gr.Markdown("### 🎮 Controls")
262
- with gr.Row():
263
- run_button = gr.Button(
264
- "🚀 Run Agent & Submit",
265
- variant="primary",
266
- size="lg"
267
- )
268
 
269
- # Results section
270
  gr.Markdown("### 📊 Results")
271
- status_output = gr.Textbox(
272
- label="📋 Status & Results",
273
- lines=8,
274
- max_lines=15,
275
- placeholder="Results will appear here after submission..."
276
- )
 
 
277
 
278
- gr.Markdown("### 📝 Detailed Log")
279
  results_table = gr.DataFrame(
280
- label="Agent Processing Log",
281
- headers=["Task ID", "Question", "Submitted Answer", "Status"],
282
- wrap=True
 
283
  )
284
 
285
  # Event handlers
286
  run_button.click(
287
  fn=run_and_submit_all,
288
- outputs=[status_output, results_table]
 
289
  )
290
 
291
  # Footer
292
  gr.Markdown("""
293
  ---
294
- **Note:** Make sure your `HF_TOKEN` is set in the Space secrets for API access.
 
 
 
 
 
295
  """)
296
 
297
  return demo
 
4
  import pandas as pd
5
  import json
6
  import time
7
+ import re
8
  from typing import Dict, List, Any, Optional
9
 
10
  # Config
11
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
+ MODEL_NAME = "google/flan-t5-large" # Free model that works well
13
  SPACE_ID = os.getenv("SPACE_ID", "sirine1712/Final_Assignment_Template")
14
  HF_TOKEN = os.getenv("HF_TOKEN")
15
 
16
+ class GAIAAgent:
17
+ """Specialized agent for GAIA benchmark questions with proper auth handling"""
18
 
19
  def __init__(self, model: str = MODEL_NAME):
20
  self.model = model
21
  self.api_url = f"https://api-inference.huggingface.co/models/{model}"
22
+ self.headers = self._get_headers()
23
+
24
def _get_headers(self) -> dict:
    """Build the HTTP headers used for Inference API requests.

    The JSON content type is always present.  A Bearer ``Authorization``
    header is added when the module-level ``HF_TOKEN`` is set; when it is
    missing a warning is printed and the request headers carry no
    credentials.
    """
    content_type = {"Content-Type": "application/json"}

    if HF_TOKEN:
        return {"Authorization": f"Bearer {HF_TOKEN}", **content_type}

    print("⚠️ WARNING: HF_TOKEN not found in environment variables")
    return content_type
34
+
35
def _test_api_access(self) -> bool:
    """Probe the model endpoint with a tiny request.

    Returns:
        False when the endpoint answers HTTP 401 or when the probe raises
        any exception; True for every other status code (503 is reported
        as "model is loading" but still counts as reachable).
    """
    try:
        probe = requests.post(
            self.api_url,
            headers=self.headers,
            json={"inputs": "Test connection"},
            timeout=10
        )
        status = probe.status_code

        if status == 401:
            print("❌ Authentication failed - check HF_TOKEN")
            return False

        # Anything that is not an auth failure is treated as reachable.
        print("⏳ Model is loading..." if status == 503 else "✅ API access confirmed")
        return True
    except Exception as e:
        print(f"❌ API test failed: {e}")
        return False
56
+
57
def classify_question_type(self, question: str) -> str:
    """Classify a question so the prompt and answer clean-up can be tailored.

    Categories are tested in a fixed order and the first match wins:
    "mathematical", "factual", "counting", "temporal", otherwise "general".

    Keywords are matched as whole words (optionally followed by a common
    inflection: s, es, d, ed, ing) rather than as raw substrings, so that
    e.g. "summary" does not count as "sum", "language" as "age" or "gold"
    as "old".  Arithmetic operators are handled separately: '+', '*' and
    '=' count anywhere, while '-' and '/' only count between digits, so
    hyphenated words and URLs are not mistaken for arithmetic.

    Args:
        question: The question text.

    Returns:
        One of "mathematical", "factual", "counting", "temporal", "general".
    """
    text = question.lower()

    def mentions(*keywords: str) -> bool:
        # Whole-word / whole-phrase search with optional inflection suffix.
        alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
        return re.search(rf'\b(?:{alternatives})(?:s|es|d|ed|ing)?\b', text) is not None

    # '-' and '/' are common in ordinary prose, so require digits around them.
    has_operator = re.search(r'[+*=]|\d\s*[-/]\s*\d', text) is not None

    # Mathematical/computational questions
    if has_operator or mentions(
        'calculate', 'compute', 'sum', 'multiply', 'divide', 'subtract',
        'average', 'mean', 'percentage', 'ratio', 'equation', 'formula',
        'math', 'mathematics', 'mathematical', 'arithmetic', 'algebra',
    ):
        return "mathematical"

    # Factual/knowledge questions
    if mentions(
        'who is', 'what is', 'when was', 'where is', 'which',
        'born', 'died', 'founded', 'invented', 'discovered',
        'capital', 'president', 'author', 'wrote', 'directed',
    ):
        return "factual"

    # Counting/quantitative questions
    if mentions('how many', 'count', 'number of', 'total', 'quantity'):
        return "counting"

    # Date/time questions
    if mentions(
        'year', 'date', 'century', 'decade', 'month', 'day',
        'age', 'old', 'recent', 'latest', 'first time', 'last time',
    ):
        return "temporal"

    return "general"
92
+
93
def format_prompt_by_type(self, question: str, question_type: str) -> str:
    """Prefix the question with the task tag that matches its type.

    Args:
        question: The question text, inserted unchanged after the tag.
        question_type: Category name; any value without an entry in the
            table below falls back to the generic "answer" tag.

    Returns:
        The prompt in the form ``"<tag>: <question>"``.
    """
    task_tags = {
        "mathematical": "solve",
        "factual": "question",
        "counting": "count",
        "temporal": "when",
    }
    tag = task_tags.get(question_type, "answer")
    return f"{tag}: {question}"
110
 
111
def extract_clean_answer(self, raw_response: str, question: str, question_type: str) -> str:
    """Extract a short final answer from the model's raw response.

    A leading "answer:", "solution:" or "result:" label is removed first.
    Then, depending on the question type:
      * "mathematical": the last number in the response is returned;
      * "counting": the first integer in the response is returned;
      * "temporal": the first four-digit year starting with 19 or 20 is
        returned, otherwise the first d/m/y-style date.
    If nothing type-specific is found, the first sentence (capped at 100
    characters plus "...") is returned; very short first sentences fall
    back to the whole response with the same cap.

    Args:
        raw_response: Text produced by the model (may be empty or None).
        question: The original question; currently unused, kept so the
            call signature stays stable.
        question_type: Category used to choose the extraction rule.

    Returns:
        The cleaned answer, or "Unable to generate answer" when the
        response contains no usable text.
    """
    fallback = "Unable to generate answer"
    if not raw_response or not raw_response.strip():
        return fallback

    # For T5 models the response is often already clean; just drop a label.
    response = re.sub(
        r'^(?:answer:|solution:|result:)\s*', '', raw_response.strip(),
        flags=re.IGNORECASE,
    )
    if not response:
        return fallback

    if question_type == "mathematical":
        # The fractional part requires digits after the dot, so a
        # sentence-ending period is not captured ("42." -> "42").
        numbers = re.findall(r'-?\d+(?:\.\d+)?', response)
        if numbers:
            return numbers[-1]  # the last number is taken as the result

    elif question_type == "counting":
        numbers = re.findall(r'\d+', response)
        if numbers:
            return numbers[0]

    elif question_type == "temporal":
        # Non-capturing group: with a capturing group, findall would yield
        # only the century prefix ("19"/"20") instead of the whole year.
        year = re.search(r'\b(?:19|20)\d{2}\b', response)
        if year:
            return year.group(0)

        date = re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', response)
        if date:
            return date.group(0)

    # A sentence ends at a period followed by whitespace or end of text, so
    # decimals such as "3.14" are not split in the middle.
    first_sentence = re.split(r'\.(?=\s|$)', response, maxsplit=1)[0]
    if len(first_sentence) > 5:
        clean_answer = first_sentence.strip()
        if len(clean_answer) > 100:
            clean_answer = clean_answer[:100] + "..."
        return clean_answer

    # Fallback: return first 100 characters
    return response[:100] + "..." if len(response) > 100 else response
156
 
157
  def __call__(self, question: str) -> str:
158
  """Main method to process questions"""
159
+ print(f"🔍 Processing: {question[:60]}...")
160
+
161
+ # Check API access first
162
+ if not self._test_api_access():
163
+ return "API authentication failed - check HF_TOKEN"
164
 
165
  try:
166
+ # Classify and format the question
167
+ question_type = self.classify_question_type(question)
168
+ formatted_prompt = self.format_prompt_by_type(question, question_type)
169
+
170
+ print(f"📝 Question type: {question_type}")
171
 
172
+ # Make API call with retries
173
  max_retries = 3
174
  for attempt in range(max_retries):
175
  try:
 
177
  self.api_url,
178
  headers=self.headers,
179
  json={
180
+ "inputs": formatted_prompt,
181
  "parameters": {
182
+ "max_new_tokens": 100,
183
+ "temperature": 0.1, # Very low temperature for precise answers
184
+ "do_sample": False, # Deterministic output
185
+ "return_full_text": False
186
  }
187
  },
188
+ timeout=20
189
  )
190
 
191
+ if response.status_code == 401:
192
+ return "Authentication error - invalid HF_TOKEN"
193
+
194
+ elif response.status_code == 503: # Model loading
195
+ wait_time = 15 + (attempt * 10)
196
+ print(f"⏳ Model loading, waiting {wait_time}s... (attempt {attempt + 1})")
197
+ time.sleep(wait_time)
198
+ continue
199
+
200
+ elif response.status_code == 429: # Rate limit
201
+ wait_time = 5 + (attempt * 5)
202
+ print(f"⏳ Rate limited, waiting {wait_time}s...")
203
+ time.sleep(wait_time)
204
  continue
205
 
206
  response.raise_for_status()
207
+ result = response.json()
208
 
209
+ # Extract the generated text
210
+ if isinstance(result, list) and len(result) > 0:
211
+ if 'generated_text' in result[0]:
212
+ raw_answer = result[0]['generated_text']
213
+ else:
214
+ raw_answer = str(result[0])
215
+ elif isinstance(result, dict):
216
+ raw_answer = result.get('generated_text', str(result))
217
  else:
218
+ raw_answer = str(result)
219
 
220
+ # Clean and extract the final answer
221
+ final_answer = self.extract_clean_answer(raw_answer, question, question_type)
222
+ print(f"✅ Answer: {final_answer}")
223
  return final_answer
224
 
225
  except requests.exceptions.RequestException as e:
226
  if attempt == max_retries - 1:
227
+ return f"Request failed after {max_retries} attempts: {str(e)}"
228
  print(f"⚠️ Request failed (attempt {attempt + 1}), retrying...")
229
+ time.sleep(3)
230
 
231
  except Exception as e:
232
+ error_msg = f"Processing error: {str(e)}"
233
  print(f"❌ {error_msg}")
234
  return error_msg
235
 
236
def check_environment():
    """Report whether the required configuration values are present.

    Produces one status line for the module-level ``HF_TOKEN`` and one for
    ``SPACE_ID``; a line starts with "✅" when the value is set and with
    "❌" when it is missing.

    Returns:
        The two status lines joined by a newline.
    """
    token_line = (
        "✅ HF_TOKEN found"
        if HF_TOKEN
        else "❌ HF_TOKEN not found in environment variables"
    )
    space_line = (
        f"✅ SPACE_ID: {SPACE_ID}"
        if SPACE_ID
        else "❌ SPACE_ID not configured"
    )
    return "\n".join((token_line, space_line))
251
+
252
  def run_and_submit_all(profile: gr.OAuthProfile | None):
253
  """Main function to run agent on all questions and submit results"""
254
  if not profile:
255
  return "❌ Please log in with your Hugging Face account first.", None
256
 
257
+ # Check environment
258
+ env_status = check_environment()
259
+ if "❌" in env_status:
260
+ return f"Environment check failed:\n{env_status}", None
261
+
262
  username = profile.username or "anonymous"
263
  agent_code = f"https://huggingface.co/spaces/{SPACE_ID}/tree/main"
264
 
265
+ print(f"🚀 Starting GAIA evaluation for user: {username}")
266
+ print(f"🔧 Environment status:\n{env_status}")
267
 
268
  # Initialize the agent
269
+ agent = GAIAAgent()
270
 
271
  # Fetch questions from GAIA API
272
  try:
273
  print("📥 Fetching questions from GAIA API...")
274
+ questions_response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
275
  questions_response.raise_for_status()
276
  questions = questions_response.json()
277
  print(f"✅ Retrieved {len(questions)} questions")
 
283
  # Process each question
284
  answers = []
285
  log_entries = []
286
+ successful_answers = 0
287
 
288
  for i, q in enumerate(questions, 1):
289
+ print(f"\n{'='*60}")
290
+ print(f"🔄 Question {i}/{len(questions)}")
291
  print(f"Task ID: {q.get('task_id', 'Unknown')}")
292
+ print(f"Question: {q['question']}")
293
 
294
  try:
295
  # Get answer from agent
296
  answer = agent(q["question"])
297
+
298
+ if not answer.startswith(("Error:", "Authentication error", "API authentication failed")):
299
+ successful_answers += 1
300
+ status = "✅ Success"
301
+ else:
302
+ status = "❌ Failed"
303
+
304
  except Exception as e:
305
  answer = f"Error: {str(e)}"
306
+ status = "❌ Exception"
307
+ print(f"❌ Exception processing question: {e}")
308
 
309
  # Prepare submission format
310
  answers.append({
311
  "task_id": q["task_id"],
312
+ "submitted_answer": str(answer)
313
  })
314
 
315
  # Log for display
316
  log_entries.append({
317
  "Task ID": q["task_id"],
318
+ "Question": q["question"][:80] + "..." if len(q["question"]) > 80 else q["question"],
319
+ "Answer": str(answer)[:60] + "..." if len(str(answer)) > 60 else str(answer),
320
+ "Status": status
321
  })
322
+
323
+ print(f"Answer: {answer}")
324
+ print(f"Status: {status}")
325
+
326
+ print(f"\n📊 Processing complete: {successful_answers}/{len(questions)} successful")
327
 
328
  # Submit answers to GAIA scoring API
329
  try:
 
337
  submit_response = requests.post(
338
  f"{DEFAULT_API_URL}/submit",
339
  json=submission_data,
340
+ timeout=60
341
  )
342
  submit_response.raise_for_status()
343
  result = submit_response.json()
344
 
345
  print(f"✅ Submission successful!")
 
346
 
347
  except Exception as e:
348
  error_msg = f"❌ Submission failed: {str(e)}"
349
  print(error_msg)
350
  return error_msg, pd.DataFrame(log_entries)
351
 
352
+ # Format results
353
  score = result.get('score', 'N/A')
354
  correct_count = result.get('correct_count', 'N/A')
355
  total_attempted = result.get('total_attempted', 'N/A')
356
  message = result.get('message', 'No additional message')
357
 
358
+ success_message = f"""✅ **GAIA Evaluation Complete!**
359
 
360
  **📊 Results:**
361
  - **Score:** {score}%
362
  - **Correct Answers:** {correct_count}/{total_attempted}
363
+ - **Questions Processed:** {len(questions)}
364
+ - **Successful API Calls:** {successful_answers}/{len(questions)}
365
 
366
+ **🎯 Target Progress:** {"✅ TARGET ACHIEVED!" if isinstance(score, (int, float)) and score >= 30.0 else f"Need {30.0 - (score if isinstance(score, (int, float)) else 0):.1f}% more to reach 30%"}
367
 
368
+ **📝 System Message:** {message}
369
+
370
+ **💡 Tips for improvement:**
371
+ - Ensure HF_TOKEN has proper permissions
372
+ - Try running again if API calls failed
373
+ - Check question types that performed poorly
374
  """
375
 
376
  print(success_message)
 
380
  def create_interface():
381
  """Create the Gradio interface"""
382
  with gr.Blocks(
383
+ title="🎯 GAIA Challenge Agent",
384
+ theme=gr.themes.Soft(),
385
+ css="""
386
+ .status-box {
387
+ background: #f8f9fa;
388
+ border-left: 4px solid #007bff;
389
+ padding: 15px;
390
+ }
391
+ """
392
  ) as demo:
393
 
394
  gr.Markdown("""
395
+ # 🎯 GAIA Challenge Agent
396
 
397
+ **Goal:** Achieve 30% accuracy on the GAIA benchmark
398
 
399
+ This agent uses Google's FLAN-T5-Large model with specialized question processing to tackle GAIA's challenging questions.
400
 
401
+ **Setup Required:**
402
+ 1. Set `HF_TOKEN` in your Space secrets (Settings → Repository secrets)
403
+ 2. Set `SPACE_ID` to your space name (e.g., "username/space-name")
 
404
  """)
405
 
406
+ # Environment check
407
+ with gr.Accordion("🔧 Environment Check", open=False):
408
+ env_check = gr.Textbox(
409
+ value=check_environment(),
410
+ label="Environment Status",
411
+ lines=3,
412
+ interactive=False
413
+ )
414
+
415
+ # Authentication
416
  gr.Markdown("### 🔐 Authentication")
417
+ gr.LoginButton(value="🔑 Login with Hugging Face")
418
 
419
+ # Main controls
420
+ gr.Markdown("### 🚀 Run Evaluation")
421
+ run_button = gr.Button(
422
+ "🎯 Start GAIA Evaluation",
423
+ variant="primary",
424
+ size="lg"
425
+ )
 
426
 
427
+ # Results
428
  gr.Markdown("### 📊 Results")
429
+ with gr.Row():
430
+ status_output = gr.Textbox(
431
+ label="📋 Evaluation Results",
432
+ lines=12,
433
+ max_lines=20,
434
+ placeholder="Click 'Start GAIA Evaluation' to begin...",
435
+ elem_classes=["status-box"]
436
+ )
437
 
438
+ gr.Markdown("### 📝 Question Processing Log")
439
  results_table = gr.DataFrame(
440
+ label="Detailed Processing Results",
441
+ headers=["Task ID", "Question", "Answer", "Status"],
442
+ wrap=True,
443
+ max_height=400
444
  )
445
 
446
  # Event handlers
447
  run_button.click(
448
  fn=run_and_submit_all,
449
+ outputs=[status_output, results_table],
450
+ show_progress=True
451
  )
452
 
453
  # Footer
454
  gr.Markdown("""
455
  ---
456
+ **🔍 Troubleshooting:**
457
+ - **401 Error:** Check that HF_TOKEN is valid and set in Space secrets
458
+ - **503 Error:** Model is loading, wait and try again
459
+ - **0% Score:** Check answer format and question processing logic
460
+
461
+ **📚 Model:** google/flan-t5-large (instruction-tuned for better reasoning)
462
  """)
463
 
464
  return demo