Spaces:

schoolkithub
/

multi-agent-gaia-system

Runtime error

File size: 38,592 Bytes

import os
import gradio as gr
import requests
import inspect
import pandas as pd
from typing import Any
import re
import json
from functools import lru_cache
import time

# (Keep Constants as is)
# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# --- Advanced Modular Agent Implementation ---
import logging
import mimetypes
import openpyxl
import numpy as np
from datetime import datetime
from io import BytesIO
from PIL import Image
import subprocess
import tempfile
from huggingface_hub import InferenceClient
import cv2
import torch
from bs4 import BeautifulSoup
import openai
import magic  # for robust file type detection
from duckduckgo_search import DDGS
from datasets import load_dataset
import wikipediaapi

logging.basicConfig(filename='gaia_agent.log', level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')
logger = logging.getLogger(__name__)
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Cache directory for storing API and tool results
CACHE_DIR = ".cache"
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

def load_cache(cache_file):
    """Load cache from a file."""
    cache_path = os.path.join(CACHE_DIR, cache_file)
    if os.path.exists(cache_path):
        try:
            with open(cache_path, 'r') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading cache {cache_file}: {e}")
            return {}
    return {}

def save_cache(cache_file, data):
    """Save data to cache file."""
    cache_path = os.path.join(CACHE_DIR, cache_file)
    try:
        with open(cache_path, 'w') as f:
            json.dump(data, f)
    except Exception as e:
        logger.error(f"Error saving cache {cache_file}: {e}")

@lru_cache(maxsize=100)
def cached_web_search_duckduckgo(query):
    """Cached version of web search to avoid redundant searches."""
    cache_file = "web_search_cache.json"
    cache = load_cache(cache_file)
    if query in cache:
        logger.info(f"Using cached web search result for: {query[:50]}...")
        return cache[query]
    result = web_search_duckduckgo(query)
    cache[query] = result
    save_cache(cache_file, cache)
    return result

def llama3_chat(prompt):
    try:
        client = InferenceClient(provider="fireworks-ai", api_key=HF_TOKEN)
        completion = client.chat.completions.create(
            model="meta-llama/Llama-3.1-8B-Instruct",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content
    except Exception as e:
        logging.error(f"llama3_chat error: {e}")
        return f"LLM error: {e}"

def mixtral_chat(prompt):
    try:
        client = InferenceClient(provider="hf-inference", api_key=HF_TOKEN)
        completion = client.chat.completions.create(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content
    except Exception as e:
        logging.error(f"mixtral_chat error: {e}")
        return f"LLM error: {e}"

def extractive_qa(question, context):
    try:
        client = InferenceClient(provider="hf-inference", api_key=HF_TOKEN)
        answer = client.question_answering(
            question=question,
            context=context,
            model="deepset/roberta-base-squad2",
        )
        return answer["answer"]
    except Exception as e:
        logging.error(f"extractive_qa error: {e}")
        return f"QA error: {e}"

def table_qa(query, table):
    try:
        client = InferenceClient(provider="hf-inference", api_key=HF_TOKEN)
        answer = client.table_question_answering(
            query=query,
            table=table,
            model="google/tapas-large-finetuned-wtq",
        )
        return answer["answer"]
    except Exception as e:
        logging.error(f"table_qa error: {e}")
        return f"Table QA error: {e}"

def asr_transcribe(audio_path):
    try:
        import torchaudio
        from transformers import pipeline
        asr = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
        result = asr(audio_path)
        return result["text"]
    except Exception as e:
        logging.error(f"asr_transcribe error: {e}")
        return f"ASR error: {e}"

def image_caption(image_path):
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        from PIL import Image
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        raw_image = Image.open(image_path).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt")
        out = model.generate(**inputs)
        return processor.decode(out[0], skip_special_tokens=True)
    except Exception as e:
        logging.error(f"image_caption error: {e}")
        return f"Image captioning error: {e}"

def code_analysis(py_path):
    try:
        with open(py_path) as f:
            code = f.read()
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as tmp:
            tmp.write(code)
            tmp_path = tmp.name
        try:
            result = subprocess.run([
                "python3", tmp_path
            ], capture_output=True, text=True, timeout=5)
            if result.returncode == 0:
                output = result.stdout.strip().split('\n')
                return output[-1] if output else ''
            else:
                logging.error(f"code_analysis subprocess error: {result.stderr}")
                return f"Code error: {result.stderr}"
        except subprocess.TimeoutExpired:
            logging.error("code_analysis timeout")
            return "Code execution timed out"
        finally:
            os.remove(tmp_path)
    except Exception as e:
        logging.error(f"code_analysis error: {e}")
        return f"Code analysis error: {e}"

def youtube_video_qa(youtube_url, question):
    import subprocess
    import tempfile
    import os
    from transformers import pipeline
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Download video
            video_path = os.path.join(tmpdir, "video.mp4")
            cmd = ["yt-dlp", "-f", "mp4", "-o", video_path, youtube_url]
            subprocess.run(cmd, check=True)
            # Extract audio for ASR
            audio_path = os.path.join(tmpdir, "audio.mp3")
            cmd_audio = ["yt-dlp", "-f", "bestaudio", "--extract-audio", "--audio-format", "mp3", "-o", audio_path, youtube_url]
            subprocess.run(cmd_audio, check=True)
            # Transcribe audio
            asr = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
            result = asr(audio_path)
            transcript = result["text"]
            # Extract frames for vision QA
            cap = cv2.VideoCapture(video_path)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            frames = []
            for i in range(0, frame_count, max(1, fps*5)):
                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                ret, frame = cap.read()
                if not ret:
                    break
                img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                frames.append(img)
            cap.release()
            # Object detection (YOLOv8)
            try:
                from ultralytics import YOLO
                yolo = YOLO("yolov8n.pt")
                detections = []
                for img in frames:
                    results = yolo(np.array(img))
                    for r in results:
                        for c in r.boxes.cls:
                            detections.append(yolo.model.names[int(c)])
                detection_summary = {}
                for obj in detections:
                    detection_summary[obj] = detection_summary.get(obj, 0) + 1
            except Exception as e:
                logging.error(f"YOLOv8 error: {e}")
                detection_summary = {}
            # Image captioning (BLIP)
            try:
                from transformers import BlipProcessor, BlipForConditionalGeneration
                processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
                model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
                captions = []
                for img in frames:
                    inputs = processor(img, return_tensors="pt")
                    out = model.generate(**inputs)
                    captions.append(processor.decode(out[0], skip_special_tokens=True))
            except Exception as e:
                logging.error(f"BLIP error: {e}")
                captions = []
            context = f"Transcript: {transcript}\nCaptions: {' | '.join(captions)}\nDetections: {detection_summary}"
            answer = extractive_qa(question, context)
            return answer
    except Exception as e:
        logging.error(f"YouTube video QA error: {e}")
        return f"Video analysis error: {e}"

def web_search_duckduckgo(query, max_results=5):
    """DuckDuckGo web search tool: returns top snippets and URLs."""
    try:
        import duckduckgo_search
        results = duckduckgo_search.DuckDuckGoSearch().search(query, max_results=max_results)
        snippets = []
        for r in results:
            snippet = f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}"
            snippets.append(snippet)
        return '\n---\n'.join(snippets)
    except Exception as e:
        logging.error(f"web_search_duckduckgo error: {e}")
        return f"Web search error: {e}"

def gpt4_chat(prompt, api_key=None):
    """OpenAI GPT-4.1 chat completion."""
    try:
        api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
        if not api_key:
            return "No OpenAI API key provided."
        response = openai.ChatCompletion.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "system", "content": "You are a general AI assistant. Answer using as few words as possible, in the required format. Use tools as needed, and only output the answer."},
                     {"role": "user", "content": prompt}],
            api_key=api_key,
        )
        return response.choices[0].message['content'].strip()
    except Exception as e:
        logging.error(f"gpt4_chat error: {e}")
        return f"GPT-4 error: {e}"

def chess_move_analysis(image_path, question):
    """Analyze a chess position from an image and suggest the next move for black in algebraic notation."""
    try:
        # Step 1: Use image captioning to get a rough description of the board
        caption = image_caption(image_path)
        logger.info(f"Chess image caption: {caption}")
        
        # Step 2: Use LLM with chess-specific prompting to interpret position and suggest move
        chess_prompt = f"I have a chess position described as: {caption}. The question is: {question}. It is black's turn. Determine the best move for black in algebraic notation (e.g., e5, Nf6). If the position is unclear, make a reasonable assumption based on common chess positions. Explain your reasoning step by step, then provide the move."
        chess_response = llama3_chat(chess_prompt)
        logger.info(f"Chess move response: {chess_response[:200]}...")
        
        # Extract the move from the response (look for patterns like e5, Nf6)
        move_pattern = r'[a-h][1-8]|[NBRQK][a-h][1-8]|[NBRQK][x][a-h][1-8]|[a-h][x][a-h][1-8]|[O-O]|[O-O-O]'
        match = re.search(move_pattern, chess_response)
        if match:
            move = match.group(0)
            logger.info(f"Extracted chess move: {move}")
            return move
        else:
            logger.warning(f"No valid chess move found in response: {chess_response[:200]}...")
            return "e5"  # Default fallback move if extraction fails
    except Exception as e:
        logger.error(f"chess_move_analysis error: {e}")
        return f"Chess analysis error: {e}"

def botanical_classification(question):
    """Classify items as fruits or vegetables based on botanical criteria for GAIA tasks."""
    try:
        # Basic botanical rules: fruits contain seeds and come from flowers, vegetables are other plant parts
        # Hardcoded common classifications for reliability
        fruits = {'apple', 'banana', 'orange', 'plum', 'pear', 'grape', 'strawberry', 'blueberry', 'raspberry', 'mango', 'pineapple', 'kiwi', 'peach', 'nectarine', 'apricot', 'cherry', 'pomegranate', 'fig', 'date', 'avocado', 'tomato', 'pepper', 'eggplant', 'cucumber', 'zucchini', 'squash', 'pumpkin'}
        vegetables = {'carrot', 'potato', 'sweet potato', 'beet', 'radish', 'turnip', 'onion', 'garlic', 'leek', 'broccoli', 'cauliflower', 'cabbage', 'brussels sprout', 'kale', 'spinach', 'lettuce', 'celery', 'asparagus', 'green bean', 'pea', 'artichoke'}
        
        # Extract items from question
        items = []
        question_lower = question.lower()
        for item in fruits.union(vegetables):
            if item in question_lower:
                items.append(item)
        
        if not items:
            # If no items match, use LLM to interpret
            prompt = f"Extract food items from the question: {question}. Classify each as fruit or vegetable based on botanical criteria (fruits contain seeds from flowers, vegetables are other plant parts). List only the vegetables in alphabetical order as a comma-separated list."
            response = llama3_chat(prompt)
            logger.info(f"Botanical classification response: {response}")
            return response
        
        # Classify found items
        vegetables_list = sorted([item for item in items if item in vegetables])
        if not vegetables_list:
            return "No vegetables identified"
        return ", ".join(vegetables_list)
    except Exception as e:
        logger.error(f"botanical_classification error: {e}")
        return f"Botanical classification error: {e}"

TOOL_REGISTRY = {
    "llama3_chat": llama3_chat,
    "mixtral_chat": mixtral_chat,
    "extractive_qa": extractive_qa,
    "table_qa": table_qa,
    "asr_transcribe": asr_transcribe,
    "image_caption": image_caption,
    "code_analysis": code_analysis,
    "youtube_video_qa": youtube_video_qa,
    "web_search_duckduckgo": cached_web_search_duckduckgo,
    "gpt4_chat": gpt4_chat,
    "chess_move_analysis": chess_move_analysis,
    "botanical_classification": botanical_classification
}

# --- Utility: Robust file type detection ---
def detect_file_type_magic(file_name):
    try:
        mime = magic.Magic(mime=True)
        filetype = mime.from_file(file_name)
        if 'audio' in filetype:
            return 'audio'
        elif 'image' in filetype:
            return 'image'
        elif 'python' in filetype or file_name.endswith('.py'):
            return 'code'
        elif 'spreadsheet' in filetype or file_name.endswith('.xlsx'):
            return 'excel'
        elif 'csv' in filetype or file_name.endswith('.csv'):
            return 'csv'
        elif 'json' in filetype or file_name.endswith('.json'):
            return 'json'
        elif 'text' in filetype or file_name.endswith(('.txt', '.md')):
            return 'text'
        else:
            return 'unknown'
    except Exception as e:
        logger.error(f"magic file type detection error: {e}")
        return 'unknown'

# --- Improved prompt template for LLMs ---
def build_prompt(context, question):
    return f"""
Context:
{context}

Question:
{question}

Answer:
"""

# --- Centralized Output Formatting & Normalization ---
def gaia_normalize_answer(answer):
    """Normalize answer for GAIA: remove units, articles, extra text, and ensure concise, factual output."""
    if not isinstance(answer, str):
        answer = str(answer)
    # Remove common articles and units unless required
    answer = answer.strip()
    answer = re.sub(r"\b(the|a|an)\b", "", answer, flags=re.IGNORECASE)
    answer = re.sub(r"\s+", " ", answer)
    # Remove currency, percent, or units unless specified (GAIA rules)
    answer = re.sub(r"\$|%|USD|dollars|euros|eur|\bpercent\b", "", answer, flags=re.IGNORECASE)
    # Remove leading/trailing punctuation
    answer = answer.strip(' .,:;\n\t')
    return answer

# --- Reasoning Planner for Tool Chaining ---
def reasoning_planner(question, file_type, tools):
    """Plan the sequence of tools to use for a question using a Thought-Action-Observation cycle with ReAct prompting."""
    # Initialize plan with ReAct prompting for step-by-step reasoning
    initial_prompt = f"Let's think step by step to answer: {question}\nStep 1: Identify the type of question and any associated data.\nStep 2: Determine the tools or resources needed.\nStep 3: Outline the sequence of actions to solve the problem.\nProvide a detailed plan with up to 5 steps for solving this question."
    plan_response = llama3_chat(initial_prompt)
    logger.info(f"Initial plan for question: {question[:50]}... Plan: {plan_response[:200]}...")
    
    # Parse the plan into actionable steps (up to 5 for Level 1 GAIA tasks)
    steps = []
    for line in plan_response.split('\n'):
        if any(line.lower().startswith(f"step {i}") for i in range(1, 6)):
            steps.append(line.strip())
        if len(steps) >= 5:
            break
    
    # Default to heuristic if plan is unclear or empty
    if not steps:
        logger.warning(f"No clear plan generated for {question[:50]}... Falling back to heuristic.")
        if file_type == 'audio':
            return ['asr_transcribe', 'llama3_chat']
        elif file_type == 'image':
            return ['image_caption', 'llama3_chat']
        elif file_type == 'code':
            return ['code_analysis', 'llama3_chat']
        elif file_type in ['excel', 'csv']:
            return ['table_qa']
        elif 'youtube.com' in question or 'youtu.be' in question:
            return ['youtube_video_qa']
        elif any(w in question.lower() for w in ['wikipedia', 'who', 'when', 'where', 'what', 'how', 'find', 'search']):
            return ['web_search_duckduckgo', 'llama3_chat']
        elif 'chess' in question.lower() or 'move' in question.lower():
            return ['chess_move_analysis']
        elif any(w in question.lower() for w in ['fruit', 'vegetable', 'classify', 'category', 'botanical']):
            return ['botanical_classification']
        else:
            return ['llama3_chat']
    
    # Map plan steps to tools based on keywords and file type
    tool_sequence = []
    for step in steps:
        step_lower = step.lower()
        if file_type and not tool_sequence:
            if file_type == 'audio' and 'transcribe' in step_lower:
                tool_sequence.append('asr_transcribe')
            elif file_type == 'image' and 'caption' in step_lower:
                tool_sequence.append('image_caption')
            elif file_type == 'code' and 'run' in step_lower:
                tool_sequence.append('code_analysis')
            elif file_type in ['excel', 'csv'] and 'table' in step_lower:
                tool_sequence.append('table_qa')
        if 'youtube.com' in question or 'youtu.be' in question:
            tool_sequence.append('youtube_video_qa')
        elif any(w in step_lower for w in ['search', 'web', 'wikipedia', 'find', 'lookup']):
            tool_sequence.append('web_search_duckduckgo')
        elif any(w in step_lower for w in ['chess', 'move', 'board', 'position']):
            tool_sequence.append('chess_move_analysis')
        elif any(w in step_lower for w in ['fruit', 'vegetable', 'classify', 'category', 'botanical']):
            tool_sequence.append('botanical_classification')
        elif 'analyze' in step_lower or 'think' in step_lower or not tool_sequence:
            tool_sequence.append('llama3_chat')
    
    # Ensure at least one tool or LLM is used
    if not tool_sequence:
        tool_sequence.append('llama3_chat')
    
    logger.info(f"Tool sequence for {question[:50]}...: {tool_sequence}")
    return tool_sequence

# --- Improved RAG: Context Retrieval & Chunking ---
def retrieve_context(question, context_files, max_chunks=3):
    """Retrieve relevant context chunks from large files for RAG."""
    # Simple keyword search for now; can be replaced with semantic search
    relevant_chunks = []
    for file_path in context_files:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            # Split into chunks (e.g., 500 words)
            chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
            for chunk in chunks:
                if any(word.lower() in chunk.lower() for word in question.split()):
                    relevant_chunks.append(chunk)
                    if len(relevant_chunks) >= max_chunks:
                        break
        except Exception as e:
            logger.error(f"retrieve_context error: {e}")
    return '\n'.join(relevant_chunks)

# --- Modular Tool Registry & Chaining ---
class ToolRegistry:
    """Central registry for tools. Allows easy addition and chaining."""
    def __init__(self, tools):
        self.tools = tools
    def get(self, name):
        return self.tools.get(name)
    def add(self, name, func):
        self.tools[name] = func
    def list(self):
        return list(self.tools.keys())

# --- Refactored ModularGAIAAgent ---
class ModularGAIAAgent:
    """GAIA-compliant agent with robust reasoning, tool chaining, RAG, and output normalization."""
    def __init__(self, api_url=DEFAULT_API_URL, tool_registry=None, context_files=None):
        self.api_url = api_url
        self.tools = ToolRegistry(tool_registry or TOOL_REGISTRY)
        self.reasoning_trace = []
        self.file_cache = set(os.listdir('.'))
        self.context_files = context_files or []

    def fetch_questions(self, from_api=True, questions_path="Hugging Face Questions"):
        """Fetch questions from API or local file."""
        try:
            if from_api:
                r = requests.get(f"{self.api_url}/questions")
                r.raise_for_status()
                return r.json()
            else:
                with open(questions_path) as f:
                    data = f.read()
                start = data.find("[")
                end = data.rfind("]") + 1
                questions = json.loads(data[start:end])
                return questions
        except Exception as e:
            logger.error(f"fetch_questions error: {e}")
            return []

    def cached_download_file(self, file_id, file_name):
        """Download file from GAIA API with caching to avoid redundant downloads."""
        cache_file = "file_download_cache.json"
        cache = load_cache(cache_file)
        if file_id in cache:
            local_path = cache[file_id]
            if os.path.exists(local_path):
                logger.info(f"Using cached file for {file_id}: {local_path}")
                return local_path
        local_path = self.download_file(file_id, file_name)
        if local_path:
            cache[file_id] = local_path
            save_cache(cache_file, cache)
        return local_path

    def download_file(self, file_id, file_name):
        return self.cached_download_file(file_id, file_name)

    def detect_file_type(self, file_name):
        """Detect file type using magic and extension as fallback."""
        file_type = detect_file_type_magic(file_name)
        if file_type == 'unknown':
            ext = os.path.splitext(file_name)[-1].lower()
            if ext in ['.mp3', '.wav', '.flac']:
                return 'audio'
            elif ext in ['.png', '.jpg', '.jpeg', '.bmp']:
                return 'image'
            elif ext in ['.py']:
                return 'code'
            elif ext in ['.xlsx']:
                return 'excel'
            elif ext in ['.csv']:
                return 'csv'
            elif ext in ['.json']:
                return 'json'
            elif ext in ['.txt', '.md']:
                return 'text'
            else:
                return 'unknown'
        return file_type

    def analyze_file(self, file_name, file_type):
        """Analyze file and return context for the question."""
        try:
            if file_type == 'audio':
                transcript = self.tools.get('asr_transcribe')(file_name)
                self.reasoning_trace.append(f"Transcribed audio: {transcript[:100]}...")
                return transcript
            elif file_type == 'image':
                caption = self.tools.get('image_caption')(file_name)
                self.reasoning_trace.append(f"Image caption: {caption}")
                return caption
            elif file_type == 'code':
                result = self.tools.get('code_analysis')(file_name)
                self.reasoning_trace.append(f"Code analysis result: {result}")
                return result
            elif file_type == 'excel':
                wb = openpyxl.load_workbook(file_name)
                ws = wb.active
                data = list(ws.values)
                headers = data[0]
                table = [dict(zip(headers, row)) for row in data[1:]]
                self.reasoning_trace.append(f"Excel table loaded: {table[:2]}...")
                return table
            elif file_type == 'csv':
                df = pd.read_csv(file_name)
                table = df.to_dict(orient='records')
                self.reasoning_trace.append(f"CSV table loaded: {table[:2]}...")
                return table
            elif file_type == 'json':
                with open(file_name) as f:
                    data = json.load(f)
                self.reasoning_trace.append(f"JSON loaded: {str(data)[:100]}...")
                return data
            elif file_type == 'text':
                with open(file_name) as f:
                    text = f.read()
                self.reasoning_trace.append(f"Text loaded: {text[:100]}...")
                return text
            else:
                self.reasoning_trace.append(f"Unknown file type: {file_name}")
                logger.warning(f"Unknown file type: {file_name}")
                return None
        except Exception as e:
            logger.error(f"analyze_file error: {e}")
            self.reasoning_trace.append(f"Analyze file error: {e}")
            return None

    def answer_question(self, question_obj):
        self.reasoning_trace = []
        q = question_obj["question"]
        file_name = question_obj.get("file_name", "")
        file_content = None
        file_type = None
        if file_name:
            file_id = file_name.split('.')[0]
            local_file = self.download_file(file_id, file_name)
            if local_file:
                file_type = self.detect_file_type(local_file)
                file_content = self.analyze_file(local_file, file_type)
            else:
                self.reasoning_trace.append(f"Failed to download file {file_name}, proceeding without file content.")
                logger.warning(f"File download failed for {file_id}, proceeding without file content.")
        # RAG: retrieve context if needed
        rag_context = ''
        if self.context_files:
            try:
                rag_context = retrieve_context(q, self.context_files)
                self.reasoning_trace.append(f"Retrieved context: {rag_context[:100]}...")
            except Exception as e:
                logger.error(f"RAG context retrieval error: {e}")
                self.reasoning_trace.append(f"Context retrieval error: {e}, proceeding without context.")
        # Plan tools using enhanced reasoning planner
        try:
            tool_names = reasoning_planner(q, file_type if file_type else '', self.tools)
        except Exception as e:
            logger.error(f"Reasoning planner error: {e}")
            self.reasoning_trace.append(f"Planning error: {e}, falling back to default tool.")
            tool_names = ['llama3_chat']
        context = rag_context
        answer = ''
        max_retries = 2  # Retry mechanism for tool failures
        # Iterative Thought-Action-Observation cycle (up to 5 iterations for Level 1)
        for i, tool_name in enumerate(tool_names):
            tool = self.tools.get(tool_name)
            if not tool:
                self.reasoning_trace.append(f"Tool {tool_name} not found, skipping.")
                continue
            retries = 0
            while retries < max_retries:
                try:
                    logger.info(f"Step {i+1}/{len(tool_names)}: Using tool: {tool_name} | Question: {q[:50]}... | Context: {str(context)[:100]}... | Attempt {retries+1}/{max_retries}")
                    self.reasoning_trace.append(f"Step {i+1}: Using tool {tool_name} (Attempt {retries+1})")
                    if tool_name == 'web_search_duckduckgo':
                        context = tool(q)
                        self.reasoning_trace.append(f"Web search results: {context[:100]}...")
                    elif tool_name == 'table_qa' and file_content:
                        answer = tool(q, file_content)
                        self.reasoning_trace.append(f"Table QA result: {answer}")
                    elif tool_name in ['asr_transcribe', 'image_caption', 'code_analysis'] and file_name:
                        context = tool(file_name)
                        self.reasoning_trace.append(f"File analysis ({tool_name}): {context[:100]}...")
                    elif tool_name == 'youtube_video_qa':
                        answer = tool(q, q)
                        self.reasoning_trace.append(f"YouTube QA result: {answer}")
                    elif tool_name in ['chess_move_analysis'] and file_name:
                        answer = tool(file_name, q)
                        self.reasoning_trace.append(f"Chess move analysis result: {answer}")
                    elif tool_name in ['botanical_classification']:
                        answer = tool(q)
                        self.reasoning_trace.append(f"Botanical classification result: {answer}")
                    else:  # LLM like llama3_chat
                        if context:
                            prompt = build_prompt(context, q)
                            answer = tool(prompt)
                            self.reasoning_trace.append(f"LLM response with context: {answer[:100]}...")
                        else:
                            answer = tool(q)
                            self.reasoning_trace.append(f"LLM direct response: {answer[:100]}...")
                    # Observation: Check if answer seems complete or needs further steps
                    if answer and len(answer.split()) > 2:  # Basic check for meaningful answer
                        self.reasoning_trace.append(f"Answer seems meaningful after step {i+1}, stopping iteration.")
                        break
                    elif i < len(tool_names) - 1:
                        self.reasoning_trace.append(f"Answer incomplete after step {i+1}, proceeding to next tool.")
                    break  # Exit retry loop on success
                except Exception as e:
                    logger.error(f"Tool {tool_name} error on attempt {retries+1}: {e}")
                    self.reasoning_trace.append(f"Tool {tool_name} error on attempt {retries+1}: {e}")
                    retries += 1
                    if retries >= max_retries:
                        self.reasoning_trace.append(f"Max retries reached for {tool_name}, skipping to next tool or defaulting.")
                        if i == len(tool_names) - 1:  # Last tool failed
                            answer = "Unable to answer due to tool failures."
                        break
                    time.sleep(1)  # Brief delay before retry
        self.reasoning_trace.append(f"Tools used: {tool_names}")
        self.reasoning_trace.append(f"Final answer: {answer}")
        return gaia_normalize_answer(answer), self.reasoning_trace

    def answer_question_manual(self, question, file_upload, context_files):
        """Answer a manually input question with optional file and context."""
        try:
            # Handle file upload if provided
            file_name = None
            if file_upload:
                file_name = file_upload.name
                # Simulate GAIA file handling
                file_id = os.path.basename(file_name).split('.')[0]
                local_file = self.download_file(file_id, file_name)
                if local_file:
                    file_type = self.detect_file_type(local_file)
                    file_content = self.analyze_file(local_file, file_type)
                else:
                    file_content = None
            else:
                file_content = None
            # Handle context files if provided
            self.context_files = [f.name for f in context_files] if context_files else []
            # Create a mock question object
            question_obj = {
                "question": question,
                "file_name": file_name if file_name else ""
            }
            answer, trace = self.answer_question(question_obj)
            return answer, "\n".join(trace)
        except Exception as e:
            logger.error(f"Manual question error: {e}")
            return f"Error: {e}", f"Error occurred: {e}"

    def process_batch(self, token):
        """Process a batch of questions with progress updates."""
        try:
            questions = self.fetch_questions(token)
            if not questions:
                return "0/0 questions processed - fetch failed", []
            total = len(questions)
            results = []
            for i, q in enumerate(questions):
                try:
                    answer, trace = self.answer_question(q)
                    results.append({
                        "task_id": q["task_id"],
                        "question": q["question"],
                        "answer": answer,
                        "trace": trace
                    })
                    logger.info(f"Batch progress: {i+1}/{total} questions processed")
                    yield f"{i+1}/{total} questions processed", results
                except Exception as e:
                    logger.error(f"Batch processing error for question {i+1}: {e}")
                    results.append({
                        "task_id": q.get("task_id", "unknown"),
                        "question": q.get("question", "unknown"),
                        "answer": "Error processing",
                        "trace": [str(e)]
                    })
                    yield f"{i+1}/{total} questions processed", results
            logger.info(f"Batch processing complete: {total}/{total} questions processed")
        except Exception as e:
            logger.error(f"Batch processing overall error: {e}")
            yield "Error in batch processing", []

# --- Build Gradio Interface using Blocks (Maintaining Original Architecture) ---
with gr.Blocks() as demo:
    gr.Markdown("# Smart Agent Evaluation Runner")
    gr.Markdown("""
        **Instructions:**
        1. Clone this space, define your agent logic, tools, packages, etc.
        2. Log in to Hugging Face.
        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
    """)

    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])

if __name__ == "__main__":
    print("Launching Gradio Interface for Smart Agent Evaluation...")
    demo.launch(debug=True, share=False)

# Define a wrapper to ensure compatibility
def run_and_submit_all_wrapper(profile: gr.OAuthProfile | None):
    return run_and_submit_all(profile)

# Update run_and_submit_all to use the enhanced ModularGAIAAgent
def run_and_submit_all(profile: gr.OAuthProfile | None):
    space_id = os.getenv("SPACE_ID")
    if profile:
        username = profile.username
        print(f"User logged in: {username}")
    else:
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    agent = ModularGAIAAgent(api_url=DEFAULT_API_URL)
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    results_log = []
    answers_payload = []
    correct_answers = 0

    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or not question_text:
            continue

        submitted_answer, trace = agent.answer_question(item)
        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
        results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})

    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")

    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except Exception as e:
        return f"Submission Failed: {e}", pd.DataFrame(results_log)