#!/usr/bin/env python3
"""Enhanced GAIA Tools - Complete Tool Arsenal.

Additional specialized tools for 100% GAIA benchmark compliance.
"""
import logging
import os
import tempfile
from typing import Any, Dict, List, Optional

import requests

# Module-level logger; handlers/level are configured by the application.
logger = logging.getLogger(__name__)
class EnhancedGAIATools:
    """Complete toolkit of file, document and web helpers for GAIA tasks.

    All methods return human-readable strings (either the extracted content
    or an ``ERROR: ...`` message) instead of raising, so callers can surface
    the result directly.  Optional third-party dependencies (pandas,
    docx2txt, PyPDF2, playwright, bs4) are imported lazily inside each
    method so the toolkit degrades gracefully when some are missing.
    """

    def __init__(self, hf_token: str = None, openai_key: str = None):
        """Store API credentials, falling back to environment variables.

        Args:
            hf_token: Hugging Face token; defaults to the HF_TOKEN env var.
            openai_key: OpenAI API key; defaults to OPENAI_API_KEY env var.
        """
        self.hf_token = hf_token or os.getenv('HF_TOKEN')
        self.openai_key = openai_key or os.getenv('OPENAI_API_KEY')

    # === ENHANCED DOCUMENT PROCESSING ===

    def read_docx(self, file_path: str) -> str:
        """Read a Microsoft Word (.docx) document and return its text."""
        try:
            import docx2txt
            text = docx2txt.process(file_path)
            logger.info(f"DOCX read: {len(text)} characters")
            return text
        except ImportError:
            # BUG FIX: the original message told users to install
            # python-docx, but the code actually depends on docx2txt.
            logger.warning("docx2txt not available. Install docx2txt.")
            return "ERROR: DOCX reading unavailable. Install docx2txt."
        except Exception as e:
            logger.error(f"DOCX reading error: {e}")
            return f"ERROR: DOCX reading failed: {e}"

    def read_excel(self, file_path: str, sheet_name: str = None) -> str:
        """Read an Excel workbook into a readable table summary.

        Args:
            file_path: Path to the .xlsx/.xls file.
            sheet_name: Optional sheet to read; first sheet when omitted.
        """
        try:
            import pandas as pd
            if sheet_name:
                df = pd.read_excel(file_path, sheet_name=sheet_name)
            else:
                df = pd.read_excel(file_path)
            # Cap the rendered table so huge sheets stay readable.
            result = f"Excel data ({df.shape[0]} rows, {df.shape[1]} columns):\n"
            result += df.to_string(max_rows=50, max_cols=10)
            logger.info(f"Excel read: {df.shape}")
            return result
        except ImportError:
            logger.warning("pandas not available for Excel reading.")
            return "ERROR: Excel reading unavailable. Install pandas and openpyxl."
        except Exception as e:
            logger.error(f"Excel reading error: {e}")
            return f"ERROR: Excel reading failed: {e}"

    def read_csv(self, file_path: str) -> str:
        """Read a CSV file and return its first rows as a readable table."""
        try:
            import pandas as pd
            df = pd.read_csv(file_path)
            result = f"CSV data ({df.shape[0]} rows, {df.shape[1]} columns):\n"
            result += df.head(20).to_string()
            if df.shape[0] > 20:
                result += f"\n... (showing first 20 of {df.shape[0]} rows)"
            logger.info(f"CSV read: {df.shape}")
            return result
        except ImportError:
            logger.warning("pandas not available for CSV reading.")
            return "ERROR: CSV reading unavailable. Install pandas."
        except Exception as e:
            logger.error(f"CSV reading error: {e}")
            return f"ERROR: CSV reading failed: {e}"

    def read_text_file(self, file_path: str, encoding: str = 'utf-8') -> str:
        """Read a plain-text file, falling back through common encodings.

        BUG FIX: the original ignored the ``encoding`` argument and always
        tried UTF-8 first; the requested encoding is now tried first.

        Returns:
            Up to 10,000 characters of content (truncated with '...'),
            or an ``ERROR: ...`` string on failure.
        """
        try:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except (UnicodeDecodeError, LookupError):
                # Fall back through common encodings, skipping the one we
                # already tried.
                fallbacks = ['utf-8', 'latin-1', 'cp1252', 'ascii']
                content = None
                for enc in fallbacks:
                    if enc == encoding:
                        continue
                    try:
                        with open(file_path, 'r', encoding=enc) as f:
                            content = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                if content is None:
                    return "ERROR: Unable to decode text file with common encodings"
            logger.info(f"Text file read: {len(content)} characters")
            return content[:10000] + ("..." if len(content) > 10000 else "")
        except Exception as e:
            logger.error(f"Text file reading error: {e}")
            return f"ERROR: Text file reading failed: {e}"

    def extract_archive(self, file_path: str) -> str:
        """Extract a ZIP archive next to the file and list its contents.

        Only .zip is supported; other formats return an error string.
        """
        try:
            import zipfile
            if file_path.endswith('.zip'):
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    file_list = zip_ref.namelist()
                    extract_dir = os.path.join(os.path.dirname(file_path), 'extracted')
                    os.makedirs(extract_dir, exist_ok=True)
                    zip_ref.extractall(extract_dir)
                result = f"ZIP archive extracted to {extract_dir}\n"
                result += f"Contents ({len(file_list)} files):\n"
                result += "\n".join(file_list[:20])
                if len(file_list) > 20:
                    result += f"\n... (showing first 20 of {len(file_list)} files)"
                logger.info(f"ZIP extracted: {len(file_list)} files")
                return result
            else:
                return f"ERROR: Unsupported archive format: {file_path}"
        except Exception as e:
            logger.error(f"Archive extraction error: {e}")
            return f"ERROR: Archive extraction failed: {e}"

    # === ENHANCED WEB BROWSING ===

    def browse_with_js(self, url: str) -> str:
        """Fetch a page with JavaScript rendering via Playwright.

        Falls back to a plain requests fetch when Playwright is missing or
        the rendered fetch fails for any reason.
        """
        try:
            from playwright.sync_api import sync_playwright
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()
                page.goto(url, timeout=15000)
                page.wait_for_timeout(2000)  # let client-side JS settle
                content = page.content()
                browser.close()
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            # Drop non-content nodes before extracting text.
            for script in soup(["script", "style"]):
                script.decompose()
            text = soup.get_text()
            # Collapse whitespace into single spaces.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            clean_text = ' '.join(chunk for chunk in chunks if chunk)
            logger.info(f"JS-enabled browsing: {url} - {len(clean_text)} chars")
            return clean_text[:5000] + ("..." if len(clean_text) > 5000 else "")
        except ImportError:
            logger.info("Playwright not available, using requests fallback")
            return self._fallback_browse(url)
        except Exception as e:
            logger.warning(f"JS browsing failed: {e}, falling back to basic")
            return self._fallback_browse(url)

    def _fallback_browse(self, url: str) -> str:
        """Fetch a page with plain requests and strip it down to text."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }
            response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
            response.raise_for_status()
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            for script in soup(["script", "style"]):
                script.decompose()
            text = soup.get_text()
            # Collapse whitespace into single spaces.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            clean_text = ' '.join(chunk for chunk in chunks if chunk)
            logger.info(f"Basic browsing: {url} - {len(clean_text)} chars")
            return clean_text[:5000] + ("..." if len(clean_text) > 5000 else "")
        except Exception as e:
            logger.error(f"Web browsing error: {e}")
            return f"ERROR: Web browsing failed: {e}"

    # === ENHANCED GAIA FILE HANDLING ===

    def download_gaia_file(self, task_id: str, file_name: str = None) -> str:
        """Download a GAIA task file and process it by type.

        Args:
            task_id: GAIA task identifier used in the download URL.
            file_name: Optional explicit filename; otherwise derived from
                the Content-Disposition header or the content type.
        """
        try:
            api_base = "https://agents-course-unit4-scoring.hf.space"
            file_url = f"{api_base}/files/{task_id}"
            logger.info(f"Downloading GAIA file for task: {task_id}")
            headers = {
                'User-Agent': 'GAIA-Agent/1.0 (Enhanced)',
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate',
            }
            response = requests.get(file_url, headers=headers, timeout=30, stream=True)
            if response.status_code == 200:
                content_type = response.headers.get('content-type', '')
                content_disposition = response.headers.get('content-disposition', '')
                if file_name:
                    filename = file_name
                elif 'filename=' in content_disposition:
                    # Take only the filename token; trailing ';'-separated
                    # parameters and surrounding quotes are stripped.
                    filename = (content_disposition.split('filename=')[1]
                                .split(';')[0].strip().strip('"\''))
                else:
                    # Guess an extension from the content type.
                    # BUG FIX: legacy Office types now map to their real
                    # extensions (.xls / .doc) instead of the OOXML ones.
                    extension_map = {
                        'image/jpeg': '.jpg',
                        'image/png': '.png',
                        'image/gif': '.gif',
                        'application/pdf': '.pdf',
                        'text/plain': '.txt',
                        'application/json': '.json',
                        'text/csv': '.csv',
                        'application/vnd.ms-excel': '.xls',
                        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
                        'application/msword': '.doc',
                        'video/mp4': '.mp4',
                        'audio/mpeg': '.mp3',
                        'audio/wav': '.wav',
                        'application/zip': '.zip',
                    }
                    extension = extension_map.get(content_type, '.tmp')
                    filename = f"gaia_file_{task_id}{extension}"
                # Stream the payload to a temp file in 8 KiB chunks.
                temp_dir = tempfile.gettempdir()
                filepath = os.path.join(temp_dir, filename)
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                file_size = os.path.getsize(filepath)
                logger.info(f"GAIA file downloaded: {filepath} ({file_size} bytes)")
                # Hand off to type-specific processing.
                return self.process_downloaded_file(filepath, task_id)
            else:
                error_msg = f"ERROR: GAIA file download failed: HTTP {response.status_code}"
                logger.error(error_msg)
                return error_msg
        except Exception as e:
            error_msg = f"ERROR: GAIA file download error: {e}"
            logger.error(error_msg)
            return error_msg

    def process_downloaded_file(self, filepath: str, task_id: str) -> str:
        """Dispatch a downloaded GAIA file to the right reader by extension.

        Media files (image/video/audio) are not decoded here; their paths
        are returned for downstream analysis tools.
        """
        try:
            filename = os.path.basename(filepath)
            file_ext = os.path.splitext(filename)[1].lower()
            # BUG FIX: the original logged/returned the literal "(unknown)"
            # instead of the computed filename.
            logger.info(f"Processing GAIA file: {filename} (type: {file_ext})")
            result = f"GAIA File: {filename} (Task: {task_id})\n\n"
            if file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
                result += f"Image file ready for analysis: {filepath}\n"
                result += f"File type: {file_ext}, Path: {filepath}"
            elif file_ext == '.pdf':
                pdf_content = self.read_pdf(filepath)
                result += f"PDF Content:\n{pdf_content}\n"
            elif file_ext in ['.txt', '.md', '.py', '.js', '.html', '.css']:
                text_content = self.read_text_file(filepath)
                result += f"Text Content:\n{text_content}\n"
            elif file_ext in ['.csv']:
                csv_content = self.read_csv(filepath)
                result += f"CSV Data:\n{csv_content}\n"
            elif file_ext in ['.xlsx', '.xls']:
                excel_content = self.read_excel(filepath)
                result += f"Excel Data:\n{excel_content}\n"
            elif file_ext in ['.docx']:
                docx_content = self.read_docx(filepath)
                result += f"Word Document:\n{docx_content}\n"
            elif file_ext in ['.mp4', '.avi', '.mov', '.wmv']:
                result += f"Video file ready for analysis: {filepath}\n"
                result += f"File type: {file_ext}, Path: {filepath}"
            elif file_ext in ['.mp3', '.wav', '.m4a', '.flac']:
                result += f"Audio file ready for analysis: {filepath}\n"
                result += f"File type: {file_ext}, Path: {filepath}"
            elif file_ext in ['.zip', '.rar']:
                archive_result = self.extract_archive(filepath)
                result += f"Archive Contents:\n{archive_result}\n"
            elif file_ext in ['.json']:
                try:
                    import json
                    with open(filepath, 'r') as f:
                        json_data = json.load(f)
                    result += f"JSON Data:\n{json.dumps(json_data, indent=2)[:2000]}\n"
                except Exception as e:
                    result += f"ERROR: JSON parsing error: {e}\n"
            else:
                # Unknown extension: best-effort read as text.
                try:
                    text_content = self.read_text_file(filepath)
                    result += f"Raw Content:\n{text_content}\n"
                except Exception:
                    result += f"ERROR: Unsupported file type: {file_ext}\n"
            file_size = os.path.getsize(filepath)
            result += f"\nFile Info: {file_size} bytes, Path: {filepath}"
            return result
        except Exception as e:
            error_msg = f"ERROR: File processing error: {e}"
            logger.error(error_msg)
            return error_msg

    def read_pdf(self, file_path: str) -> str:
        """Read a PDF with PyPDF2, continuing past pages that fail to parse."""
        try:
            import PyPDF2
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        text += page_text + "\n"
                    except Exception as e:
                        # Keep going: one bad page shouldn't lose the rest.
                        text += f"[Page {page_num + 1} extraction failed: {e}]\n"
                logger.info(f"PDF read: {len(pdf_reader.pages)} pages, {len(text)} chars")
                return text
        except ImportError:
            return "ERROR: PDF reading unavailable. Install PyPDF2."
        except Exception as e:
            logger.error(f"PDF reading error: {e}")
            return f"ERROR: PDF reading failed: {e}"

    # === UTILITY METHODS ===

    def get_available_tools(self) -> List[str]:
        """Return the names of all enhanced tools this class exposes."""
        return [
            "read_docx", "read_excel", "read_csv", "read_text_file", "extract_archive",
            "browse_with_js", "download_gaia_file", "process_downloaded_file",
            "read_pdf"
        ]

    def tool_description(self, tool_name: str) -> str:
        """Return a one-line description for *tool_name*, or an error string."""
        descriptions = {
            "read_docx": "Read Microsoft Word documents (.docx)",
            "read_excel": "Read Excel spreadsheets (.xlsx, .xls)",
            "read_csv": "Read CSV files with pandas",
            "read_text_file": "Read text files with encoding detection",
            "extract_archive": "Extract ZIP archives and list contents",
            "browse_with_js": "Enhanced web browsing with JavaScript support",
            "download_gaia_file": "Download GAIA benchmark files via API",
            "process_downloaded_file": "Automatically process files by type",
            "read_pdf": "Read PDF documents with PyPDF2",
        }
        return descriptions.get(tool_name, f"ERROR: Unknown tool: {tool_name}")
# Test function | |
def test_enhanced_tools():
    """Smoke test: instantiate the toolkit and list every tool."""
    print("Testing Enhanced GAIA Tools")
    tools = EnhancedGAIATools()
    print("\nAvailable tools:")
    for tool in tools.get_available_tools():
        print(f"  - {tool}: {tools.tool_description(tool)}")
    print("\nEnhanced tools ready for GAIA benchmark!")


if __name__ == "__main__":
    test_enhanced_tools()