File size: 18,930 Bytes
26eff0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
#!/usr/bin/env python3
"""
🚀 Enhanced GAIA Tools - Complete Tool Arsenal
Additional specialized tools for 100% GAIA benchmark compliance
"""

import os
import logging
import tempfile
import requests
from typing import Dict, Any, List, Optional

logger = logging.getLogger(__name__)

class EnhancedGAIATools:
    """πŸ› οΈ Complete toolkit for GAIA benchmark excellence"""
    
    def __init__(self, hf_token: Optional[str] = None, openai_key: Optional[str] = None):
        """Store the API credentials used by the toolkit.

        Args:
            hf_token: Hugging Face token. When falsy (None or empty), the
                HF_TOKEN environment variable is used instead.
            openai_key: OpenAI API key. When falsy (None or empty), the
                OPENAI_API_KEY environment variable is used instead.
        """
        # Explicit arguments win; os.getenv yields None when the variable is unset.
        self.hf_token = hf_token or os.getenv('HF_TOKEN')
        self.openai_key = openai_key or os.getenv('OPENAI_API_KEY')
        
    # === ENHANCED DOCUMENT PROCESSING ===
    
    def read_docx(self, file_path: str) -> str:
        """πŸ“„ Read Microsoft Word documents"""
        try:
            import docx2txt
            text = docx2txt.process(file_path)
            logger.info(f"πŸ“„ DOCX read: {len(text)} characters")
            return text
        except ImportError:
            logger.warning("⚠️ docx2txt not available. Install python-docx.")
            return "❌ DOCX reading unavailable. Install python-docx."
        except Exception as e:
            logger.error(f"❌ DOCX reading error: {e}")
            return f"❌ DOCX reading failed: {e}"

    def read_excel(self, file_path: str, sheet_name: str = None) -> str:
        """πŸ“Š Read Excel spreadsheets"""
        try:
            import pandas as pd
            if sheet_name:
                df = pd.read_excel(file_path, sheet_name=sheet_name)
            else:
                df = pd.read_excel(file_path)
            
            # Convert to readable format
            result = f"Excel data ({df.shape[0]} rows, {df.shape[1]} columns):\n"
            result += df.to_string(max_rows=50, max_cols=10)
            
            logger.info(f"πŸ“Š Excel read: {df.shape}")
            return result
        except ImportError:
            logger.warning("⚠️ pandas not available for Excel reading.")
            return "❌ Excel reading unavailable. Install pandas and openpyxl."
        except Exception as e:
            logger.error(f"❌ Excel reading error: {e}")
            return f"❌ Excel reading failed: {e}"

    def read_csv(self, file_path: str) -> str:
        """πŸ“‹ Read CSV files"""
        try:
            import pandas as pd
            df = pd.read_csv(file_path)
            
            # Convert to readable format
            result = f"CSV data ({df.shape[0]} rows, {df.shape[1]} columns):\n"
            result += df.head(20).to_string()
            
            if df.shape[0] > 20:
                result += f"\n... (showing first 20 of {df.shape[0]} rows)"
            
            logger.info(f"πŸ“‹ CSV read: {df.shape}")
            return result
        except ImportError:
            logger.warning("⚠️ pandas not available for CSV reading.")
            return "❌ CSV reading unavailable. Install pandas."
        except Exception as e:
            logger.error(f"❌ CSV reading error: {e}")
            return f"❌ CSV reading failed: {e}"

    def read_text_file(self, file_path: str, encoding: str = 'utf-8') -> str:
        """πŸ“ Read plain text files with encoding detection"""
        try:
            # Try UTF-8 first
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Try other common encodings
                encodings = ['latin-1', 'cp1252', 'ascii']
                content = None
                for enc in encodings:
                    try:
                        with open(file_path, 'r', encoding=enc) as f:
                            content = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                
                if content is None:
                    return "❌ Unable to decode text file with common encodings"
            
            logger.info(f"πŸ“ Text file read: {len(content)} characters")
            return content[:10000] + ("..." if len(content) > 10000 else "")
        except Exception as e:
            logger.error(f"❌ Text file reading error: {e}")
            return f"❌ Text file reading failed: {e}"

    def extract_archive(self, file_path: str) -> str:
        """πŸ“¦ Extract and list archive contents (ZIP, RAR, etc.)"""
        try:
            import zipfile
            import os
            
            if file_path.endswith('.zip'):
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    file_list = zip_ref.namelist()
                    extract_dir = os.path.join(os.path.dirname(file_path), 'extracted')
                    os.makedirs(extract_dir, exist_ok=True)
                    zip_ref.extractall(extract_dir)
                    
                    result = f"πŸ“¦ ZIP archive extracted to {extract_dir}\n"
                    result += f"Contents ({len(file_list)} files):\n"
                    result += "\n".join(file_list[:20])
                    
                    if len(file_list) > 20:
                        result += f"\n... (showing first 20 of {len(file_list)} files)"
                    
                    logger.info(f"πŸ“¦ ZIP extracted: {len(file_list)} files")
                    return result
            else:
                return f"❌ Unsupported archive format: {file_path}"
        except Exception as e:
            logger.error(f"❌ Archive extraction error: {e}")
            return f"❌ Archive extraction failed: {e}"

    # === ENHANCED WEB BROWSING ===
    
    def browse_with_js(self, url: str) -> str:
        """Fetch a page with a headless Chromium browser and return its text.

        The page is loaded through Playwright, given two seconds for scripts
        to run, then reduced to visible text. If Playwright (or BeautifulSoup)
        cannot be imported, or anything else goes wrong, the request is
        retried through ``_fallback_browse``.

        Args:
            url: Address of the page to load.

        Returns:
            The page text, truncated to 5000 characters with a trailing "..."
            when longer, or whatever ``_fallback_browse`` returns.
        """
        try:
            from playwright.sync_api import sync_playwright

            with sync_playwright() as pw:
                chromium = pw.chromium.launch(headless=True)
                tab = chromium.new_page()
                tab.goto(url, timeout=15000)
                tab.wait_for_timeout(2000)  # give client-side scripts time to render
                html = tab.content()
                chromium.close()

                from bs4 import BeautifulSoup
                soup = BeautifulSoup(html, 'html.parser')

                # Script and style bodies are not visible page text.
                for tag in soup(["script", "style"]):
                    tag.decompose()

                # Collapse the text: trim each line, split on double spaces,
                # and keep only the non-empty fragments.
                fragments = []
                for raw_line in soup.get_text().splitlines():
                    for piece in raw_line.strip().split("  "):
                        piece = piece.strip()
                        if piece:
                            fragments.append(piece)
                clean_text = ' '.join(fragments)

                logger.info(f"🌐 JS-enabled browsing: {url} - {len(clean_text)} chars")
                suffix = "..." if len(clean_text) > 5000 else ""
                return clean_text[:5000] + suffix

        except ImportError:
            logger.info("⚠️ Playwright not available, using requests fallback")
            return self._fallback_browse(url)
        except Exception as err:
            logger.warning(f"⚠️ JS browsing failed: {err}, falling back to basic")
            return self._fallback_browse(url)

    def _fallback_browse(self, url: str) -> str:
        """Fetch a page with ``requests`` and return its visible text.

        Args:
            url: Address of the page to load.

        Returns:
            The page text, truncated to 5000 characters with a trailing "..."
            when longer, or an error string when the request or parsing fails.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
            response.raise_for_status()

            from bs4 import BeautifulSoup

            # Parse the raw bytes instead of response.text: when the server
            # sends text/html without a charset, requests decodes as
            # ISO-8859-1 and UTF-8 pages come out garbled. BeautifulSoup can
            # detect the encoding itself (including <meta charset>); a charset
            # explicitly declared in the HTTP header is passed as a hint.
            content_type = response.headers.get('content-type', '')
            declared = response.encoding if 'charset=' in content_type.lower() else None
            soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared)

            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()

            text = soup.get_text()
            # Clean up whitespace: trim lines, split on double spaces, drop blanks
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            clean_text = ' '.join(chunk for chunk in chunks if chunk)

            logger.info(f"🌐 Basic browsing: {url} - {len(clean_text)} chars")
            return clean_text[:5000] + ("..." if len(clean_text) > 5000 else "")

        except Exception as e:
            logger.error(f"❌ Web browsing error: {e}")
            return f"❌ Web browsing failed: {e}"

    # === ENHANCED GAIA FILE HANDLING ===
    
    def download_gaia_file(self, task_id: str, file_name: Optional[str] = None) -> str:
        """Download the file attached to a GAIA task and process it.

        The file is fetched from the scoring API, written to the system
        temporary directory, and handed to ``process_downloaded_file``.

        Args:
            task_id: GAIA task identifier; appended to the ``/files/`` URL.
            file_name: Optional name to save the file under. When omitted, the
                name is taken from the Content-Disposition header or, failing
                that, generated from the task id and the Content-Type.

        Returns:
            The report from ``process_downloaded_file`` on success, otherwise
            an error string (non-200 status or any exception).
        """
        try:
            import os
            import tempfile
            from email.message import Message

            # GAIA API endpoint for file downloads
            api_base = "https://agents-course-unit4-scoring.hf.space"
            file_url = f"{api_base}/files/{task_id}"

            logger.info(f"πŸ“₯ Downloading GAIA file for task: {task_id}")

            headers = {
                'User-Agent': 'GAIA-Agent/1.0 (Enhanced)',
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate',
            }

            # The context manager closes the streamed response on every path,
            # including the non-200 early return.
            with requests.get(file_url, headers=headers, timeout=30, stream=True) as response:
                if response.status_code != 200:
                    error_msg = f"❌ GAIA file download failed: HTTP {response.status_code}"
                    logger.error(error_msg)
                    return error_msg

                # Keep only the media type: parameters such as
                # "; charset=utf-8" would otherwise defeat the lookup below.
                content_type = response.headers.get('content-type', '')
                content_type = content_type.split(';')[0].strip().lower()
                content_disposition = response.headers.get('content-disposition', '')

                filename = file_name
                if not filename and content_disposition:
                    # Let the stdlib header parser handle quoting, trailing
                    # parameters and RFC 2231 encoded names.
                    parsed = Message()
                    parsed['content-disposition'] = content_disposition
                    filename = parsed.get_filename()

                # Both name sources are untrusted: keep only the final path
                # component so the file cannot be written outside temp_dir.
                if filename:
                    filename = os.path.basename(filename)

                if not filename or filename in ('.', '..'):
                    # Guess extension from content type
                    extension_map = {
                        'image/jpeg': '.jpg',
                        'image/png': '.png',
                        'image/gif': '.gif',
                        'application/pdf': '.pdf',
                        'text/plain': '.txt',
                        'application/json': '.json',
                        'text/csv': '.csv',
                        'application/vnd.ms-excel': '.xlsx',
                        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
                        'application/msword': '.docx',
                        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
                        'video/mp4': '.mp4',
                        'audio/mpeg': '.mp3',
                        'audio/wav': '.wav',
                        'application/zip': '.zip',
                    }
                    extension = extension_map.get(content_type, '.tmp')
                    filename = os.path.basename(f"gaia_file_{task_id}{extension}")

                # Save file
                temp_dir = tempfile.gettempdir()
                filepath = os.path.join(temp_dir, filename)

                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            file_size = os.path.getsize(filepath)
            logger.info(f"πŸ“₯ GAIA file downloaded: {filepath} ({file_size} bytes)")

            # Automatically process based on file type
            return self.process_downloaded_file(filepath, task_id)

        except Exception as e:
            error_msg = f"❌ GAIA file download error: {e}"
            logger.error(error_msg)
            return error_msg

    def process_downloaded_file(self, filepath: str, task_id: str) -> str:
        """πŸ“‹ Process downloaded GAIA files based on their type"""
        try:
            import os
            filename = os.path.basename(filepath)
            file_ext = os.path.splitext(filename)[1].lower()
            
            logger.info(f"πŸ“‹ Processing GAIA file: {filename} (type: {file_ext})")
            
            result = f"πŸ“ GAIA File: {filename} (Task: {task_id})\n\n"
            
            # Process based on file type
            if file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
                # Image file - return file path for image analysis
                result += f"πŸ–ΌοΈ Image file ready for analysis: {filepath}\n"
                result += f"File type: {file_ext}, Path: {filepath}"
                
            elif file_ext == '.pdf':
                # PDF document
                pdf_content = self.read_pdf(filepath)
                result += f"πŸ“„ PDF Content:\n{pdf_content}\n"
                
            elif file_ext in ['.txt', '.md', '.py', '.js', '.html', '.css']:
                # Text files
                text_content = self.read_text_file(filepath)
                result += f"πŸ“ Text Content:\n{text_content}\n"
                
            elif file_ext in ['.csv']:
                # CSV files
                csv_content = self.read_csv(filepath)
                result += f"πŸ“Š CSV Data:\n{csv_content}\n"
                
            elif file_ext in ['.xlsx', '.xls']:
                # Excel files
                excel_content = self.read_excel(filepath)
                result += f"πŸ“ˆ Excel Data:\n{excel_content}\n"
                
            elif file_ext in ['.docx']:
                # Word documents
                docx_content = self.read_docx(filepath)
                result += f"πŸ“„ Word Document:\n{docx_content}\n"
                
            elif file_ext in ['.mp4', '.avi', '.mov', '.wmv']:
                # Video files - return path for video analysis
                result += f"πŸŽ₯ Video file ready for analysis: {filepath}\n"
                result += f"File type: {file_ext}, Path: {filepath}"
                
            elif file_ext in ['.mp3', '.wav', '.m4a', '.flac']:
                # Audio files - return path for audio analysis
                result += f"🎡 Audio file ready for analysis: {filepath}\n"
                result += f"File type: {file_ext}, Path: {filepath}"
                
            elif file_ext in ['.zip', '.rar']:
                # Archive files
                archive_result = self.extract_archive(filepath)
                result += f"πŸ“¦ Archive Contents:\n{archive_result}\n"
                
            elif file_ext in ['.json']:
                # JSON files
                try:
                    import json
                    with open(filepath, 'r') as f:
                        json_data = json.load(f)
                    result += f"πŸ“‹ JSON Data:\n{json.dumps(json_data, indent=2)[:2000]}\n"
                except Exception as e:
                    result += f"❌ JSON parsing error: {e}\n"
                    
            else:
                # Unknown file type - try as text
                try:
                    text_content = self.read_text_file(filepath)
                    result += f"πŸ“„ Raw Content:\n{text_content}\n"
                except:
                    result += f"❌ Unsupported file type: {file_ext}\n"
            
            # Add file metadata
            file_size = os.path.getsize(filepath)
            result += f"\nπŸ“Š File Info: {file_size} bytes, Path: {filepath}"
            
            return result
            
        except Exception as e:
            error_msg = f"❌ File processing error: {e}"
            logger.error(error_msg)
            return error_msg

    def read_pdf(self, file_path: str) -> str:
        """πŸ“„ Read PDF with fallback to raw text"""
        try:
            import PyPDF2
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        text += page_text + "\n"
                    except Exception as e:
                        text += f"[Page {page_num + 1} extraction failed: {e}]\n"
                
                logger.info(f"πŸ“„ PDF read: {len(pdf_reader.pages)} pages, {len(text)} chars")
                return text
        except ImportError:
            return "❌ PDF reading unavailable. Install PyPDF2."
        except Exception as e:
            logger.error(f"❌ PDF reading error: {e}")
            return f"❌ PDF reading failed: {e}"

    # === UTILITY METHODS ===
    
    def get_available_tools(self) -> List[str]:
        """πŸ“‹ List all available enhanced tools"""
        return [
            "read_docx", "read_excel", "read_csv", "read_text_file", "extract_archive",
            "browse_with_js", "download_gaia_file", "process_downloaded_file", 
            "read_pdf"
        ]

    def tool_description(self, tool_name: str) -> str:
        """πŸ“– Get description of a specific tool"""
        descriptions = {
            "read_docx": "πŸ“„ Read Microsoft Word documents (.docx)",
            "read_excel": "πŸ“Š Read Excel spreadsheets (.xlsx, .xls)",
            "read_csv": "πŸ“‹ Read CSV files with pandas",
            "read_text_file": "πŸ“ Read text files with encoding detection",
            "extract_archive": "πŸ“¦ Extract ZIP archives and list contents",
            "browse_with_js": "🌐 Enhanced web browsing with JavaScript support",
            "download_gaia_file": "πŸ“₯ Download GAIA benchmark files via API",
            "process_downloaded_file": "πŸ“‹ Automatically process files by type",
            "read_pdf": "πŸ“„ Read PDF documents with PyPDF2",
        }
        return descriptions.get(tool_name, f"❓ Unknown tool: {tool_name}")

# Test function
def test_enhanced_tools():
    """Smoke-test the toolkit by printing each tool with its description."""
    print("πŸ§ͺ Testing Enhanced GAIA Tools")

    toolkit = EnhancedGAIATools()

    print("\nπŸ“‹ Available tools:")
    for name in toolkit.get_available_tools():
        summary = toolkit.tool_description(name)
        print(f"  - {name}: {summary}")

    print("\nβœ… Enhanced tools ready for GAIA benchmark!")

# Run the smoke test only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_enhanced_tools()