import os
from datetime import datetime
from typing import List, Optional, Set
from urllib.parse import unquote, urlparse

import requests

from phi.utils.log import logger
from save_report import save_markdown_report


class FileHandler:
    """Handler for downloading and saving files discovered during web crawling."""

    SUPPORTED_EXTENSIONS = {
        'pdf': 'application/pdf',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'csv': 'text/csv'
    }

    # Common browser headers
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    def __init__(self):
        # Get the report directory for the current date
        self.report_dir, _ = save_markdown_report()
        self.downloaded_files: Set[str] = set()
        self.file_metadata: List[dict] = []
        self.failed_downloads: List[dict] = []  # Track failed downloads

        # Create a subdirectory for downloaded files
        self.downloads_dir = os.path.join(self.report_dir, 'downloads')
        os.makedirs(self.downloads_dir, exist_ok=True)

        # Create a metadata file to track downloaded files
        self.metadata_file = os.path.join(self.downloads_dir, 'files_metadata.md')

    def is_supported_file(self, url: str) -> bool:
        """Check if the URL points to a supported file type."""
        parsed_url = urlparse(url)
        extension = os.path.splitext(parsed_url.path)[1].lower().lstrip('.')
        return extension in self.SUPPORTED_EXTENSIONS

    def get_filename_from_url(self, url: str, content_type: Optional[str] = None) -> str:
        """Generate a safe, unique filename from the URL."""
        # Get the filename from the URL path
        parsed_url = urlparse(url)
        filename = os.path.basename(unquote(parsed_url.path))

        # If no filename in the URL, create one based on content type
        if not filename:
            extension = next(
                (ext for ext, mime in self.SUPPORTED_EXTENSIONS.items() if mime == content_type),
                'unknown'
            )
            filename = f"downloaded_file.{extension}"

        # Keep only filesystem-safe characters
        safe_filename = "".join(c for c in filename if c.isalnum() or c in '._-')
        base, ext = os.path.splitext(safe_filename)

        # Add a numeric suffix if a file with this name already exists
        counter = 1
        while os.path.exists(os.path.join(self.downloads_dir, safe_filename)):
            safe_filename = f"{base}_{counter}{ext}"
            counter += 1

        return safe_filename

    def download_file(self, url: str, source_page: Optional[str] = None) -> Optional[str]:
        """
        Download a file from the URL and save it to the downloads directory.
        Returns the path to the saved file if successful, None otherwise.
        """
        if url in self.downloaded_files:
            logger.info(f"File already downloaded: {url}")
            return None

        try:
            # Use a session so the browser headers persist across redirects;
            # the context manager closes the connection pool when done
            with requests.Session() as session:
                session.headers.update(self.HEADERS)

                # First make a HEAD request to check content type and size
                # (servers that reject HEAD are recorded as failed downloads)
                head_response = session.head(url, timeout=10, allow_redirects=True)
                head_response.raise_for_status()

                content_type = head_response.headers.get('content-type', '').lower().split(';')[0]
                content_length = int(head_response.headers.get('content-length', 0))

                # Check that the content type is supported and the size is reasonable (less than 100MB)
                if not any(mime in content_type for mime in self.SUPPORTED_EXTENSIONS.values()):
                    logger.warning(f"Unsupported content type: {content_type} for URL: {url}")
                    return None

                if content_length > 100 * 1024 * 1024:  # 100MB limit
                    logger.warning(f"File too large ({content_length} bytes) for URL: {url}")
                    return None

                # Make the actual download request
                response = session.get(url, timeout=30, stream=True)
                response.raise_for_status()

                # Generate a safe filename
                filename = self.get_filename_from_url(url, content_type)
                file_path = os.path.join(self.downloads_dir, filename)

                # Save the file in chunks to avoid holding it all in memory
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

                # Record metadata
                metadata = {
                    'filename': filename,
                    'source_url': url,
                    'source_page': source_page,
                    'content_type': content_type,
                    'download_time': datetime.now().isoformat(),
                    'file_size': os.path.getsize(file_path)
                }
                self.file_metadata.append(metadata)

                # Update the metadata file
                self._update_metadata_file()

                self.downloaded_files.add(url)
                logger.info(f"Successfully downloaded: {url} to {file_path}")
                return file_path

        except requests.RequestException as e:
            error_info = {
                'url': url,
                'source_page': source_page,
                'error': str(e),
                'time': datetime.now().isoformat()
            }
            self.failed_downloads.append(error_info)
            self._update_metadata_file()  # Update metadata including failed downloads
            logger.error(f"Error downloading file from {url}: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error while downloading {url}: {str(e)}")
            return None

    def _update_metadata_file(self):
        """Update the metadata markdown file with information about downloaded files."""
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.write("# Downloaded Files Metadata\n\n")

                # Successful downloads
                if self.file_metadata:
                    f.write("## Successfully Downloaded Files\n\n")
                    for metadata in self.file_metadata:
                        f.write(f"### {metadata['filename']}\n")
                        f.write(f"- Source URL: {metadata['source_url']}\n")
                        if metadata['source_page']:
                            f.write(f"- Found on page: {metadata['source_page']}\n")
                        f.write(f"- Content Type: {metadata['content_type']}\n")
                        f.write(f"- Download Time: {metadata['download_time']}\n")
                        f.write(f"- File Size: {metadata['file_size']} bytes\n\n")

                # Failed downloads
                if self.failed_downloads:
                    f.write("## Failed Downloads\n\n")
                    for failed in self.failed_downloads:
                        f.write(f"### {failed['url']}\n")
                        if failed['source_page']:
                            f.write(f"- Found on page: {failed['source_page']}\n")
                        f.write(f"- Error: {failed['error']}\n")
                        f.write(f"- Time: {failed['time']}\n\n")
        except Exception as e:
            logger.error(f"Error updating metadata file: {str(e)}")

    def get_downloaded_files(self) -> List[str]:
        """Return a list of all downloaded file paths."""
        return [os.path.join(self.downloads_dir, f)
                for f in os.listdir(self.downloads_dir)
                if os.path.isfile(os.path.join(self.downloads_dir, f))]