import os
from contextlib import suppress
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Set
from urllib.parse import urlparse, unquote

import requests
from phi.utils.log import logger

from save_report import save_markdown_report


class FileHandler:
    """Download and save files discovered during web crawling.

    Files are written to a ``downloads`` directory inside the report
    directory obtained from ``save_markdown_report()``. A markdown summary of
    successful and failed downloads is kept in ``files_metadata.md`` inside
    that same directory and is rewritten after every attempt.
    """

    # Supported file extension -> MIME type expected in the Content-Type header.
    SUPPORTED_EXTENSIONS = {
        'pdf': 'application/pdf',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'csv': 'text/csv',
    }

    # Largest file, in bytes, that download_file() will keep on disk.
    MAX_FILE_SIZE = 100 * 1024 * 1024

    # Browser-like request headers. 'Accept-Encoding' is deliberately NOT set
    # here: requests advertises only the encodings it can actually decode, so
    # forcing 'br' without a brotli decoder installed could leave compressed
    # bytes in the saved file.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    def __init__(self):
        """Set up the downloads directory and the in-memory tracking state."""
        # save_markdown_report() returns a pair; only its first item is used
        # here, as the directory under which downloads are stored.
        self.report_dir, _ = save_markdown_report()

        self.downloaded_files: Set[str] = set()    # URLs fetched successfully
        self.file_metadata: List[dict] = []        # one dict per saved file
        self.failed_downloads: List[dict] = []     # one dict per failed attempt

        self.downloads_dir = os.path.join(self.report_dir, 'downloads')
        os.makedirs(self.downloads_dir, exist_ok=True)

        self.metadata_file = os.path.join(self.downloads_dir, 'files_metadata.md')

    def is_supported_file(self, url: str) -> bool:
        """Return True if the URL path ends in a supported extension.

        Only the path component is inspected (query string and fragment are
        ignored) and the comparison is case-insensitive.
        """
        path = urlparse(url).path
        extension = os.path.splitext(path)[1].lower().lstrip('.')
        return extension in self.SUPPORTED_EXTENSIONS

    def get_filename_from_url(self, url: str, content_type: Optional[str] = None) -> str:
        """Build a safe filename for ``url`` that is unused in the downloads dir.

        Args:
            url: URL the file is downloaded from.
            content_type: MIME type reported by the server, if known. It is
                used to pick an extension when the URL does not provide one.

        Returns:
            A filename made only of alphanumerics and ``._-`` that neither
            exists in ``self.downloads_dir`` nor clashes with the metadata
            report file.
        """
        raw_name = os.path.basename(unquote(urlparse(url).path))

        # Extension implied by the content type, or None when it is unknown.
        type_extension = next(
            (ext for ext, mime in self.SUPPORTED_EXTENSIONS.items()
             if mime == content_type),
            None,
        )

        safe_filename = "".join(c for c in raw_name if c.isalnum() or c in '._-')
        # Leading dots would produce hidden files or the special names
        # "." / "..", so they are dropped.
        safe_filename = safe_filename.lstrip('.')

        if not safe_filename:
            safe_filename = f"downloaded_file.{type_extension or 'unknown'}"

        base, ext = os.path.splitext(safe_filename)
        if not ext and type_extension:
            # e.g. ".../download?id=5" served as application/pdf -> download.pdf
            ext = f".{type_extension}"
            safe_filename = f"{base}{ext}"

        # Never hand out the metadata report's own name: the report is
        # rewritten after each download and would overwrite the file.
        reserved_name = os.path.basename(self.metadata_file)

        counter = 1
        while (safe_filename == reserved_name
               or os.path.exists(os.path.join(self.downloads_dir, safe_filename))):
            safe_filename = f"{base}_{counter}{ext}"
            counter += 1

        return safe_filename

    @staticmethod
    def _parse_content_length(value) -> int:
        """Return a Content-Length header value as an int, or 0 if unusable."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def _stream_to_file(self, response, file_path: str) -> bool:
        """Write the streamed response body to ``file_path``.

        Returns True when the whole body was written. Returns False when the
        body exceeded ``MAX_FILE_SIZE``. In that case, and whenever an error
        interrupts the transfer, the partial file is removed; errors are
        re-raised after the cleanup.
        """
        written = 0
        within_limit = True
        try:
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    written += len(chunk)
                    if written > self.MAX_FILE_SIZE:
                        within_limit = False
                        break
                    f.write(chunk)
        except Exception:
            # The file is already closed here; do not leave a truncated file.
            with suppress(OSError):
                os.remove(file_path)
            raise

        if not within_limit:
            with suppress(OSError):
                os.remove(file_path)
        return within_limit

    def _record_failure(self, url: str, source_page: Optional[str], error: Exception) -> None:
        """Remember a failed download and refresh the metadata report."""
        self.failed_downloads.append({
            'url': url,
            'source_page': source_page,
            'error': str(error),
            'time': datetime.now().isoformat(),
        })
        self._update_metadata_file()

    def download_file(self, url: str, source_page: Optional[str] = None) -> Optional[str]:
        """Download ``url`` into the downloads directory.

        Args:
            url: Address of the file to fetch.
            source_page: Page on which the link was found, if known; it is
                only recorded in the metadata report.

        Returns:
            The path of the saved file, or None when the URL was already
            downloaded, the content type is unsupported, the file is larger
            than ``MAX_FILE_SIZE``, or an error occurred. Errors are logged
            and recorded in ``failed_downloads`` rather than raised.
        """
        if url in self.downloaded_files:
            logger.info(f"File already downloaded: {url}")
            return None

        try:
            # Session and streamed response are context-managed so their
            # connections are released even when a step fails.
            with requests.Session() as session:
                session.headers.update(self.HEADERS)

                # Probe with HEAD first so unsupported or oversized files are
                # rejected before any body is transferred.
                head_response = session.head(url, timeout=10, allow_redirects=True)
                head_response.raise_for_status()

                content_type = (
                    head_response.headers.get('content-type', '')
                    .lower()
                    .split(';')[0]
                    .strip()
                )
                # A missing or malformed header counts as "unknown" (0); the
                # limit is enforced again while streaming.
                content_length = self._parse_content_length(
                    head_response.headers.get('content-length')
                )

                if not any(mime in content_type for mime in self.SUPPORTED_EXTENSIONS.values()):
                    logger.warning(f"Unsupported content type: {content_type} for URL: {url}")
                    return None

                if content_length > self.MAX_FILE_SIZE:
                    logger.warning(f"File too large ({content_length} bytes) for URL: {url}")
                    return None

                with session.get(url, timeout=30, stream=True) as response:
                    response.raise_for_status()

                    filename = self.get_filename_from_url(url, content_type)
                    file_path = os.path.join(self.downloads_dir, filename)

                    if not self._stream_to_file(response, file_path):
                        logger.warning(
                            f"File too large (more than {self.MAX_FILE_SIZE} bytes) for URL: {url}"
                        )
                        return None

            self.file_metadata.append({
                'filename': filename,
                'source_url': url,
                'source_page': source_page,
                'content_type': content_type,
                'download_time': datetime.now().isoformat(),
                'file_size': os.path.getsize(file_path),
            })
            self._update_metadata_file()

            self.downloaded_files.add(url)
            logger.info(f"Successfully downloaded: {url} to {file_path}")
            return file_path

        except requests.RequestException as e:
            self._record_failure(url, source_page, e)
            logger.error(f"Error downloading file from {url}: {str(e)}")
            return None
        except Exception as e:
            # Boundary handler: a single bad download must not stop the crawl.
            self._record_failure(url, source_page, e)
            logger.error(f"Unexpected error while downloading {url}: {str(e)}")
            return None

    def _update_metadata_file(self):
        """Rewrite the markdown report of successful and failed downloads.

        The report is regenerated in full from ``self.file_metadata`` and
        ``self.failed_downloads``. Failures to write it are logged, not raised.
        """
        lines = ["# Downloaded Files Metadata\n\n"]

        if self.file_metadata:
            lines.append("## Successfully Downloaded Files\n\n")
            for metadata in self.file_metadata:
                lines.append(f"### {metadata['filename']}\n")
                lines.append(f"- Source URL: {metadata['source_url']}\n")
                if metadata['source_page']:
                    lines.append(f"- Found on page: {metadata['source_page']}\n")
                lines.append(f"- Content Type: {metadata['content_type']}\n")
                lines.append(f"- Download Time: {metadata['download_time']}\n")
                lines.append(f"- File Size: {metadata['file_size']} bytes\n\n")

        if self.failed_downloads:
            lines.append("## Failed Downloads\n\n")
            for failed in self.failed_downloads:
                lines.append(f"### {failed['url']}\n")
                if failed['source_page']:
                    lines.append(f"- Found on page: {failed['source_page']}\n")
                lines.append(f"- Error: {failed['error']}\n")
                lines.append(f"- Time: {failed['time']}\n\n")

        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.writelines(lines)
        except Exception as e:
            # Deliberately best-effort: the report is auxiliary output.
            logger.error(f"Error updating metadata file: {str(e)}")

    def get_downloaded_files(self) -> List[str]:
        """Return the paths of all files in the downloads directory.

        The metadata report, which lives in the same directory, is not a
        download and is excluded. Paths are returned in sorted name order.
        """
        metadata_name = os.path.basename(self.metadata_file)
        return [
            os.path.join(self.downloads_dir, name)
            for name in sorted(os.listdir(self.downloads_dir))
            if name != metadata_name
            and os.path.isfile(os.path.join(self.downloads_dir, name))
        ]