# Phidata / file_handler.py
import os
import requests
from typing import Optional, List, Set
from urllib.parse import urlparse, unquote
from pathlib import Path
from datetime import datetime
from save_report import save_markdown_report
from phi.utils.log import logger

class FileHandler:
    """Handler for downloading and saving files discovered during web crawling."""

    SUPPORTED_EXTENSIONS = {
        'pdf': 'application/pdf',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'csv': 'text/csv'
    }

    # Common browser headers, sent so servers treat requests like a regular browser
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    def __init__(self):
        # Get the report directory for the current date
        self.report_dir, _ = save_markdown_report()
        self.downloaded_files: Set[str] = set()
        self.file_metadata: List[dict] = []
        self.failed_downloads: List[dict] = []  # Track failed downloads

        # Create a subdirectory for downloaded files
        self.downloads_dir = os.path.join(self.report_dir, 'downloads')
        os.makedirs(self.downloads_dir, exist_ok=True)

        # Create a metadata file to track downloaded files
        self.metadata_file = os.path.join(self.downloads_dir, 'files_metadata.md')

    def is_supported_file(self, url: str) -> bool:
        """Check if the URL points to a supported file type."""
        parsed_url = urlparse(url)
        extension = os.path.splitext(parsed_url.path)[1].lower().lstrip('.')
        return extension in self.SUPPORTED_EXTENSIONS
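    # For example (hypothetical URLs, shown only to illustrate the check):
    #   is_supported_file("https://example.com/data/report.pdf")  -> True
    #   is_supported_file("https://example.com/index.html")       -> False
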
    def get_filename_from_url(self, url: str, content_type: Optional[str] = None) -> str:
        """Generate a safe filename from the URL."""
        # Get the filename from the URL
        parsed_url = urlparse(url)
        filename = os.path.basename(unquote(parsed_url.path))

        # If no filename in URL, create one based on content type
        if not filename:
            extension = next(
                (ext for ext, mime in self.SUPPORTED_EXTENSIONS.items()
                 if mime == content_type),
                'unknown'
            )
            filename = f"downloaded_file.{extension}"

        # Ensure filename is safe and unique
        safe_filename = "".join(c for c in filename if c.isalnum() or c in '._-')
        base, ext = os.path.splitext(safe_filename)

        # Add number suffix if file exists
        counter = 1
        while os.path.exists(os.path.join(self.downloads_dir, safe_filename)):
            safe_filename = f"{base}_{counter}{ext}"
            counter += 1

        return safe_filename
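    # For example, with a hypothetical "report.pdf" already on disk, the next
    # download is saved as "report_1.pdf", the one after as "report_2.pdf".
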
    def download_file(self, url: str, source_page: Optional[str] = None) -> Optional[str]:
        """
        Download a file from the URL and save it to the downloads directory.
        Returns the path to the saved file if successful, None otherwise.
        """
        if url in self.downloaded_files:
            logger.info(f"File already downloaded: {url}")
            return None

        try:
            # Create a session to maintain headers across redirects
            session = requests.Session()
            session.headers.update(self.HEADERS)

            # First make a HEAD request to check content type and size
            head_response = session.head(url, timeout=10, allow_redirects=True)
            head_response.raise_for_status()

            content_type = head_response.headers.get('content-type', '').lower().split(';')[0]
            # Content-Length may be absent or empty on HEAD responses; default to 0
            content_length = int(head_response.headers.get('content-length') or 0)

            # Check that the content type is supported and the size is reasonable
            if not any(mime in content_type for mime in self.SUPPORTED_EXTENSIONS.values()):
                logger.warning(f"Unsupported content type: {content_type} for URL: {url}")
                return None
            if content_length > 100 * 1024 * 1024:  # 100MB limit
                logger.warning(f"File too large ({content_length} bytes) for URL: {url}")
                return None

            # Make the actual download request
            response = session.get(url, timeout=30, stream=True)
            response.raise_for_status()

            # Generate safe filename
            filename = self.get_filename_from_url(url, content_type)
            file_path = os.path.join(self.downloads_dir, filename)

            # Save the file in chunks to avoid loading it fully into memory
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            # Record metadata
            metadata = {
                'filename': filename,
                'source_url': url,
                'source_page': source_page,
                'content_type': content_type,
                'download_time': datetime.now().isoformat(),
                'file_size': os.path.getsize(file_path)
            }
            self.file_metadata.append(metadata)

            # Update metadata file
            self._update_metadata_file()

            self.downloaded_files.add(url)
            logger.info(f"Successfully downloaded: {url} to {file_path}")
            return file_path

        except requests.RequestException as e:
            error_info = {
                'url': url,
                'source_page': source_page,
                'error': str(e),
                'time': datetime.now().isoformat()
            }
            self.failed_downloads.append(error_info)
            self._update_metadata_file()  # Update metadata including failed downloads
            logger.error(f"Error downloading file from {url}: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error while downloading {url}: {str(e)}")
            return None

    def _update_metadata_file(self):
        """Update the metadata markdown file with information about downloaded files."""
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.write("# Downloaded Files Metadata\n\n")

                # Successful downloads
                if self.file_metadata:
                    f.write("## Successfully Downloaded Files\n\n")
                    for metadata in self.file_metadata:
                        f.write(f"### {metadata['filename']}\n")
                        f.write(f"- Source URL: {metadata['source_url']}\n")
                        if metadata['source_page']:
                            f.write(f"- Found on page: {metadata['source_page']}\n")
                        f.write(f"- Content Type: {metadata['content_type']}\n")
                        f.write(f"- Download Time: {metadata['download_time']}\n")
                        f.write(f"- File Size: {metadata['file_size']} bytes\n\n")

                # Failed downloads
                if self.failed_downloads:
                    f.write("## Failed Downloads\n\n")
                    for failed in self.failed_downloads:
                        f.write(f"### {failed['url']}\n")
                        if failed['source_page']:
                            f.write(f"- Found on page: {failed['source_page']}\n")
                        f.write(f"- Error: {failed['error']}\n")
                        f.write(f"- Time: {failed['time']}\n\n")
        except Exception as e:
            logger.error(f"Error updating metadata file: {str(e)}")

    def get_downloaded_files(self) -> List[str]:
        """Return a list of all downloaded file paths."""
        return [os.path.join(self.downloads_dir, f)
                for f in os.listdir(self.downloads_dir)
                if os.path.isfile(os.path.join(self.downloads_dir, f))]
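

# A minimal usage sketch (assumptions: save_markdown_report() is importable and
# network access is available; the URL below is hypothetical).
if __name__ == "__main__":
    handler = FileHandler()
    example_url = "https://example.com/files/report.pdf"  # hypothetical URL
    if handler.is_supported_file(example_url):
        saved_path = handler.download_file(example_url, source_page="https://example.com")
        print(f"Saved to: {saved_path}")
    print(f"All downloaded files: {handler.get_downloaded_files()}")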