# Phidata / file_handler.py
import os
import requests
from typing import Optional, List, Set
from urllib.parse import urlparse, unquote
from datetime import datetime

from save_report import save_markdown_report
from phi.utils.log import logger


class FileHandler:
"""Handler for downloading and saving files discovered during web crawling."""
SUPPORTED_EXTENSIONS = {
'pdf': 'application/pdf',
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'csv': 'text/csv'
}
# Common browser headers
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}

    def __init__(self):
        # Get the report directory for the current date
        self.report_dir, _ = save_markdown_report()
        self.downloaded_files: Set[str] = set()
        self.file_metadata: List[dict] = []
        self.failed_downloads: List[dict] = []  # Track failed downloads
        # Create a subdirectory for downloaded files
        self.downloads_dir = os.path.join(self.report_dir, 'downloads')
        os.makedirs(self.downloads_dir, exist_ok=True)
        # Create a metadata file to track downloaded files
        self.metadata_file = os.path.join(self.downloads_dir, 'files_metadata.md')
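        # Resulting layout (the report directory comes from save_markdown_report()):
        #   <report_dir>/downloads/                   <- saved files land here
        #   <report_dir>/downloads/files_metadata.md  <- human-readable download log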

    def is_supported_file(self, url: str) -> bool:
        """Check if the URL points to a supported file type."""
        parsed_url = urlparse(url)
        extension = os.path.splitext(parsed_url.path)[1].lower().lstrip('.')
        return extension in self.SUPPORTED_EXTENSIONS
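    # For example (hypothetical URLs):
    #   is_supported_file("https://example.com/data/report.pdf") -> True
    #   is_supported_file("https://example.com/index.html")      -> False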

    def get_filename_from_url(self, url: str, content_type: Optional[str] = None) -> str:
        """Generate a safe, unique filename from the URL."""
        # Get the filename from the URL
        parsed_url = urlparse(url)
        filename = os.path.basename(unquote(parsed_url.path))
        # If no filename in URL, create one based on content type
        if not filename:
            extension = next(
                (ext for ext, mime in self.SUPPORTED_EXTENSIONS.items()
                 if mime == content_type),
                'unknown'
            )
            filename = f"downloaded_file.{extension}"
        # Keep only characters that are safe on common filesystems
        safe_filename = "".join(c for c in filename if c.isalnum() or c in '._-')
        if not safe_filename:
            # Everything was stripped (e.g. a fully non-ASCII name); fall back to a default
            safe_filename = "downloaded_file"
        base, ext = os.path.splitext(safe_filename)
        # Add a numeric suffix if a file with this name already exists
        counter = 1
        while os.path.exists(os.path.join(self.downloads_dir, safe_filename)):
            safe_filename = f"{base}_{counter}{ext}"
            counter += 1
        return safe_filename

    def download_file(self, url: str, source_page: Optional[str] = None) -> Optional[str]:
        """
        Download a file from the URL and save it to the downloads directory.
        Returns the path to the saved file if successful, None otherwise.
        """
        if url in self.downloaded_files:
            logger.info(f"File already downloaded: {url}")
            return None
        try:
            # Use a session so headers persist across redirects, and close it when done
            with requests.Session() as session:
                session.headers.update(self.HEADERS)
                # First make a HEAD request to check content type and size
                head_response = session.head(url, timeout=10, allow_redirects=True)
                head_response.raise_for_status()
                content_type = head_response.headers.get('content-type', '').lower().split(';')[0]
                # 'or 0' guards against a missing or empty Content-Length header
                content_length = int(head_response.headers.get('content-length') or 0)
                # Check that the content type is supported and the size is reasonable
                if not any(mime in content_type for mime in self.SUPPORTED_EXTENSIONS.values()):
                    logger.warning(f"Unsupported content type: {content_type} for URL: {url}")
                    return None
                if content_length > 100 * 1024 * 1024:  # 100MB limit
                    logger.warning(f"File too large ({content_length} bytes) for URL: {url}")
                    return None
                # Make the actual download request
                response = session.get(url, timeout=30, stream=True)
                response.raise_for_status()
                # Generate a safe filename and stream the body to disk
                filename = self.get_filename_from_url(url, content_type)
                file_path = os.path.join(self.downloads_dir, filename)
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Record metadata
            metadata = {
                'filename': filename,
                'source_url': url,
                'source_page': source_page,
                'content_type': content_type,
                'download_time': datetime.now().isoformat(),
                'file_size': os.path.getsize(file_path)
            }
            self.file_metadata.append(metadata)
            # Update metadata file
            self._update_metadata_file()
            self.downloaded_files.add(url)
            logger.info(f"Successfully downloaded: {url} to {file_path}")
            return file_path
        except requests.RequestException as e:
            error_info = {
                'url': url,
                'source_page': source_page,
                'error': str(e),
                'time': datetime.now().isoformat()
            }
            self.failed_downloads.append(error_info)
            self._update_metadata_file()  # Update metadata including failed downloads
            logger.error(f"Error downloading file from {url}: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error while downloading {url}: {str(e)}")
            return None

    def _update_metadata_file(self):
        """Update the metadata markdown file with information about downloaded files."""
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.write("# Downloaded Files Metadata\n\n")
                # Successful downloads
                if self.file_metadata:
                    f.write("## Successfully Downloaded Files\n\n")
                    for metadata in self.file_metadata:
                        f.write(f"### {metadata['filename']}\n")
                        f.write(f"- Source URL: {metadata['source_url']}\n")
                        if metadata['source_page']:
                            f.write(f"- Found on page: {metadata['source_page']}\n")
                        f.write(f"- Content Type: {metadata['content_type']}\n")
                        f.write(f"- Download Time: {metadata['download_time']}\n")
                        f.write(f"- File Size: {metadata['file_size']} bytes\n\n")
                # Failed downloads
                if self.failed_downloads:
                    f.write("## Failed Downloads\n\n")
                    for failed in self.failed_downloads:
                        f.write(f"### {failed['url']}\n")
                        if failed['source_page']:
                            f.write(f"- Found on page: {failed['source_page']}\n")
                        f.write(f"- Error: {failed['error']}\n")
                        f.write(f"- Time: {failed['time']}\n\n")
        except Exception as e:
            logger.error(f"Error updating metadata file: {str(e)}")

    def get_downloaded_files(self) -> List[str]:
        """Return a list of all downloaded file paths."""
        return [os.path.join(self.downloads_dir, f)
                for f in os.listdir(self.downloads_dir)
                if os.path.isfile(os.path.join(self.downloads_dir, f))]
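

# Minimal usage sketch (not part of the original module). FileHandler() calls
# save_markdown_report() from the local save_report module to resolve its report
# directory, so this only runs inside the full project; the URLs below are
# hypothetical placeholders.
if __name__ == "__main__":
    handler = FileHandler()
    example_url = "https://example.com/files/report.pdf"  # hypothetical URL
    if handler.is_supported_file(example_url):
        saved = handler.download_file(example_url, source_page="https://example.com/")
        print(f"Saved to: {saved}")
    print(f"Downloaded files: {handler.get_downloaded_files()}")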