File size: 8,308 Bytes
1be3350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import os
import requests
from typing import Optional, List, Set
from urllib.parse import urlparse, unquote
from pathlib import Path
from datetime import datetime
from save_report import save_markdown_report
from phi.utils.log import logger


class FileHandler:
    """Handler for downloading and saving files discovered during web crawling.

    Files are saved under a ``downloads`` subdirectory of the report
    directory returned by ``save_markdown_report()``. A markdown metadata
    file in that directory tracks both successful and failed downloads.
    """

    # Extension -> expected MIME type for file types we are willing to download.
    SUPPORTED_EXTENSIONS = {
        'pdf': 'application/pdf',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'csv': 'text/csv'
    }

    # Common browser headers (some servers reject clients without them).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    # Refuse to download anything larger than this many bytes.
    MAX_FILE_SIZE = 100 * 1024 * 1024  # 100 MB

    def __init__(self):
        # Get the report directory for the current date.
        self.report_dir, _ = save_markdown_report()
        self.downloaded_files: Set[str] = set()      # URLs already fetched (dedup)
        self.file_metadata: List[dict] = []          # one record per successful download
        self.failed_downloads: List[dict] = []       # one record per failed attempt

        # Create a subdirectory for downloaded files.
        self.downloads_dir = os.path.join(self.report_dir, 'downloads')
        os.makedirs(self.downloads_dir, exist_ok=True)

        # Markdown file that tracks downloaded (and failed) files.
        self.metadata_file = os.path.join(self.downloads_dir, 'files_metadata.md')

    def is_supported_file(self, url: str) -> bool:
        """Return True if the URL path ends in a supported file extension."""
        parsed_url = urlparse(url)
        extension = os.path.splitext(parsed_url.path)[1].lower().lstrip('.')
        return extension in self.SUPPORTED_EXTENSIONS

    def get_filename_from_url(self, url: str, content_type: Optional[str] = None) -> str:
        """Generate a safe, unique filename from the URL.

        Args:
            url: The download URL; its path's basename seeds the filename.
            content_type: MIME type used to pick an extension when the URL
                path has no filename component.

        Returns:
            A sanitized filename that does not collide with any existing
            file in ``self.downloads_dir``.
        """
        # Take the (percent-decoded) basename of the URL path.
        parsed_url = urlparse(url)
        filename = os.path.basename(unquote(parsed_url.path))

        # If no filename in the URL, derive an extension from the MIME type.
        if not filename:
            extension = next(
                (ext for ext, mime in self.SUPPORTED_EXTENSIONS.items()
                 if mime == content_type),
                'unknown'
            )
            filename = f"downloaded_file.{extension}"

        # Keep only characters that are safe on common filesystems.
        safe_filename = "".join(c for c in filename if c.isalnum() or c in '._-')
        base, ext = os.path.splitext(safe_filename)

        # Sanitization can strip the whole base (e.g. a name of only special
        # characters), which would yield a hidden file like ".pdf" — fall back.
        if not base:
            base = "downloaded_file"
            safe_filename = base + ext

        # Add a numeric suffix until the name is unique in the downloads dir.
        counter = 1
        while os.path.exists(os.path.join(self.downloads_dir, safe_filename)):
            safe_filename = f"{base}_{counter}{ext}"
            counter += 1

        return safe_filename

    def download_file(self, url: str, source_page: Optional[str] = None) -> Optional[str]:
        """
        Download a file from the URL and save it to the downloads directory.

        Args:
            url: Direct link to the file.
            source_page: URL of the page the link was found on (for metadata).

        Returns:
            The path to the saved file if successful, None otherwise.
        """
        if url in self.downloaded_files:
            logger.info(f"File already downloaded: {url}")
            return None

        file_path = None  # set once we start writing, so failures can clean up
        try:
            # A session keeps headers across redirects; the 'with' block
            # guarantees the underlying connections are closed (was leaked).
            with requests.Session() as session:
                session.headers.update(self.HEADERS)

                # HEAD first to check content type and size cheaply.
                head_response = session.head(url, timeout=10, allow_redirects=True)
                head_response.raise_for_status()

                content_type = head_response.headers.get('content-type', '').lower().split(';')[0]
                # A missing or malformed content-length counts as 0 (size gate passes).
                try:
                    content_length = int(head_response.headers.get('content-length', 0))
                except ValueError:
                    content_length = 0

                if not any(mime in content_type for mime in self.SUPPORTED_EXTENSIONS.values()):
                    logger.warning(f"Unsupported content type: {content_type} for URL: {url}")
                    return None

                if content_length > self.MAX_FILE_SIZE:
                    logger.warning(f"File too large ({content_length} bytes) for URL: {url}")
                    return None

                # Stream the actual download to avoid holding the body in memory.
                response = session.get(url, timeout=30, stream=True)
                response.raise_for_status()

                filename = self.get_filename_from_url(url, content_type)
                file_path = os.path.join(self.downloads_dir, filename)

                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)

            # Record metadata for the successful download.
            metadata = {
                'filename': filename,
                'source_url': url,
                'source_page': source_page,
                'content_type': content_type,
                'download_time': datetime.now().isoformat(),
                'file_size': os.path.getsize(file_path)
            }
            self.file_metadata.append(metadata)
            self._update_metadata_file()

            self.downloaded_files.add(url)
            logger.info(f"Successfully downloaded: {url} to {file_path}")
            return file_path

        except requests.RequestException as e:
            logger.error(f"Error downloading file from {url}: {str(e)}")
            self._record_failure(url, source_page, e, file_path)
            return None
        except Exception as e:
            # Previously unexpected errors were logged but never recorded in
            # failed_downloads; record them the same way for consistency.
            logger.error(f"Unexpected error while downloading {url}: {str(e)}")
            self._record_failure(url, source_page, e, file_path)
            return None

    def _record_failure(self, url: str, source_page: Optional[str], error: Exception,
                        partial_path: Optional[str] = None) -> None:
        """Record a failed download and remove any partially written file."""
        if partial_path:
            try:
                os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup; the file may never have been created
        self.failed_downloads.append({
            'url': url,
            'source_page': source_page,
            'error': str(error),
            'time': datetime.now().isoformat()
        })
        self._update_metadata_file()  # keep the metadata file current

    def _update_metadata_file(self):
        """Update the metadata markdown file with information about downloaded files."""
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.write("# Downloaded Files Metadata\n\n")

                # Successful downloads
                if self.file_metadata:
                    f.write("## Successfully Downloaded Files\n\n")
                    for metadata in self.file_metadata:
                        f.write(f"### {metadata['filename']}\n")
                        f.write(f"- Source URL: {metadata['source_url']}\n")
                        if metadata['source_page']:
                            f.write(f"- Found on page: {metadata['source_page']}\n")
                        f.write(f"- Content Type: {metadata['content_type']}\n")
                        f.write(f"- Download Time: {metadata['download_time']}\n")
                        f.write(f"- File Size: {metadata['file_size']} bytes\n\n")

                # Failed downloads
                if self.failed_downloads:
                    f.write("## Failed Downloads\n\n")
                    for failed in self.failed_downloads:
                        f.write(f"### {failed['url']}\n")
                        if failed['source_page']:
                            f.write(f"- Found on page: {failed['source_page']}\n")
                        f.write(f"- Error: {failed['error']}\n")
                        f.write(f"- Time: {failed['time']}\n\n")

        except Exception as e:
            # Metadata is best-effort; never let bookkeeping break a download.
            logger.error(f"Error updating metadata file: {str(e)}")

    def get_downloaded_files(self) -> List[str]:
        """Return a list of all downloaded file paths."""
        return [os.path.join(self.downloads_dir, f)
                for f in os.listdir(self.downloads_dir)
                if os.path.isfile(os.path.join(self.downloads_dir, f))]