Update app.py
app.py
CHANGED
@@ -582,7 +582,7 @@ class DownloadManager:
             logger.error(f"Error getting sublinks: {e}")
             return []

-
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []

@@ -591,7 +591,7 @@ class DownloadManager:
         file_count_text = st.empty()

         try:
-
+            # Initialize base domains with the original URL
             self.get_base_domain(url)

             # First step: Get all sublinks
@@ -614,63 +614,59 @@ class DownloadManager:
            async def process_sublink(sublink, index):
                async with sem:
                    try:
-                        progress = index/total_links
+                        progress = (index) / total_links
                        progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                        progress_bar.progress(progress)

                        async with async_timeout.timeout(timeout):
-                            #
+                            # Get the final URL and headers for this sublink
                            real_url, headers = await self.get_real_url(sublink)
                            content_type = headers.get('content-type', '').lower()
-
-                            # If sublink is a file
-                            if any(
+
+                            # If the sublink itself is a downloadable file, return it
+                            if any(x in content_type for x in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
                                return [{
                                    'url': real_url,
                                    'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                                    'size': await self.get_file_size(real_url),
                                    'metadata': {}
                                }]
-
-
+
+                            # Otherwise, treat it as a webpage and search for file links
                            await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
                            content = await self.page.content()
                            soup = BeautifulSoup(content, 'html.parser')
-
-
-
+
+                            # Define default and custom file extensions
+                            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
+                                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
+                            custom_exts = [ext.strip().lower() for ext in custom_ext_list if ext.strip()]
+                            file_exts = set(default_exts + custom_exts)
+
+                            sublink_files = []
+                            # Iterate over all anchor tags found on the page
                            for a in soup.find_all('a', href=True):
                                href = a['href'].strip()
-                                if
-                                    links.append(href)
-
-                            # Process each potential download link
-                            sublink_files = []
-                            for href in links:
-                                try:
-                                    if not href.startswith('http'):
-                                        parsed_base = urlparse(real_url)
-                                        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-                                        href = base_url + ('/' if not href.startswith('/') else '') + href
-
-                                    final_url, _ = await self.get_real_url(href)
-                                    # Add file if it's a valid download
-                                    if any(final_url.lower().endswith(ext) for ext in custom_ext_list) or \
-                                       any(ext in await self.page.evaluate('() => document.contentType') for ext in ['pdf', 'zip']):
-                                        sublink_files.append({
-                                            'url': final_url,
-                                            'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
-                                            'size': await self.get_file_size(final_url),
-                                            'metadata': {}
-                                        })
-                                except Exception as e:
-                                    logger.error(f"Error processing download link {href}: {e}")
+                                if not href:
                                    continue
-
+                                # Convert any relative URL to an absolute URL
+                                full_url = urljoin(real_url, href)
+                                if any(full_url.lower().endswith(ext) for ext in file_exts):
+                                    final_url, _ = await self.get_real_url(full_url)
+                                    file_info = {
+                                        'url': final_url,
+                                        'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                                        'size': await self.get_file_size(final_url),
+                                        'metadata': {}
+                                    }
+                                    if final_url.lower().endswith('.pdf'):
+                                        file_info['metadata'] = await self.get_pdf_metadata(final_url)
+                                    sublink_files.append(file_info)
+
                            if sublink_files:
                                logger.info(f"Found {len(sublink_files)} files at {real_url}")
                                st.write(f"Found {len(sublink_files)} files at {real_url}")
-
+
                            return sublink_files

                    except asyncio.TimeoutError:
@@ -681,7 +677,7 @@ class DownloadManager:
                        return []

            # Process all sublinks concurrently
-            tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
+            tasks = [process_sublink(sublink, i + 1) for i, sublink in enumerate(sublinks)]
            sub_results = await asyncio.gather(*tasks)

            # Combine all results
@@ -689,7 +685,7 @@ class DownloadManager:
                all_files.extend(sub_files)
                file_count_text.text(f"Found {len(all_files)} total files")

-
+            # Remove duplicates based on URL
            seen_urls = set()
            unique_files = []
            for f in all_files:
@@ -698,11 +694,11 @@ class DownloadManager:
                    unique_files.append(f)

            final_count = len(unique_files)
-            progress_text.text(
+            progress_text.text("Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)

-            # Sort files by
+            # Sort files by filename for consistency
            unique_files.sort(key=lambda x: x['filename'].lower())

            return unique_files
@@ -719,6 +715,7 @@ class DownloadManager:
                file_count_text.empty()
            except:
                pass
+
def main():
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True
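A note on the link handling introduced above: the old code built absolute URLs by concatenating the scheme and netloc by hand, which mishandles hrefs like "../file.pdf" or query-only links, while the new code delegates to urljoin, which resolves every href form against the page it was found on. The change does assume urljoin is imported from urllib.parse alongside the urlparse already in use. A minimal standalone sketch (the URLs are hypothetical):

from urllib.parse import urljoin

page = "https://example.com/docs/index.html"  # hypothetical page URL

print(urljoin(page, "report.pdf"))      # https://example.com/docs/report.pdf
print(urljoin(page, "/files/a.zip"))    # https://example.com/files/a.zip
print(urljoin(page, "../media/b.mp3"))  # https://example.com/media/b.mp3
print(urljoin(page, "https://cdn.example.com/c.mp4"))  # absolute hrefs pass through unchanged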
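The tasks list plus asyncio.gather launches one coroutine per sublink, while the semaphore acquired at the top of process_sublink (sem, created outside the hunks shown) caps how many run at once. A self-contained sketch of the same bounded-concurrency pattern, with hypothetical names and a sleep standing in for the real per-sublink work:

import asyncio

async def process(item, index, sem):
    # The semaphore bounds concurrency: only N coroutines
    # get past this line at any one time.
    async with sem:
        await asyncio.sleep(0.1)  # stand-in for the real per-sublink work
        return f"processed {index}: {item}"

async def main():
    items = ["a", "b", "c", "d"]
    sem = asyncio.Semaphore(2)  # at most 2 items in flight
    tasks = [process(item, i + 1, sem) for i, item in enumerate(items)]
    results = await asyncio.gather(*tasks)
    print(results)

asyncio.run(main())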
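Finally, a usage sketch: deep_search is a coroutine that drives a Playwright page and Streamlit widgets, so it must run inside an event loop after the manager's browser page is initialized. None of that scaffolding appears in this diff, so the driver below is an assumption, not the app's actual entry point:

import asyncio

async def run_deep_search():
    # Hypothetical driver: DownloadManager construction and page setup are
    # not shown in this diff, so treat this scaffolding as an assumption.
    dm = DownloadManager()
    files = await dm.deep_search(
        "https://example.com/downloads",    # hypothetical starting URL
        custom_ext_list=[".csv", ".xlsx"],  # merged into file_exts with default_exts
        sublink_limit=100,
        timeout=30,
    )
    for f in files:
        print(f["filename"], f["size"], f["url"])

asyncio.run(run_deep_search())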