euler314 committed · verified
Commit d35064f · Parent(s): 9ad3033

Update app.py

Files changed (1): app.py (+78 -46)
app.py CHANGED
@@ -585,41 +585,32 @@ class DownloadManager:
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []
-
+
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()
-
+
         try:
-            # Initialize base domains with the original URL
+            # Initialize base domains with the original URL
             self.get_base_domain(url)
-
-            # Get the real initial URL
-            real_url, _ = await self.get_real_url(url)
-
-            # Search main page
-            progress_text.text("Analyzing main page...")
-            main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
-            initial_count = len(main_files)
-            file_count_text.text(f"Found {initial_count} files on main page")
-
-            # Get and search sublinks
-            progress_text.text("Getting sublinks...")
-            sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
+
+            # First step: Get all sublinks
+            progress_text.text("Getting all sublinks from main page...")
+            sublinks = await self.get_sublinks(url, limit=sublink_limit)
             total_links = len(sublinks)
-
+
             progress_text.text(f"Found {total_links} sublinks to process")
-
-            if not sublinks:
+            if total_links == 0:
                 progress_bar.progress(1.0)
-                return main_files
-
-            # Process sublinks
-            all_files = main_files.copy()
-
+                # If no sublinks, try direct file search
+                return await self.extract_downloadable_files(url, custom_ext_list)
+
+            # Process main page and sublinks
+            all_files = []
+
             # Create semaphore for concurrent processing
             sem = asyncio.Semaphore(10)
-
+
             async def process_sublink(sublink, index):
                 async with sem:
                     try:
@@ -627,42 +618,85 @@ class DownloadManager:
                         progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                         progress_bar.progress(progress)
 
-                        # Set timeout for this sublink
                         async with async_timeout.timeout(timeout):
-                            # Get real URL before processing
-                            real_sublink, _ = await self.get_real_url(sublink)
-                            sub_files = await self.extract_downloadable_files(real_sublink, custom_ext_list)
-
-                            if sub_files:
-                                logger.info(f"Found {len(sub_files)} files at {real_sublink}")
-                                st.write(f"Found {len(sub_files)} files at {real_sublink}")
-
-                            return sub_files
+                            # First check if sublink itself leads to a file
+                            real_url, headers = await self.get_real_url(sublink)
+                            content_type = headers.get('content-type', '').lower()
+
+                            # If sublink is a file
+                            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
+                                return [{
+                                    'url': real_url,
+                                    'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                                    'size': await self.get_file_size(real_url),
+                                    'metadata': {}
+                                }]
+
+                            # If sublink is a page, check for download links
+                            await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
+                            content = await self.page.content()
+                            soup = BeautifulSoup(content, 'html.parser')
+
+                            # Find potential download links
+                            links = []
+                            for a in soup.find_all('a', href=True):
+                                href = a['href'].strip()
+                                if 'download' in href.lower() or 'visit.php' in href.lower():
+                                    links.append(href)
+
+                            # Process each potential download link
+                            sublink_files = []
+                            for href in links:
+                                try:
+                                    if not href.startswith('http'):
+                                        parsed_base = urlparse(real_url)
+                                        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+                                        href = base_url + ('/' if not href.startswith('/') else '') + href
+
+                                    final_url, _ = await self.get_real_url(href)
+                                    # Add file if it's a valid download
+                                    if any(final_url.lower().endswith(ext) for ext in custom_ext_list) or \
+                                       any(ext in await self.page.evaluate('() => document.contentType') for ext in ['pdf', 'zip']):
+                                        sublink_files.append({
+                                            'url': final_url,
+                                            'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                                            'size': await self.get_file_size(final_url),
+                                            'metadata': {}
+                                        })
+                                except Exception as e:
+                                    logger.error(f"Error processing download link {href}: {e}")
+                                    continue
+
+                            if sublink_files:
+                                logger.info(f"Found {len(sublink_files)} files at {real_url}")
+                                st.write(f"Found {len(sublink_files)} files at {real_url}")
+
+                            return sublink_files
+
                     except asyncio.TimeoutError:
                         logger.warning(f"Timeout processing sublink: {sublink}")
                         return []
                     except Exception as e:
                         logger.error(f"Error processing sublink {sublink}: {e}")
                         return []
-
-            # Process sublinks with concurrent tasks
+
+            # Process all sublinks concurrently
             tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
             sub_results = await asyncio.gather(*tasks)
-
+
             # Combine all results
             for sub_files in sub_results:
                 all_files.extend(sub_files)
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Make results unique based on URLs
+
+            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []
-
             for f in all_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
+
             final_count = len(unique_files)
             progress_text.text(f"Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
@@ -670,16 +704,14 @@ class DownloadManager:
 
             # Sort files by name for consistency
             unique_files.sort(key=lambda x: x['filename'].lower())
-
+
             return unique_files
-
+
         except Exception as e:
             logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
             return []
-
         finally:
-            # Clean up progress indicators after a delay
             await asyncio.sleep(2)
             try:
                 progress_text.empty()
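Both the removed and the added implementation rely on the same bounded-concurrency pattern: asyncio.Semaphore(10) caps how many sublinks are processed at once, async_timeout.timeout() caps how long any single sublink may take, and asyncio.gather() collects the per-sublink results. A minimal standalone sketch of that pattern follows; fetch_one is a hypothetical stand-in for the real per-sublink work and is not a helper from app.py.

import asyncio
import async_timeout  # third-party package already used by app.py

async def fetch_one(url):
    # Placeholder for the real work (resolve the URL, parse the page, ...).
    await asyncio.sleep(0.1)
    return [url]

async def process_all(urls, concurrency=10, per_url_timeout=30):
    sem = asyncio.Semaphore(concurrency)      # at most `concurrency` sublinks in flight

    async def worker(url):
        async with sem:                       # take a slot before doing any work
            try:
                async with async_timeout.timeout(per_url_timeout):
                    return await fetch_one(url)
            except asyncio.TimeoutError:
                return []                     # a timed-out sublink simply yields no files

    results = await asyncio.gather(*(worker(u) for u in urls))
    return [item for per_url in results for item in per_url]   # flatten

# asyncio.run(process_all(["https://example.com/a", "https://example.com/b"]))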
 
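deep_search is a coroutine that updates Streamlit widgets while it crawls, so the app has to drive it from an event loop. A hedged sketch of a caller, assuming dm is an already-initialized DownloadManager (its construction and browser setup live elsewhere in app.py and are not part of this diff); the URL and extension list are placeholders.

import asyncio

async def run_deep_search(dm, url):
    # dm: an initialized DownloadManager from app.py (setup not shown here).
    return await dm.deep_search(
        url,
        custom_ext_list=['.pdf', '.zip'],   # extra extensions to accept
        sublink_limit=100,                  # cap on sublinks to crawl
        timeout=30,                         # per-sublink timeout, in seconds
    )

# files = asyncio.run(run_deep_search(dm, "https://example.com/downloads"))
# Each result is a dict with 'url', 'filename', 'size' and 'metadata' keys,
# deduplicated by URL and sorted by filename, as in the code above.

One note on the new relative-link handling in process_sublink: the manual scheme/netloc concatenation could also be expressed with urllib.parse.urljoin(real_url, href), which resolves hrefs relative to the current page rather than the site root; which behaviour is preferable depends on the sites being crawled.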