euler314 committed · verified
Commit d35064f · Parent(s): 9ad3033

Update app.py

Files changed (1): app.py (+78 -46)
app.py CHANGED
@@ -585,41 +585,32 @@ class DownloadManager:
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []
-
+
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()
-
+
         try:
-            # Initialize base domains with the original URL
+            # Initialize base domains with the original URL
             self.get_base_domain(url)
-
-            # Get the real initial URL
-            real_url, _ = await self.get_real_url(url)
-
-            # Search main page
-            progress_text.text("Analyzing main page...")
-            main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
-            initial_count = len(main_files)
-            file_count_text.text(f"Found {initial_count} files on main page")
-
-            # Get and search sublinks
-            progress_text.text("Getting sublinks...")
-            sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
+
+            # First step: Get all sublinks
+            progress_text.text("Getting all sublinks from main page...")
+            sublinks = await self.get_sublinks(url, limit=sublink_limit)
             total_links = len(sublinks)
-
+
             progress_text.text(f"Found {total_links} sublinks to process")
-
-            if not sublinks:
+            if total_links == 0:
                 progress_bar.progress(1.0)
-                return main_files
-
-            # Process sublinks
-            all_files = main_files.copy()
-
+                # If no sublinks, try direct file search
+                return await self.extract_downloadable_files(url, custom_ext_list)
+
+            # Process main page and sublinks
+            all_files = []
+
             # Create semaphore for concurrent processing
             sem = asyncio.Semaphore(10)
-
+
             async def process_sublink(sublink, index):
                 async with sem:
                     try:
@@ -627,42 +618,85 @@ class DownloadManager:
                         progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                         progress_bar.progress(progress)
 
-                        # Set timeout for this sublink
                         async with async_timeout.timeout(timeout):
-                            # Get real URL before processing
-                            real_sublink, _ = await self.get_real_url(sublink)
-                            sub_files = await self.extract_downloadable_files(real_sublink, custom_ext_list)
-
-                            if sub_files:
-                                logger.info(f"Found {len(sub_files)} files at {real_sublink}")
-                                st.write(f"Found {len(sub_files)} files at {real_sublink}")
-
-                            return sub_files
+                            # First check if sublink itself leads to a file
+                            real_url, headers = await self.get_real_url(sublink)
+                            content_type = headers.get('content-type', '').lower()
+
+                            # If sublink is a file
+                            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
+                                return [{
+                                    'url': real_url,
+                                    'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                                    'size': await self.get_file_size(real_url),
+                                    'metadata': {}
+                                }]
+
+                            # If sublink is a page, check for download links
+                            await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
+                            content = await self.page.content()
+                            soup = BeautifulSoup(content, 'html.parser')
+
+                            # Find potential download links
+                            links = []
+                            for a in soup.find_all('a', href=True):
+                                href = a['href'].strip()
+                                if 'download' in href.lower() or 'visit.php' in href.lower():
+                                    links.append(href)
+
+                            # Process each potential download link
+                            sublink_files = []
+                            for href in links:
+                                try:
+                                    if not href.startswith('http'):
+                                        parsed_base = urlparse(real_url)
+                                        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+                                        href = base_url + ('/' if not href.startswith('/') else '') + href
+
+                                    final_url, _ = await self.get_real_url(href)
+                                    # Add file if it's a valid download
+                                    if any(final_url.lower().endswith(ext) for ext in custom_ext_list) or \
+                                       any(ext in await self.page.evaluate('() => document.contentType') for ext in ['pdf', 'zip']):
+                                        sublink_files.append({
+                                            'url': final_url,
+                                            'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                                            'size': await self.get_file_size(final_url),
+                                            'metadata': {}
+                                        })
+                                except Exception as e:
+                                    logger.error(f"Error processing download link {href}: {e}")
+                                    continue
+
+                            if sublink_files:
+                                logger.info(f"Found {len(sublink_files)} files at {real_url}")
+                                st.write(f"Found {len(sublink_files)} files at {real_url}")
+
+                            return sublink_files
+
                     except asyncio.TimeoutError:
                         logger.warning(f"Timeout processing sublink: {sublink}")
                         return []
                     except Exception as e:
                         logger.error(f"Error processing sublink {sublink}: {e}")
                         return []
-
-            # Process sublinks with concurrent tasks
+
+            # Process all sublinks concurrently
             tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
             sub_results = await asyncio.gather(*tasks)
-
+
             # Combine all results
             for sub_files in sub_results:
                 all_files.extend(sub_files)
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Make results unique based on URLs
+
+            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []
-
             for f in all_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
+
             final_count = len(unique_files)
             progress_text.text(f"Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
@@ -670,16 +704,14 @@ class DownloadManager:
 
             # Sort files by name for consistency
             unique_files.sort(key=lambda x: x['filename'].lower())
-
+
             return unique_files
-
+
         except Exception as e:
             logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
             return []
-
         finally:
-            # Clean up progress indicators after a delay
             await asyncio.sleep(2)
             try:
                 progress_text.empty()
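Both the removed and the added implementation rely on the same bounded-concurrency pattern: asyncio.Semaphore(10) caps how many sublinks are processed at once, async_timeout.timeout() caps how long any single sublink may take, and asyncio.gather() collects the per-sublink results. A minimal standalone sketch of that pattern follows; fetch_one is a hypothetical stand-in for the real per-sublink work and is not a helper from app.py.

import asyncio
import async_timeout  # third-party package already used by app.py

async def fetch_one(url):
    # Placeholder for the real work (resolve the URL, parse the page, ...).
    await asyncio.sleep(0.1)
    return [url]

async def process_all(urls, concurrency=10, per_url_timeout=30):
    sem = asyncio.Semaphore(concurrency)      # at most `concurrency` sublinks in flight

    async def worker(url):
        async with sem:                       # take a slot before doing any work
            try:
                async with async_timeout.timeout(per_url_timeout):
                    return await fetch_one(url)
            except asyncio.TimeoutError:
                return []                     # a timed-out sublink simply yields no files

    results = await asyncio.gather(*(worker(u) for u in urls))
    return [item for per_url in results for item in per_url]   # flatten

# asyncio.run(process_all(["https://example.com/a", "https://example.com/b"]))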
 
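deep_search is a coroutine that updates Streamlit widgets while it crawls, so the app has to drive it from an event loop. A hedged sketch of a caller, assuming dm is an already-initialized DownloadManager (its construction and browser setup live elsewhere in app.py and are not part of this diff); the URL and extension list are placeholders.

import asyncio

async def run_deep_search(dm, url):
    # dm: an initialized DownloadManager from app.py (setup not shown here).
    return await dm.deep_search(
        url,
        custom_ext_list=['.pdf', '.zip'],   # extra extensions to accept
        sublink_limit=100,                  # cap on sublinks to crawl
        timeout=30,                         # per-sublink timeout, in seconds
    )

# files = asyncio.run(run_deep_search(dm, "https://example.com/downloads"))
# Each result is a dict with 'url', 'filename', 'size' and 'metadata' keys,
# deduplicated by URL and sorted by filename, as in the code above.

One note on the new relative-link handling in process_sublink: the manual scheme/netloc concatenation could also be expressed with urllib.parse.urljoin(real_url, href), which resolves hrefs relative to the current page rather than the site root; which behaviour is preferable depends on the sites being crawled.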