euler314 committed
Commit 09565a0 · verified · 1 Parent(s): d35064f

Update app.py

Files changed (1)
  1. app.py +39 -42
app.py CHANGED
@@ -582,7 +582,7 @@ class DownloadManager:
             logger.error(f"Error getting sublinks: {e}")
             return []
 
-    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []
 
@@ -591,7 +591,7 @@ class DownloadManager:
         file_count_text = st.empty()
 
         try:
-            # Initialize base domains with the original URL
+            # Initialize base domains with the original URL
             self.get_base_domain(url)
 
             # First step: Get all sublinks
@@ -614,63 +614,59 @@ class DownloadManager:
             async def process_sublink(sublink, index):
                 async with sem:
                     try:
-                        progress = index/total_links
+                        progress = (index) / total_links
                         progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                         progress_bar.progress(progress)
 
                         async with async_timeout.timeout(timeout):
-                            # First check if sublink itself leads to a file
+                            # Get the final URL and headers for this sublink
                             real_url, headers = await self.get_real_url(sublink)
                             content_type = headers.get('content-type', '').lower()
-
-                            # If sublink is a file
-                            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
+
+                            # If the sublink itself is a downloadable file, return it
+                            if any(x in content_type for x in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
                                 return [{
                                     'url': real_url,
                                     'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                                     'size': await self.get_file_size(real_url),
                                     'metadata': {}
                                 }]
-
-                            # If sublink is a page, check for download links
+
+                            # Otherwise, treat it as a webpage and search for file links
                             await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
                             content = await self.page.content()
                             soup = BeautifulSoup(content, 'html.parser')
-
-                            # Find potential download links
-                            links = []
+
+                            # Define default and custom file extensions
+                            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
+                                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
+                            custom_exts = [ext.strip().lower() for ext in custom_ext_list if ext.strip()]
+                            file_exts = set(default_exts + custom_exts)
+
+                            sublink_files = []
+                            # Iterate over all anchor tags found on the page
                             for a in soup.find_all('a', href=True):
                                 href = a['href'].strip()
-                                if 'download' in href.lower() or 'visit.php' in href.lower():
-                                    links.append(href)
-
-                            # Process each potential download link
-                            sublink_files = []
-                            for href in links:
-                                try:
-                                    if not href.startswith('http'):
-                                        parsed_base = urlparse(real_url)
-                                        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-                                        href = base_url + ('/' if not href.startswith('/') else '') + href
-
-                                    final_url, _ = await self.get_real_url(href)
-                                    # Add file if it's a valid download
-                                    if any(final_url.lower().endswith(ext) for ext in custom_ext_list) or \
-                                       any(ext in await self.page.evaluate('() => document.contentType') for ext in ['pdf', 'zip']):
-                                        sublink_files.append({
-                                            'url': final_url,
-                                            'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
-                                            'size': await self.get_file_size(final_url),
-                                            'metadata': {}
-                                        })
-                                except Exception as e:
-                                    logger.error(f"Error processing download link {href}: {e}")
+                                if not href:
                                     continue
-
+                                # Convert any relative URL to an absolute URL
+                                full_url = urljoin(real_url, href)
+                                if any(full_url.lower().endswith(ext) for ext in file_exts):
+                                    final_url, _ = await self.get_real_url(full_url)
+                                    file_info = {
+                                        'url': final_url,
+                                        'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                                        'size': await self.get_file_size(final_url),
+                                        'metadata': {}
+                                    }
+                                    if final_url.lower().endswith('.pdf'):
+                                        file_info['metadata'] = await self.get_pdf_metadata(final_url)
+                                    sublink_files.append(file_info)
+
                             if sublink_files:
                                 logger.info(f"Found {len(sublink_files)} files at {real_url}")
                                 st.write(f"Found {len(sublink_files)} files at {real_url}")
-
+
                             return sublink_files
 
                     except asyncio.TimeoutError:
@@ -681,7 +677,7 @@ class DownloadManager:
                         return []
 
             # Process all sublinks concurrently
-            tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
+            tasks = [process_sublink(sublink, i + 1) for i, sublink in enumerate(sublinks)]
             sub_results = await asyncio.gather(*tasks)
 
             # Combine all results
@@ -689,7 +685,7 @@ class DownloadManager:
                 all_files.extend(sub_files)
                 file_count_text.text(f"Found {len(all_files)} total files")
 
-            # Make results unique based on URLs
+            # Remove duplicates based on URL
             seen_urls = set()
             unique_files = []
             for f in all_files:
@@ -698,11 +694,11 @@ class DownloadManager:
                     unique_files.append(f)
 
             final_count = len(unique_files)
-            progress_text.text(f"Deep search complete!")
+            progress_text.text("Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
             progress_bar.progress(1.0)
 
-            # Sort files by name for consistency
+            # Sort files by filename for consistency
             unique_files.sort(key=lambda x: x['filename'].lower())
 
             return unique_files
@@ -719,6 +715,7 @@ class DownloadManager:
             file_count_text.empty()
         except:
             pass
+
 def main():
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
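
For reference, here is a minimal, self-contained sketch of the link-collection approach this commit switches to: each href is resolved against the page URL with urljoin and then matched against an extension whitelist, instead of the old 'download'/'visit.php' heuristic. The helper name collect_file_links and the sample HTML below are illustrative only and do not appear in app.py; the committed deep_search additionally resolves redirects with get_real_url, fetches sizes with get_file_size, and reads PDF metadata with get_pdf_metadata, and it assumes urljoin is already imported from urllib.parse at the top of app.py.

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_file_links(page_url, html, custom_ext_list=None):
    # Same whitelist idea as the new deep_search: built-in extensions plus any user-supplied ones.
    default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
                    '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
    custom_exts = [ext.strip().lower() for ext in (custom_ext_list or []) if ext.strip()]
    file_exts = set(default_exts + custom_exts)

    soup = BeautifulSoup(html, 'html.parser')
    found = []
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        if not href:
            continue
        # Relative links such as "files/report.pdf" become absolute against page_url.
        full_url = urljoin(page_url, href)
        if any(full_url.lower().endswith(ext) for ext in file_exts):
            found.append(full_url)
    return found

# Example run on hypothetical page content:
html = '<a href="files/report.pdf">Report</a><a href="/media/song.mp3">Song</a><a href="about.html">About</a>'
print(collect_file_links("https://example.com/docs/", html, custom_ext_list=['.csv']))
# ['https://example.com/docs/files/report.pdf', 'https://example.com/media/song.mp3']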