Update app.py
app.py CHANGED
@@ -585,41 +585,32 @@ class DownloadManager:
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []
-
+
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()
-
+
         try:
-
+            # Initialize base domains with the original URL
             self.get_base_domain(url)
-
-            #
-
-
-            # Search main page
-            progress_text.text("Analyzing main page...")
-            main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
-            initial_count = len(main_files)
-            file_count_text.text(f"Found {initial_count} files on main page")
-
-            # Get and search sublinks
-            progress_text.text("Getting sublinks...")
-            sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
+
+            # First step: Get all sublinks
+            progress_text.text("Getting all sublinks from main page...")
+            sublinks = await self.get_sublinks(url, limit=sublink_limit)
             total_links = len(sublinks)
-
+
             progress_text.text(f"Found {total_links} sublinks to process")
-
-            if not sublinks:
+            if total_links == 0:
                 progress_bar.progress(1.0)
-
-
-
-
-
+                # If no sublinks, try direct file search
+                return await self.extract_downloadable_files(url, custom_ext_list)
+
+            # Process main page and sublinks
+            all_files = []
+
             # Create semaphore for concurrent processing
             sem = asyncio.Semaphore(10)
-
+
             async def process_sublink(sublink, index):
                 async with sem:
                     try:
@@ -627,42 +618,85 @@ class DownloadManager:
                         progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                         progress_bar.progress(progress)

-                        # Set timeout for this sublink
                         async with async_timeout.timeout(timeout):
-                            #
-
-
-
-
-
-
-
+                            # First check if sublink itself leads to a file
+                            real_url, headers = await self.get_real_url(sublink)
+                            content_type = headers.get('content-type', '').lower()
+
+                            # If sublink is a file
+                            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
+                                return [{
+                                    'url': real_url,
+                                    'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                                    'size': await self.get_file_size(real_url),
+                                    'metadata': {}
+                                }]
+
+                            # If sublink is a page, check for download links
+                            await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
+                            content = await self.page.content()
+                            soup = BeautifulSoup(content, 'html.parser')
+
+                            # Find potential download links
+                            links = []
+                            for a in soup.find_all('a', href=True):
+                                href = a['href'].strip()
+                                if 'download' in href.lower() or 'visit.php' in href.lower():
+                                    links.append(href)
+
+                            # Process each potential download link
+                            sublink_files = []
+                            for href in links:
+                                try:
+                                    if not href.startswith('http'):
+                                        parsed_base = urlparse(real_url)
+                                        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+                                        href = base_url + ('/' if not href.startswith('/') else '') + href
+
+                                    final_url, _ = await self.get_real_url(href)
+                                    # Add file if it's a valid download
+                                    if any(final_url.lower().endswith(ext) for ext in custom_ext_list) or \
+                                       any(ext in await self.page.evaluate('() => document.contentType') for ext in ['pdf', 'zip']):
+                                        sublink_files.append({
+                                            'url': final_url,
+                                            'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                                            'size': await self.get_file_size(final_url),
+                                            'metadata': {}
+                                        })
+                                except Exception as e:
+                                    logger.error(f"Error processing download link {href}: {e}")
+                                    continue
+
+                            if sublink_files:
+                                logger.info(f"Found {len(sublink_files)} files at {real_url}")
+                                st.write(f"Found {len(sublink_files)} files at {real_url}")
+
+                            return sublink_files
+
                     except asyncio.TimeoutError:
                         logger.warning(f"Timeout processing sublink: {sublink}")
                         return []
                     except Exception as e:
                         logger.error(f"Error processing sublink {sublink}: {e}")
                         return []
-
-            # Process sublinks
+
+            # Process all sublinks concurrently
             tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
             sub_results = await asyncio.gather(*tasks)
-
+
             # Combine all results
             for sub_files in sub_results:
                 all_files.extend(sub_files)
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-
+
+            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []
-
             for f in all_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
+
             final_count = len(unique_files)
             progress_text.text(f"Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
@@ -670,16 +704,14 @@ class DownloadManager:

             # Sort files by name for consistency
             unique_files.sort(key=lambda x: x['filename'].lower())
-
+
             return unique_files
-
+
         except Exception as e:
             logger.error(f"Deep search error: {e}")
             progress_text.text(f"Error during deep search: {str(e)}")
             return []
-
         finally:
-            # Clean up progress indicators after a delay
             await asyncio.sleep(2)
             try:
                 progress_text.empty()