Update app.py
app.py
CHANGED
@@ -582,7 +582,7 @@ class DownloadManager:
             logger.error(f"Error getting sublinks: {e}")
             return []

-
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []

@@ -591,7 +591,7 @@ class DownloadManager:
         file_count_text = st.empty()

         try:
-
+            # Initialize base domains with the original URL
             self.get_base_domain(url)

             # First step: Get all sublinks
@@ -614,63 +614,59 @@ class DownloadManager:
            async def process_sublink(sublink, index):
                async with sem:
                    try:
-                        progress = index/total_links
+                        progress = (index) / total_links
                        progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                        progress_bar.progress(progress)

                        async with async_timeout.timeout(timeout):
-                            #
+                            # Get the final URL and headers for this sublink
                            real_url, headers = await self.get_real_url(sublink)
                            content_type = headers.get('content-type', '').lower()
-
-                            # If sublink is a file
-                            if any(
+
+                            # If the sublink itself is a downloadable file, return it
+                            if any(x in content_type for x in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
                                return [{
                                    'url': real_url,
                                    'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                                    'size': await self.get_file_size(real_url),
                                    'metadata': {}
                                }]
-
-
+
+                            # Otherwise, treat it as a webpage and search for file links
                            await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
                            content = await self.page.content()
                            soup = BeautifulSoup(content, 'html.parser')
-
-
-
+
+                            # Define default and custom file extensions
+                            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
+                                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
+                            custom_exts = [ext.strip().lower() for ext in custom_ext_list if ext.strip()]
+                            file_exts = set(default_exts + custom_exts)
+
+                            sublink_files = []
+                            # Iterate over all anchor tags found on the page
                            for a in soup.find_all('a', href=True):
                                href = a['href'].strip()
-                                if
-                                    links.append(href)
-
-                            # Process each potential download link
-                            sublink_files = []
-                            for href in links:
-                                try:
-                                    if not href.startswith('http'):
-                                        parsed_base = urlparse(real_url)
-                                        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-                                        href = base_url + ('/' if not href.startswith('/') else '') + href
-
-                                    final_url, _ = await self.get_real_url(href)
-                                    # Add file if it's a valid download
-                                    if any(final_url.lower().endswith(ext) for ext in custom_ext_list) or \
-                                       any(ext in await self.page.evaluate('() => document.contentType') for ext in ['pdf', 'zip']):
-                                        sublink_files.append({
-                                            'url': final_url,
-                                            'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
-                                            'size': await self.get_file_size(final_url),
-                                            'metadata': {}
-                                        })
-                                except Exception as e:
-                                    logger.error(f"Error processing download link {href}: {e}")
+                                if not href:
                                    continue
-
+                                # Convert any relative URL to an absolute URL
+                                full_url = urljoin(real_url, href)
+                                if any(full_url.lower().endswith(ext) for ext in file_exts):
+                                    final_url, _ = await self.get_real_url(full_url)
+                                    file_info = {
+                                        'url': final_url,
+                                        'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                                        'size': await self.get_file_size(final_url),
+                                        'metadata': {}
+                                    }
+                                    if final_url.lower().endswith('.pdf'):
+                                        file_info['metadata'] = await self.get_pdf_metadata(final_url)
+                                    sublink_files.append(file_info)
+
                            if sublink_files:
                                logger.info(f"Found {len(sublink_files)} files at {real_url}")
                                st.write(f"Found {len(sublink_files)} files at {real_url}")
-
+
                            return sublink_files

                    except asyncio.TimeoutError:
@@ -681,7 +677,7 @@ class DownloadManager:
                        return []

            # Process all sublinks concurrently
-            tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
+            tasks = [process_sublink(sublink, i + 1) for i, sublink in enumerate(sublinks)]
            sub_results = await asyncio.gather(*tasks)

            # Combine all results
@@ -689,7 +685,7 @@ class DownloadManager:
                all_files.extend(sub_files)
                file_count_text.text(f"Found {len(all_files)} total files")

-
+            # Remove duplicates based on URL
            seen_urls = set()
            unique_files = []
            for f in all_files:
@@ -698,11 +694,11 @@ class DownloadManager:
                    unique_files.append(f)

            final_count = len(unique_files)
-            progress_text.text(
+            progress_text.text("Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)

-            # Sort files by
+            # Sort files by filename for consistency
            unique_files.sort(key=lambda x: x['filename'].lower())

            return unique_files
@@ -719,6 +715,7 @@ class DownloadManager:
                file_count_text.empty()
            except:
                pass
+
def main():
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True
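A note on the link handling introduced above: the old code built absolute URLs by concatenating the scheme and netloc by hand, which mishandles hrefs like "../file.pdf" or query-only links, while the new code delegates to urljoin, which resolves every href form against the page it was found on. The change does assume urljoin is imported from urllib.parse alongside the urlparse already in use. A minimal standalone sketch (the URLs are hypothetical):

from urllib.parse import urljoin

page = "https://example.com/docs/index.html"  # hypothetical page URL

print(urljoin(page, "report.pdf"))      # https://example.com/docs/report.pdf
print(urljoin(page, "/files/a.zip"))    # https://example.com/files/a.zip
print(urljoin(page, "../media/b.mp3"))  # https://example.com/media/b.mp3
print(urljoin(page, "https://cdn.example.com/c.mp4"))  # absolute hrefs pass through unchanged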
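The tasks list plus asyncio.gather launches one coroutine per sublink, while the semaphore acquired at the top of process_sublink (sem, created outside the hunks shown) caps how many run at once. A self-contained sketch of the same bounded-concurrency pattern, with hypothetical names and a sleep standing in for the real per-sublink work:

import asyncio

async def process(item, index, sem):
    # The semaphore bounds concurrency: only N coroutines
    # get past this line at any one time.
    async with sem:
        await asyncio.sleep(0.1)  # stand-in for the real per-sublink work
        return f"processed {index}: {item}"

async def main():
    items = ["a", "b", "c", "d"]
    sem = asyncio.Semaphore(2)  # at most 2 items in flight
    tasks = [process(item, i + 1, sem) for i, item in enumerate(items)]
    results = await asyncio.gather(*tasks)
    print(results)

asyncio.run(main())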
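Finally, a usage sketch: deep_search is a coroutine that drives a Playwright page and Streamlit widgets, so it must run inside an event loop after the manager's browser page is initialized. None of that scaffolding appears in this diff, so the driver below is an assumption, not the app's actual entry point:

import asyncio

async def run_deep_search():
    # Hypothetical driver: DownloadManager construction and page setup are
    # not shown in this diff, so treat this scaffolding as an assumption.
    dm = DownloadManager()
    files = await dm.deep_search(
        "https://example.com/downloads",    # hypothetical starting URL
        custom_ext_list=[".csv", ".xlsx"],  # merged into file_exts with default_exts
        sublink_limit=100,
        timeout=30,
    )
    for f in files:
        print(f["filename"], f["size"], f["url"])

asyncio.run(run_deep_search())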