Update app.py
app.py CHANGED
@@ -8,10 +8,10 @@ import sys
 
 def install_playwright_dependencies():
     try:
-        #
+        # Update package list
         os.system('apt-get update -y')
 
-        # Install required dependencies
+        # Install required dependencies including GTK
         dependencies = [
             'libnss3',
             'libnspr4',
@@ -20,7 +20,9 @@ def install_playwright_dependencies():
             'libcups2',
             'libxcomposite1',
             'libxdamage1',
-            'libatspi2.0-0'
+            'libatspi2.0-0',
+            'libgtk-3-0',  # Add GTK dependencies
+            'libgdk-3-0'
         ]
 
         dependency_command = f"apt-get install -y {' '.join(dependencies)}"
@@ -281,7 +283,7 @@ class DownloadManager:
         self.browser = None
         self.context = None
         self.page = None
-
+
     async def __aenter__(self):
         self.playwright = await async_playwright().start()
         opts = {"headless": True}
@@ -337,25 +339,12 @@ class DownloadManager:
         try:
             async with self.context.new_page() as page:
                 response = await page.goto(url, wait_until='networkidle', timeout=30000)
-
-                # Check if the response is a redirect
                 if response and response.headers.get('location'):
                     return response.headers['location']
-
-                # Check if response is a file
                 content_type = response.headers.get('content-type', '')
                 if 'text/html' not in content_type.lower():
                     return url
-
-                # Look for meta refresh
                 content = await page.content()
-                soup = BeautifulSoup(content, 'html.parser')
-                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
-                if meta_refresh:
-                    content = meta_refresh.get('content', '')
-                    if 'url=' in content.lower():
-                        return content.split('url=')[-1].strip()
-
                 return page.url
         except Exception as e:
             logger.error(f"Error extracting real download URL: {e}")
@@ -364,82 +353,53 @@ class DownloadManager:
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
-            # First try to load the page
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []
 
             final_url = self.page.url
-
-            # Handle redirects and download scripts
-            if '.php' in final_url or 'download' in final_url or 'get' in final_url:
+            if '.php' in final_url or 'download' in final_url:
                 real_url = await self.extract_real_download_url(final_url)
                 if real_url != final_url:
-                    })
-                    return found_files
+                    found_files.append({
+                        'url': real_url,
+                        'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                        'size': await self.get_file_size(real_url),
+                        'metadata': {}
+                    })
+                    return found_files
 
             await self.page.wait_for_load_state('networkidle', timeout=30000)
-            await human_like_interactions(self.page)
-
             content = await self.page.content()
             soup = BeautifulSoup(content, 'html.parser')
 
-                            '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.xls', '.ppt', '.pptx', '.txt']
+            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
+                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
 
-            # Parse base URL for relative links
             parsed_base = urlparse(final_url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
-            # Find all links
             for a in soup.find_all('a', href=True):
                 href = a['href'].strip()
-
-                # Skip empty or javascript links
-                if not href or href.startswith('javascript:') or href == '#':
-                    continue
-
-                # Handle special cases (PHP scripts, download handlers)
-                if '.php' in href.lower() or 'download' in href.lower() or 'get' in href.lower():
-                    full_url = href if href.startswith('http') else urljoin(base_url, href)
-                    real_url = await self.extract_real_download_url(full_url)
-                    if real_url and real_url != full_url:
-                        size_str = await self.get_file_size(real_url)
-                        found_files.append({
-                            'url': real_url,
-                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
-                            'size': size_str,
-                            'metadata': {}
-                        })
-                    continue
-
-                # Handle direct file links
                 if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else
+                    file_url = href if href.startswith('http') else (
+                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
+                    )
+
                     size_str = await self.get_file_size(file_url)
                     meta = {}
-
                     if file_url.lower().endswith('.pdf'):
                         meta = await self.get_pdf_metadata(file_url)
-
+
                     found_files.append({
                         'url': file_url,
-                        'filename': os.path.basename(
+                        'filename': os.path.basename(file_url.split('?')[0]),
                         'size': size_str,
                         'metadata': meta
                     })
-
-
-                elif any(x in href for x in ['drive.google.com', 'docs.google.com']):
+
+                elif ("drive.google.com" in href) or ("docs.google.com" in href):
                     file_id = None
                     for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
                         match = re.search(pattern, href)
@@ -449,35 +409,25 @@ class DownloadManager:
 
                 if file_id:
                     direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-                            logger.error(f"Error processing Google Drive link: {e}")
+                    filename = file_id
+                    try:
+                        response = await self.page.request.head(direct_url, timeout=15000)
+                        cd = response.headers.get("Content-Disposition", "")
+                        if cd:
+                            mt = re.search(r'filename\*?="?([^";]+)', cd)
+                            if mt:
+                                filename = mt.group(1).strip('"').strip()
+
+                        found_files.append({
+                            'url': direct_url,
+                            'filename': filename,
+                            'size': await self.get_file_size(direct_url),
+                            'metadata': {}
+                        })
+                    except Exception as e:
+                        logger.error(f"Error processing Google Drive link: {e}")
-
-            # Make list unique based on URLs
-            seen_urls = set()
-            unique_files = []
-            for f in found_files:
-                if f['url'] not in seen_urls:
-                    seen_urls.add(f['url'])
-                    unique_files.append(f)
-
-            return unique_files
 
+            return found_files
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
             return []
@@ -486,8 +436,6 @@ class DownloadManager:
         file_url = file_info['url']
         fname = file_info['filename']
         path = os.path.join(save_dir, fname)
-
-        # Handle duplicate filenames
         base, ext = os.path.splitext(fname)
         counter = 1
         while os.path.exists(path):
@@ -497,8 +445,7 @@
         os.makedirs(save_dir, exist_ok=True)
 
         try:
-
-            if 'drive.google.com' in file_url:
+            if "drive.google.com" in file_url:
                 import gdown
                 try:
                     st.write(f"Downloading from Google Drive: {fname}")
@@ -510,7 +457,6 @@
                     logger.error(f"Google Drive download error: {e}")
                     return None
 
-            # Handle normal downloads
             async with self.context.new_page() as page:
                 st.write(f"Downloading: {fname}")
 
@@ -637,7 +583,6 @@
             logger.error(f"Deep search error: {e}")
             return []
 
-# ---------- Main Streamlit UI Implementation -------------
 def main():
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
@@ -647,7 +592,6 @@ def main():
 
     st.title("Advanced File Downloader")
 
-    # Sidebar for settings
     with st.sidebar:
         st.header("Settings")
         mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
@@ -657,123 +601,63 @@ def main():
             "Custom File Extensions",
             placeholder=".csv, .txt, .epub"
         )
-        max_concurrency = st.slider(
-            "Max Concurrency",
-            min_value=1,
-            max_value=1000,
-            value=200
-        )
         use_proxy = st.checkbox("Use Proxy")
         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
 
-        # Google OAuth Section
-        with st.expander("Google Drive Integration"):
-            if st.button("Start Google Sign-In"):
-                auth_url = get_google_auth_url()
-                st.markdown(f"[Click here to authorize]({auth_url})")
-
-            auth_code = st.text_input("Enter authorization code")
-            if st.button("Complete Sign-In") and auth_code:
-                creds, msg = exchange_code_for_credentials(auth_code)
-                st.session_state.google_creds = creds
-                st.write(msg)
-
-    # Main content area
     if mode == "Manual URL":
         st.header("Manual URL Mode")
         url = st.text_input("Enter URL", placeholder="https://example.com")
 
-                        custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                        max_concurrency=max_concurrency
-                    )
-                    st.session_state.discovered_files = files
-                    st.session_state.current_url = url
-                    return files
+        if st.button("Deep Search", use_container_width=True):
+            if url:
+                async def run_deep_search():
+                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                        with st.spinner("Searching for files..."):
+                            files = await dm.deep_search(
+                                url=url,
+                                custom_ext_list=custom_extensions.split(',') if custom_extensions else []
+                            )
+                            st.session_state.discovered_files = files
+                            st.session_state.current_url = url
+                            return files
 
-            else:
-                st.warning("No files found.")
-
-        with col2:
-            if st.button("Preview Page", use_container_width=True):
-                if url:
-                    async def preview():
-                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                            with st.spinner("Loading preview..."):
-                                return await dm.preview_page(url)
+                files = asyncio.run(run_deep_search())
+                if files:
+                    st.success(f"Found {len(files)} files!")
 
-        # File selection and download section
-        if st.session_state.discovered_files:
-            with st.expander("Download Options", expanded=True):
-                file_options = [f"{f['filename']} ({f['size']})" for f in st.session_state.discovered_files]
-                selected_indices = st.multiselect(
-                    "Select files to download",
-                    range(len(file_options)),
-                    format_func=lambda x: file_options[x]
-                )
-
-                if selected_indices:
-                    download_dir = st.text_input("Download Directory", value="./downloads")
-                    delete_after = st.checkbox("Delete after creating ZIP?")
-                    upload_drive = st.checkbox("Upload to Google Drive?")
+                    # Display files
+                    for file in files:
+                        st.write(f"- {file['filename']} ({file['size']})")
 
+                    # Download section
+                    selected_files = st.multiselect(
+                        "Select files to download",
+                        range(len(files)),
+                        format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
+                    )
+
+                    if selected_files:
+                        download_dir = st.text_input("Download Directory", value="./downloads")
+                        if st.button("Download Selected"):
+                            async def download_files():
+                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                                    paths = []
+                                    for idx in selected_files:
+                                        with st.spinner(f"Downloading {files[idx]['filename']}..."):
+                                            path = await dm.download_file(
+                                                files[idx],
+                                                download_dir,
+                                                url
+                                            )
+                                            if path:
+                                                paths.append(path)
+                                    return paths
 
-                                zf.write(p, arcname=os.path.basename(p))
-
-                        if upload_drive and st.session_state.google_creds:
-                            file_id = google_drive_upload(tmp.name, st.session_state.google_creds)
-                            if file_id and not isinstance(file_id, str):
-                                st.success(f"Uploaded to Google Drive! File ID: {file_id}")
-                            else:
-                                st.error("Failed to upload to Google Drive")
-
-                        if delete_after:
-                            for p in downloaded_paths:
-                                try:
-                                    os.remove(p)
-                                except:
-                                    pass
+                            downloaded = asyncio.run(download_files())
+                            if downloaded:
+                                st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
+                else:
+                    st.warning("No files found.")
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
@@ -790,35 +674,52 @@ def main():
                 num_results=num_results
             ) as dm:
                 with st.spinner("Searching..."):
-                    if urls:
-                        st.success(f"Found {len(urls)} results!")
-                        for i, (url, info) in enumerate(zip(urls, info), 1):
-                            with st.expander(f"Result {i}: {url}", expanded=i==1):
-                                st.write(f"Snippet: {info['snippet']}")
-                                if info['entities']:
-                                    st.write("Entities:", ', '.join(f"{e[0]} ({e[1]})" for e in info['entities']))
-
-                                if st.button(f"Deep Search This Result {i}"):
-                                    st.session_state.current_url = url
-                                    async def search_result():
-                                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                                            return await dm.deep_search(
-                                                url=url,
-                                                custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                                                max_concurrency=max_concurrency
-                                            )
-
-                                    files = asyncio.run(search_result())
-                                    if files:
-                                        st.session_state.discovered_files = files
-                                        st.success(f"Found {len(files)} files!")
-                                    else:
-                                        st.warning("No files found.")
-                    else:
-                        st.warning("No results found.")
+                    urls = await dm.search_bing()
+                    if urls:
+                        st.success(f"Found {len(urls)} results!")
+                        for i, url in enumerate(urls, 1):
+                            with st.expander(f"Result {i}: {url}", expanded=i==1):
+                                if st.button(f"Deep Search This Result {i}"):
+                                    files = await dm.deep_search(
+                                        url=url,
+                                        custom_ext_list=custom_extensions.split(',') if custom_extensions else []
+                                    )
+                                    if files:
+                                        st.session_state.discovered_files = files
+                                        st.session_state.current_url = url
+                                        st.success(f"Found {len(files)} files!")
+
+                                        # Display and download section
+                                        for file in files:
+                                            st.write(f"- {file['filename']} ({file['size']})")
+
+                                        selected_files = st.multiselect(
+                                            "Select files to download",
+                                            range(len(files)),
+                                            format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
+                                        )
+
+                                        if selected_files:
+                                            download_dir = st.text_input("Download Directory", value="./downloads")
+                                            if st.button("Download Selected Files"):
+                                                paths = []
+                                                for idx in selected_files:
+                                                    with st.spinner(f"Downloading {files[idx]['filename']}..."):
+                                                        path = await dm.download_file(
+                                                            files[idx],
+                                                            download_dir,
+                                                            url
+                                                        )
+                                                        if path:
+                                                            paths.append(path)
+                                                if paths:
+                                                    st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
+                                    else:
+                                        st.warning("No files found on this page.")
+                    else:
+                        st.warning("No search results found.")
+
+        asyncio.run(run_search())
 
     else:  # PDF Summarizer mode
         st.header("PDF Summarizer")
@@ -826,9 +727,14 @@
 
     if st.button("Summarize"):
         if pdf_url:
+            with st.spinner("Generating summary..."):
+                summary = summarize_pdf_url(pdf_url)
+                st.write("Summary:")
+                st.write(summary)
 
 if __name__ == "__main__":
-    main()
+    try:
+        main()
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+        logger.error(f"Application error: {str(e)}", exc_info=True)