Update app.py
app.py
CHANGED
@@ -25,6 +25,7 @@ from spacy.language import Language
 import google_auth_oauthlib.flow
 import googleapiclient.discovery
 import google.auth.transport.requests
+from async_timeout import timeout as async_timeout
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',
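The new import aliases the `timeout` context manager from the `async_timeout` package as `async_timeout`. A minimal sketch of how that alias is entered (the helper name and numbers below are illustrative, not part of the commit). Note that with this alias the manager is called as `async_timeout(seconds)`; the `async_timeout.timeout(...)` spelling used in a later hunk would only resolve if the module itself were imported.

```python
import asyncio
from async_timeout import timeout as async_timeout

async def run_with_deadline(coro_factory, seconds: float):
    # Entered directly as async_timeout(seconds); asyncio.TimeoutError
    # is raised if the body runs longer than the deadline.
    try:
        async with async_timeout(seconds):
            return await coro_factory()
    except asyncio.TimeoutError:
        return None

# Example: asyncio.run(run_with_deadline(lambda: asyncio.sleep(1), 0.5)) -> None
```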
@@ -447,7 +448,60 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None

-    async def
+    async def search_bing(self):
+        if not self.query:
+            return [], []
+
+        search_query = self.query
+        if "filetype:pdf" not in search_query.lower():
+            search_query += " filetype:pdf"
+
+        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
+
+        try:
+            await self.page.goto(search_url, timeout=30000)
+            await self.page.wait_for_selector('li.b_algo', timeout=30000)
+
+            results = []
+            elements = await self.page.query_selector_all('li.b_algo')
+
+            for element in elements:
+                link = await element.query_selector('h2 a')
+                if link:
+                    url = await link.get_attribute('href')
+                    if url:
+                        results.append(url)
+
+            return results[:self.num_results]
+
+        except Exception as e:
+            logger.error(f"Bing search error: {e}")
+            return []
+
+    async def get_sublinks(self, url, limit=100):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
+
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []

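The new `get_sublinks` helper keeps only hrefs that start with `http` or `/`. For comparison, a standalone sketch of the same extraction using `urllib.parse.urljoin`, which also resolves relative paths such as `docs/file.pdf`; the function name and the filtering choices here are illustrative, not from the commit.

```python
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def extract_links(html: str, page_url: str, limit: int = 100) -> list[str]:
    """Collect absolute links from rendered HTML (sketch of the get_sublinks idea)."""
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith(("javascript:", "mailto:", "#")):
            continue  # skip non-navigational links
        absolute = urljoin(page_url, href)  # resolves /path and relative paths alike
        if urlparse(absolute).scheme in ("http", "https"):
            links.add(absolute)
    return sorted(links)[:limit]
```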
@@ -464,7 +518,7 @@ class DownloadManager:

             # Get and search sublinks
             progress_text.text("Getting sublinks...")
-            sublinks = await self.get_sublinks(url, sublink_limit)
+            sublinks = await self.get_sublinks(url, limit=sublink_limit)
             total_links = len(sublinks)

             progress_text.text(f"Found {total_links} sublinks to process")
@@ -474,20 +528,39 @@ class DownloadManager:
                 return main_files

             # Process sublinks
-            all_files = main_files
+            all_files = main_files.copy()

-
-
-
-
-
-
+            # Create semaphore for concurrent processing
+            sem = asyncio.Semaphore(10)
+
+            async def process_sublink(sublink, index):
+                async with sem:
+                    try:
+                        progress = index/total_links
+                        progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
+                        progress_bar.progress(progress)
+
+                        # Set timeout for this sublink
+                        async with async_timeout.timeout(timeout):
+                            sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                            return sub_files
+                    except asyncio.TimeoutError:
+                        logger.warning(f"Timeout processing sublink: {sublink}")
+                        return []
+                    except Exception as e:
+                        logger.error(f"Error processing sublink {sublink}: {e}")
+                        return []
+
+            # Process sublinks with concurrent tasks
+            tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
+            sub_results = await asyncio.gather(*tasks)
+
+            # Combine all results
+            for sub_files in sub_results:
                 all_files.extend(sub_files)
-
-            # Update count in real-time
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Make results unique
+
+            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []

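This hunk is the core of the change: the sequential sublink loop becomes concurrent, with a `Semaphore(10)` bounding parallelism, a per-sublink timeout, and `asyncio.gather` collecting the results. A self-contained sketch of the same pattern with a generic worker in place of Playwright; all names are illustrative, and `asyncio.wait_for` stands in for the `async_timeout` context manager used in the diff.

```python
import asyncio

async def process_all(items, worker, max_concurrency=10, per_item_timeout=30):
    """Run worker(item) over all items, at most max_concurrency at a time.

    Mirrors the deep_search pattern: a semaphore bounds concurrency, each
    item has its own deadline, and failures degrade to an empty result.
    """
    sem = asyncio.Semaphore(max_concurrency)

    async def run_one(item):
        async with sem:
            try:
                return await asyncio.wait_for(worker(item), per_item_timeout)
            except asyncio.TimeoutError:
                return []
            except Exception:
                return []

    results = await asyncio.gather(*(run_one(i) for i in items))
    # Flatten the per-item lists, as the hunk does with all_files.extend(...)
    return [x for sub in results for x in sub]
```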
@@ -501,41 +574,25 @@ class DownloadManager:
             file_count_text.text(f"Found {final_count} unique files")
             progress_bar.progress(1.0)

+            # Sort files by name for consistency
+            unique_files.sort(key=lambda x: x['filename'].lower())
+
             return unique_files

         except Exception as e:
             logger.error(f"Deep search error: {e}")
             progress_text.text(f"Error during deep search: {str(e)}")
             return []
+
         finally:
             # Clean up progress indicators after a delay
             await asyncio.sleep(2)
-
+            try:
                 progress_text.empty()
                 progress_bar.empty()
-
-
-
-            await self.page.goto(url, timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
-            links = set()
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href.startswith('http'):
-                    links.add(href)
-                elif href.startswith('/'):
-                    links.add(f"{base_url}{href}")
-
-            return list(links)[:limit]
-
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
+                file_count_text.empty()
+            except:
+                pass
 def main():
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
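Besides moving the old sublink-extraction body (now `get_sublinks` above), this hunk adds a stable sort on filename after the URL de-duplication and guards the progress-widget cleanup. A compact sketch of that post-processing step in isolation; the `'filename'` key is taken from the hunk, while the `'url'` key is inferred from the surrounding code.

```python
def dedupe_and_sort(files: list[dict]) -> list[dict]:
    """Drop duplicate results by URL, then sort by filename for consistency."""
    seen_urls = set()
    unique_files = []
    for f in files:
        if f["url"] not in seen_urls:   # 'url' key assumed from context
            seen_urls.add(f["url"])
            unique_files.append(f)
    unique_files.sort(key=lambda x: x["filename"].lower())
    return unique_files
```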
@@ -547,11 +604,7 @@ def main():
     st.title("Advanced File Downloader")

     # Sidebar settings
-    with st.
-        st.header("Settings")
-        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
-
-        with st.expander("Advanced Options"):
+    with st.expander("Advanced Options"):
         custom_extensions = st.text_input(
             "Custom File Extensions",
             placeholder=".csv, .txt, .epub"
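The Advanced Options expander collects custom extensions as a single comma-separated string (`.csv, .txt, .epub`), while `deep_search` expects a `custom_ext_list`. The conversion is not shown in this diff; a small helper along these lines (hypothetical, not in the commit) would bridge the two.

```python
def parse_extensions(raw: str) -> list[str]:
    """Turn '.csv, .txt, .epub' into a clean extension list for deep_search()."""
    exts = []
    for part in raw.split(","):
        part = part.strip().lower()
        if not part:
            continue
        if not part.startswith("."):
            part = "." + part   # normalize 'csv' -> '.csv'
        exts.append(part)
    return exts

# parse_extensions(".csv, .txt, .epub") -> ['.csv', '.txt', '.epub']
```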
@@ -561,8 +614,17 @@ def main():
             min_value=1,
             max_value=10000,
             value=100,
+            step=50,
             help="Maximum number of sublinks to process from the main page"
         )
+        sublink_timeout = st.number_input(
+            "Search Timeout (seconds per sublink)",
+            min_value=1,
+            max_value=3000,
+            value=30,
+            step=5,
+            help="Maximum time to spend searching each sublink"
+        )
         use_proxy = st.checkbox("Use Proxy")
         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")

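The new `sublink_timeout` widget presumably feeds the `timeout` parameter added to `deep_search` above, though that call site is outside this diff. A runnable Streamlit sketch of how the widgets could be wired to an async search; the labels, the stub coroutine, and the button flow are all illustrative assumptions rather than the app's actual code.

```python
import asyncio
import streamlit as st

async def fake_deep_search(url: str, sublink_limit: int, timeout: int) -> list[str]:
    # Stand-in for DownloadManager.deep_search(); only demonstrates the wiring.
    await asyncio.sleep(0.1)
    return [f"{url}/file_{i}.pdf" for i in range(3)]

url = st.text_input("URL", placeholder="https://example.com")
with st.expander("Advanced Options"):
    sublink_limit = st.number_input("Maximum Sublinks",  # label assumed; the diff shows only the numeric bounds
                                    min_value=1, max_value=10000, value=100, step=50)
    sublink_timeout = st.number_input("Search Timeout (seconds per sublink)",
                                      min_value=1, max_value=3000, value=30, step=5)

if st.button("Deep Search") and url:
    # Streamlit scripts are synchronous, so the async search is driven with asyncio.run().
    files = asyncio.run(fake_deep_search(url, int(sublink_limit), int(sublink_timeout)))
    st.write(f"Found {len(files)} files", files)
```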