euler314 committed on
Commit 96b5b31 · verified · 1 Parent(s): 7e696de

Update app.py

Files changed (1)
  1. app.py +104 -42
app.py CHANGED
@@ -25,6 +25,7 @@ from spacy.language import Language
 import google_auth_oauthlib.flow
 import googleapiclient.discovery
 import google.auth.transport.requests
+from async_timeout import timeout as async_timeout
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',
@@ -447,7 +448,60 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
 
-    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
+    async def search_bing(self):
+        if not self.query:
+            return [], []
+
+        search_query = self.query
+        if "filetype:pdf" not in search_query.lower():
+            search_query += " filetype:pdf"
+
+        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
+
+        try:
+            await self.page.goto(search_url, timeout=30000)
+            await self.page.wait_for_selector('li.b_algo', timeout=30000)
+
+            results = []
+            elements = await self.page.query_selector_all('li.b_algo')
+
+            for element in elements:
+                link = await element.query_selector('h2 a')
+                if link:
+                    url = await link.get_attribute('href')
+                    if url:
+                        results.append(url)
+
+            return results[:self.num_results]
+
+        except Exception as e:
+            logger.error(f"Bing search error: {e}")
+            return []
+
+    async def get_sublinks(self, url, limit=100):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
+
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
         if not custom_ext_list:
             custom_ext_list = []
 
@@ -464,7 +518,7 @@ class DownloadManager:
 
             # Get and search sublinks
             progress_text.text("Getting sublinks...")
-            sublinks = await self.get_sublinks(url, sublink_limit)
+            sublinks = await self.get_sublinks(url, limit=sublink_limit)
             total_links = len(sublinks)
 
             progress_text.text(f"Found {total_links} sublinks to process")
@@ -474,20 +528,39 @@ class DownloadManager:
                 return main_files
 
             # Process sublinks
-            all_files = main_files
+            all_files = main_files.copy()
 
-            for i, sublink in enumerate(sublinks, 1):
-                progress = i/total_links
-                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
-                progress_bar.progress(progress)
-
-                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+            # Create semaphore for concurrent processing
+            sem = asyncio.Semaphore(10)
+
+            async def process_sublink(sublink, index):
+                async with sem:
+                    try:
+                        progress = index/total_links
+                        progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
+                        progress_bar.progress(progress)
+
+                        # Set timeout for this sublink
+                        async with async_timeout.timeout(timeout):
+                            sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                            return sub_files
+                    except asyncio.TimeoutError:
+                        logger.warning(f"Timeout processing sublink: {sublink}")
+                        return []
+                    except Exception as e:
+                        logger.error(f"Error processing sublink {sublink}: {e}")
+                        return []
+
+            # Process sublinks with concurrent tasks
+            tasks = [process_sublink(sublink, i+1) for i, sublink in enumerate(sublinks)]
+            sub_results = await asyncio.gather(*tasks)
+
+            # Combine all results
+            for sub_files in sub_results:
                 all_files.extend(sub_files)
-
-                # Update count in real-time
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Make results unique
+
+            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []
 
@@ -501,41 +574,25 @@ class DownloadManager:
             file_count_text.text(f"Found {final_count} unique files")
             progress_bar.progress(1.0)
 
+            # Sort files by name for consistency
+            unique_files.sort(key=lambda x: x['filename'].lower())
+
             return unique_files
 
         except Exception as e:
             logger.error(f"Deep search error: {e}")
             progress_text.text(f"Error during deep search: {str(e)}")
             return []
+
         finally:
             # Clean up progress indicators after a delay
            await asyncio.sleep(2)
-            if not st.session_state.get('keep_progress', False):
+            try:
                 progress_text.empty()
                 progress_bar.empty()
-
-    async def get_sublinks(self, url, limit=100):
-        try:
-            await self.page.goto(url, timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
-            links = set()
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href.startswith('http'):
-                    links.add(href)
-                elif href.startswith('/'):
-                    links.add(f"{base_url}{href}")
-
-            return list(links)[:limit]
-
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
+                file_count_text.empty()
+            except:
+                pass
 def main():
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
@@ -547,11 +604,7 @@ def main():
     st.title("Advanced File Downloader")
 
     # Sidebar settings
-    with st.sidebar:
-        st.header("Settings")
-        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
-
-        with st.expander("Advanced Options"):
+    with st.expander("Advanced Options"):
             custom_extensions = st.text_input(
                 "Custom File Extensions",
                 placeholder=".csv, .txt, .epub"
@@ -561,8 +614,17 @@
                 min_value=1,
                 max_value=10000,
                 value=100,
+                step=50,
                 help="Maximum number of sublinks to process from the main page"
             )
+            sublink_timeout = st.number_input(
+                "Search Timeout (seconds per sublink)",
+                min_value=1,
+                max_value=3000,
+                value=30,
+                step=5,
+                help="Maximum time to spend searching each sublink"
+            )
             use_proxy = st.checkbox("Use Proxy")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
 
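
The rewritten deep_search crawls sublinks concurrently instead of one at a time: asyncio.Semaphore(10) bounds how many sublinks are processed at once, each sublink gets its own time budget via the async_timeout package, and asyncio.gather collects the per-link results before deduplication. The following is a minimal, self-contained sketch of that pattern; process_all and the dummy extract_downloadable_files are hypothetical stand-ins for illustration, not the app's actual methods, and the sketch calls the aliased async-timeout context manager directly.

import asyncio
from async_timeout import timeout as async_timeout

async def extract_downloadable_files(sublink):
    # Dummy stand-in for the real extractor: simulate I/O and return one hit.
    await asyncio.sleep(0.1)
    return [f"{sublink}/file.pdf"]

async def process_all(sublinks, per_link_timeout=30, max_concurrency=10):
    # Bound concurrency the same way deep_search does with asyncio.Semaphore(10).
    sem = asyncio.Semaphore(max_concurrency)

    async def process_sublink(sublink):
        async with sem:
            try:
                # Per-sublink time budget; the aliased import is the
                # async-timeout context manager itself, so it is called directly.
                async with async_timeout(per_link_timeout):
                    return await extract_downloadable_files(sublink)
            except asyncio.TimeoutError:
                return []  # a slow sublink is skipped, not fatal

    # Run every sublink as its own task and flatten the per-link results.
    results = await asyncio.gather(*(process_sublink(s) for s in sublinks))
    return [f for per_link in results for f in per_link]

if __name__ == "__main__":
    pages = [f"https://example.com/page{i}" for i in range(25)]
    print(len(asyncio.run(process_all(pages))), "files found")

Raising the semaphore limit increases parallelism at the cost of more simultaneous page loads, while the per-link timeout keeps one slow sublink from stalling the whole crawl, which is what the new "Search Timeout (seconds per sublink)" option added in this commit controls.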