euler314 committed on
Commit ac436f8 · verified · 1 Parent(s): 9c2fa03

Update app.py

Files changed (1)
  1. app.py +105 -496
app.py CHANGED
@@ -26,6 +26,18 @@ import google_auth_oauthlib.flow
26
  import googleapiclient.discovery
27
  import google.auth.transport.requests
28
  from async_timeout import timeout as async_timeout
29
  # -------------------- Logging Setup --------------------
30
  logging.basicConfig(
31
  filename='advanced_download_log.txt',
@@ -33,7 +45,7 @@ logging.basicConfig(
33
  format='%(asctime)s - %(levelname)s - %(message)s'
34
  )
35
  logger = logging.getLogger(__name__)
36
- # Google OAuth Configuration
37
  GOOGLE_OAUTH_CONFIG = {
38
  "web": {
39
  "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -49,49 +61,22 @@ GOOGLE_OAUTH_CONFIG = {
49
  # Playwright Setup
50
  def install_playwright_dependencies():
51
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
52
- os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
53
- try:
54
- subprocess.run(['apt-get', 'update', '-y'], check=True)
55
- packages = [
56
- 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
57
- 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
58
- 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
59
- ]
60
- subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
61
- os.makedirs('/usr/lib/playwright', exist_ok=True)
62
- symlinks = {
63
- 'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
64
- 'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
65
- 'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
66
- 'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
67
- 'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
68
- 'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
69
- 'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
70
- 'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
71
- 'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
72
- 'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
73
- }
74
- for link_name, target in symlinks.items():
75
- link_path = os.path.join('/usr/lib/playwright', link_name)
76
- if not os.path.exists(link_path):
77
- os.symlink(target, link_path)
78
- subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
79
- browser_path = os.path.expanduser("~/.cache/ms-playwright")
80
- os.makedirs(browser_path, exist_ok=True)
81
- subprocess.run(['chmod', '-R', '755', browser_path], check=True)
82
- except subprocess.CalledProcessError as e:
83
- print(f"Error installing dependencies: {e}")
84
- except Exception as e:
85
- print(f"Error: {e}")
86
 
87
- # Initialize Playwright
88
  install_playwright_dependencies()
89
 
90
  # Model Loading
91
  @st.cache_resource
92
  def load_models():
93
  try:
94
- # Try to load spaCy model
95
  try:
96
  nlp = spacy.load("en_core_web_sm")
97
  except OSError:
@@ -99,43 +84,26 @@ def load_models():
99
  spacy.cli.download("en_core_web_sm")
100
  nlp = spacy.load("en_core_web_sm")
101
 
102
- # Load SentenceTransformer with offline handling
103
  try:
104
- from sentence_transformers import SentenceTransformer
105
- model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
106
- cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
107
- if os.path.exists(os.path.join(cache_dir, model_name)):
108
- semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
109
- else:
110
- st.warning(f"Downloading SentenceTransformer model {model_name}...")
111
- semantic_model = SentenceTransformer(model_name)
112
  except Exception as e:
113
  st.error(f"Error loading SentenceTransformer: {e}")
114
  semantic_model = None
115
 
116
- # Load Transformers pipeline with offline handling
117
  try:
118
- from transformers import pipeline
119
- model_name = "facebook/bart-large-cnn"
120
- cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
121
- if os.path.exists(os.path.join(cache_dir, model_name)):
122
- summarizer = pipeline("summarization", model=model_name)
123
- else:
124
- st.warning(f"Downloading Transformer model {model_name}...")
125
- summarizer = pipeline("summarization")
126
  except Exception as e:
127
  st.error(f"Error loading Transformers: {e}")
128
  summarizer = None
129
 
130
  return nlp, semantic_model, summarizer
131
-
132
  except Exception as e:
133
  st.error(f"Error loading models: {e}")
134
  return None, None, None
135
 
136
- # Initialize models
137
- with st.spinner("Loading models..."):
138
- nlp_model, semantic_model, summarizer = load_models()
139
 
140
  # Utility Functions
141
  def get_random_user_agent():
@@ -157,11 +125,9 @@ def sizeof_fmt(num, suffix='B'):
157
  def create_zip_file(file_paths, output_dir):
158
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
159
  zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
160
-
161
  with zipfile.ZipFile(zip_path, 'w') as zipf:
162
  for file_path in file_paths:
163
  zipf.write(file_path, os.path.basename(file_path))
164
-
165
  return zip_path
166
 
167
  # Google Drive Functions
@@ -197,16 +163,23 @@ def exchange_code_for_credentials(auth_code):
197
  except Exception as e:
198
  return None, f"Error during token exchange: {e}"
199
 
200
- def google_drive_upload(zip_path: str, credentials):
201
  try:
202
  drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
203
- file_metadata = {'name': os.path.basename(zip_path)}
204
- media = googleapiclient.http.MediaFileUpload(zip_path, resumable=True)
 
205
  created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
206
  return created.get("id", "")
207
  except Exception as e:
208
  return f"Error uploading to Drive: {str(e)}"
209
- # DownloadManager Class
 
210
  # DownloadManager Class
211
  class DownloadManager:
212
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
@@ -234,7 +207,6 @@ class DownloadManager:
234
  }
235
  if self.use_proxy and self.proxy:
236
  opts["proxy"] = {"server": self.proxy}
237
-
238
  self.browser = await self.playwright.chromium.launch(**opts)
239
  self.context = await self.browser.new_context(user_agent=get_random_user_agent())
240
  self.page = await self.context.new_page()
@@ -257,14 +229,11 @@ class DownloadManager:
257
  search_url = f"https://www.bing.com/search?q={self.query}"
258
  await self.page.goto(search_url, timeout=30000)
259
  await self.page.wait_for_load_state('networkidle')
260
-
261
- # Extract search result links
262
  links = await self.page.query_selector_all("li.b_algo h2 a")
263
  for link in links[:self.num_results]:
264
  href = await link.get_attribute('href')
265
  if href:
266
  urls.append(href)
267
-
268
  return urls
269
  except Exception as e:
270
  logger.error(f"Error searching Bing: {e}")
@@ -335,7 +304,8 @@ class DownloadManager:
335
  soup = BeautifulSoup(content, 'html.parser')
336
 
337
  default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
338
- '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
 
339
  all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
340
 
341
  parsed_base = urlparse(final_url)
@@ -344,11 +314,8 @@ class DownloadManager:
344
  for a in soup.find_all('a', href=True):
345
  href = a['href'].strip()
346
 
347
- # Handle PHP scripts and redirects
348
  if '.php' in href.lower() or 'download' in href.lower():
349
- full_url = href if href.startswith('http') else (
350
- f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
351
- )
352
  real_url = await self.extract_real_download_url(full_url)
353
  if real_url and real_url != full_url:
354
  found_files.append({
@@ -359,17 +326,12 @@ class DownloadManager:
359
  })
360
  continue
361
 
362
- # Handle direct file links
363
  if any(href.lower().endswith(ext) for ext in all_exts):
364
- file_url = href if href.startswith('http') else (
365
- f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
366
- )
367
-
368
  size_str = await self.get_file_size(file_url)
369
  meta = {}
370
  if file_url.lower().endswith('.pdf'):
371
  meta = await self.get_pdf_metadata(file_url)
372
-
373
  found_files.append({
374
  'url': file_url,
375
  'filename': os.path.basename(file_url.split('?')[0]),
@@ -385,7 +347,6 @@ class DownloadManager:
385
  if match:
386
  file_id = match.group(1)
387
  break
388
-
389
  if file_id:
390
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
391
  filename = file_id
@@ -396,7 +357,6 @@ class DownloadManager:
396
  mt = re.search(r'filename\*?="?([^";]+)', cd)
397
  if mt:
398
  filename = mt.group(1).strip('"').strip()
399
-
400
  found_files.append({
401
  'url': direct_url,
402
  'filename': filename,
@@ -406,14 +366,12 @@ class DownloadManager:
406
  except Exception as e:
407
  logger.error(f"Error processing Google Drive link: {e}")
408
 
409
- # Make results unique based on URLs
410
  seen_urls = set()
411
  unique_files = []
412
  for f in found_files:
413
  if f['url'] not in seen_urls:
414
  seen_urls.add(f['url'])
415
  unique_files.append(f)
416
-
417
  return unique_files
418
  except Exception as e:
419
  logger.error(f"Error extracting files from {url}: {e}")
@@ -428,106 +386,29 @@ class DownloadManager:
428
  while os.path.exists(path):
429
  path = os.path.join(save_dir, f"{base}_{counter}{ext}")
430
  counter += 1
431
-
432
  os.makedirs(save_dir, exist_ok=True)
433
-
434
  try:
435
  if "drive.google.com" in file_url:
436
  import gdown
437
- try:
438
- st.write(f"Downloading from Google Drive: {fname}")
439
-
440
- # Determine file extension or use a default if none available
441
- if not ext or ext == "":
442
- # Try to determine file type from content-type header
443
- async with self.context.new_page() as page:
444
- response = await page.request.head(file_url, timeout=15000)
445
- content_type = response.headers.get('Content-Type', '')
446
-
447
- # Map content types to extensions
448
- extension_map = {
449
- 'application/pdf': '.pdf',
450
- 'image/jpeg': '.jpg',
451
- 'image/png': '.png',
452
- 'application/msword': '.doc',
453
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
454
- 'application/zip': '.zip',
455
- 'text/plain': '.txt',
456
- 'application/vnd.ms-excel': '.xls',
457
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
458
- 'video/mp4': '.mp4',
459
- 'audio/mpeg': '.mp3',
460
- 'video/x-msvideo': '.avi',
461
- 'video/x-matroska': '.mkv'
462
- }
463
-
464
- # Get extension from content type or use .bin as fallback
465
- ext = extension_map.get(content_type.split(';')[0], '.bin')
466
- path = os.path.join(save_dir, f"{base}{ext}")
467
-
468
- # Handle name collisions
469
- counter = 1
470
- while os.path.exists(path):
471
- path = os.path.join(save_dir, f"{base}_{counter}{ext}")
472
- counter += 1
473
-
474
- output = gdown.download(file_url, path, quiet=False)
475
- if output:
476
- return path
477
- return None
478
- except Exception as e:
479
- logger.error(f"Google Drive download error: {e}")
480
- return None
481
-
482
  async with self.context.new_page() as page:
483
- st.write(f"Downloading: {fname}")
484
-
485
  headers = {
486
  'Accept': '*/*',
487
  'Accept-Encoding': 'gzip, deflate, br',
488
  'Referer': referer
489
  }
490
-
491
  response = await page.request.get(file_url, headers=headers, timeout=30000)
492
-
493
  if response.status == 200:
494
  content = await response.body()
495
-
496
- # Check if we need to add an extension based on content type
497
- if not ext or ext == "":
498
- content_type = response.headers.get('Content-Type', '')
499
- extension_map = {
500
- 'application/pdf': '.pdf',
501
- 'image/jpeg': '.jpg',
502
- 'image/png': '.png',
503
- 'application/msword': '.doc',
504
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
505
- 'application/zip': '.zip',
506
- 'text/plain': '.txt',
507
- 'application/vnd.ms-excel': '.xls',
508
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
509
- 'video/mp4': '.mp4',
510
- 'audio/mpeg': '.mp3',
511
- 'video/x-msvideo': '.avi',
512
- 'video/x-matroska': '.mkv'
513
- }
514
-
515
- ext = extension_map.get(content_type.split(';')[0], '.bin')
516
- path = os.path.join(save_dir, f"{base}{ext}")
517
-
518
- # Handle name collisions again
519
- counter = 1
520
- while os.path.exists(path):
521
- path = os.path.join(save_dir, f"{base}_{counter}{ext}")
522
- counter += 1
523
-
524
  with open(path, 'wb') as f:
525
  f.write(content)
526
  return path
527
  else:
528
  logger.error(f"Download failed with status {response.status}: {file_url}")
529
  return None
530
-
531
  except Exception as e:
532
  logger.error(f"Error downloading {file_url}: {e}")
533
  return None
@@ -535,65 +416,45 @@ class DownloadManager:
535
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
536
  if not custom_ext_list:
537
  custom_ext_list = []
538
-
539
  progress_text = st.empty()
540
  progress_bar = st.progress(0)
541
  file_count_text = st.empty()
542
-
543
  try:
544
- # Search main page
545
  progress_text.text("Analyzing main page...")
546
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
547
  initial_count = len(main_files)
548
  file_count_text.text(f"Found {initial_count} files on main page")
549
-
550
- # Get and search sublinks
551
  progress_text.text("Getting sublinks...")
552
  sublinks = await self.get_sublinks(url, sublink_limit)
553
  total_links = len(sublinks)
554
-
555
  progress_text.text(f"Found {total_links} sublinks to process")
556
-
557
  if not sublinks:
558
  progress_bar.progress(1.0)
559
  return main_files
560
-
561
- # Process sublinks
562
  all_files = main_files
563
-
564
  for i, sublink in enumerate(sublinks, 1):
565
- progress = i/total_links
566
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
567
  progress_bar.progress(progress)
568
-
569
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
570
  all_files.extend(sub_files)
571
-
572
- # Update count in real-time
573
  file_count_text.text(f"Found {len(all_files)} total files")
574
-
575
- # Make results unique
576
  seen_urls = set()
577
  unique_files = []
578
-
579
  for f in all_files:
580
  if f['url'] not in seen_urls:
581
  seen_urls.add(f['url'])
582
  unique_files.append(f)
583
-
584
  final_count = len(unique_files)
585
  progress_text.text(f"Deep search complete!")
586
  file_count_text.text(f"Found {final_count} unique files")
587
  progress_bar.progress(1.0)
588
-
589
  return unique_files
590
-
591
  except Exception as e:
592
  logger.error(f"Deep search error: {e}")
593
  progress_text.text(f"Error during deep search: {str(e)}")
594
  return []
595
  finally:
596
- # Clean up progress indicators after a delay
597
  await asyncio.sleep(2)
598
  if not st.session_state.get('keep_progress', False):
599
  progress_text.empty()
@@ -604,10 +465,8 @@ class DownloadManager:
604
  await self.page.goto(url, timeout=30000)
605
  content = await self.page.content()
606
  soup = BeautifulSoup(content, 'html.parser')
607
-
608
  parsed_base = urlparse(url)
609
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
610
-
611
  links = set()
612
  for a in soup.find_all('a', href=True):
613
  href = a['href'].strip()
@@ -615,56 +474,48 @@ class DownloadManager:
615
  links.add(href)
616
  elif href.startswith('/'):
617
  links.add(f"{base_url}{href}")
618
-
619
  return list(links)[:limit]
620
-
621
  except Exception as e:
622
  logger.error(f"Error getting sublinks: {e}")
623
  return []
624
 
625
- def safe_rerun():
626
- """Safely rerun the app if experimental_rerun is available."""
627
- if hasattr(st, "experimental_rerun"):
628
- st.experimental_rerun()
629

630
 
631
  def main():
632
- # Initialize session state on first run
633
  if 'initialized' not in st.session_state:
634
  st.session_state.initialized = True
635
  st.session_state.discovered_files = []
636
  st.session_state.current_url = None
637
  st.session_state.google_creds = None
638
  st.session_state.selected_files = []
639
- st.session_state.do_deep_search = False # Add this
640
- st.session_state.deep_search_url = None # Add this
641
- st.session_state.search_results = [] # Add this
642
 
643
  st.title("Advanced File Downloader")
644
-
645
- # Sidebar configuration
646
  with st.sidebar:
647
  mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
648
  with st.expander("Advanced Options", expanded=True):
649
- custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input")
650
- max_sublinks = st.number_input(
651
- "Maximum Sublinks to Process",
652
- min_value=1,
653
- max_value=100000,
654
- value=10000,
655
- step=50,
656
- help="Maximum number of sublinks to process from the main page",
657
- key="max_sublinks_input"
658
- )
659
- sublink_timeout = st.number_input(
660
- "Search Timeout (seconds per sublink)",
661
- min_value=1,
662
- max_value=3000,
663
- value=30,
664
- step=5,
665
- help="Maximum time to spend searching each sublink",
666
- key="timeout_input"
667
- )
668
  use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
669
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
670
  with st.expander("Google Drive Integration", expanded=False):
@@ -676,158 +527,54 @@ def main():
676
  creds, msg = exchange_code_for_credentials(auth_code)
677
  st.session_state.google_creds = creds
678
  st.write(msg)
679
-
680
- # Manual URL mode
681
  if mode == "Manual URL":
682
  st.header("Manual URL Mode")
683
  url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
684
-
685
  col1, col2 = st.columns([3, 1])
686
  with col1:
687
  if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
688
  if url:
 
689
  async def run_deep_search():
690
- try:
691
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
692
- files = await dm.deep_search(
693
- url=url,
694
- custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
695
- sublink_limit=int(max_sublinks),
696
- timeout=int(sublink_timeout)
697
- )
698
- return files
699
- except Exception as e:
700
- st.error(f"Error during deep search: {str(e)}")
701
- return None
702
-
703
  files = asyncio.run(run_deep_search())
704
  if files:
705
- # Save all discovered files—even duplicates
706
  st.session_state.discovered_files = files
707
  st.session_state.current_url = url
708
  st.success(f"Found {len(files)} files!")
709
-
710
- # File selection block (Select/Clear)
711
- col1, col2 = st.columns([1, 4])
712
- with col1:
713
- if st.button("Select All", key="select_all_btn"):
714
- st.session_state.selected_files = list(range(len(files)))
715
- safe_rerun()
716
- if st.button("Clear Selection", key="clear_selection_btn"):
717
- st.session_state.selected_files = []
718
- safe_rerun()
719
-
720
- selected_files = st.multiselect(
721
- "Select files to download",
722
- options=list(range(len(files))),
723
- default=st.session_state.selected_files,
724
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
725
- key="file_multiselect"
726
- )
727
- st.session_state.selected_files = selected_files
728
-
729
- if selected_files:
730
- col1, col2, col3, col4 = st.columns(4)
731
- with col1:
732
- download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
733
- with col2:
734
- create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
735
- with col3:
736
- delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
737
- with col4:
738
- upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
739
-
740
- if st.button("Download Selected", key="download_btn"):
741
- if not os.path.exists(download_dir):
742
- os.makedirs(download_dir)
743
-
744
- async def download_files():
745
- downloaded_paths = []
746
- progress_bar = st.progress(0)
747
- status_text = st.empty()
748
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
749
- for i, idx in enumerate(selected_files):
750
- progress = (i + 1) / len(selected_files)
751
- file_info = files[idx]
752
- status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
753
- progress_bar.progress(progress)
754
- # Download the file (ensure DownloadManager.download_file downloads duplicates)
755
- path = await dm.download_file(file_info, download_dir, url)
756
- if path:
757
- downloaded_paths.append(path)
758
- status_text.empty()
759
- progress_bar.empty()
760
- return downloaded_paths
761
-
762
- downloaded = asyncio.run(download_files())
763
-
764
- if downloaded:
765
- st.success(f"Successfully downloaded {len(downloaded)} files")
766
- # If the user chose to create a ZIP, generate it and offer a download button
767
- if create_zip:
768
- zip_path = create_zip_file(downloaded, download_dir)
769
- st.success(f"Created ZIP file: {zip_path}")
770
- with open(zip_path, "rb") as f:
771
- zip_data = f.read()
772
- st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
773
-
774
- if upload_to_drive and st.session_state.get('google_creds'):
775
- with st.spinner("Uploading to Google Drive..."):
776
- drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
777
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
778
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
779
- else:
780
- st.error(drive_id)
781
- if delete_after:
782
- for path in downloaded:
783
- try:
784
- os.remove(path)
785
- except Exception as e:
786
- st.warning(f"Could not delete {path}: {e}")
787
- st.info("Deleted original files after ZIP creation")
788
- else:
789
- # Otherwise, generate an individual download button for each file
790
- for path in downloaded:
791
- with open(path, "rb") as f:
792
- file_data = f.read()
793
- st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
794
  else:
795
  st.warning("No files found.")
796
-
797
- # If files were discovered in a previous search, show them here as well.
798
  if st.session_state.discovered_files:
799
  files = st.session_state.discovered_files
800
  st.success(f"Found {len(files)} files!")
801
  col1, col2 = st.columns([1, 4])
802
  with col1:
803
- if st.button("Select All", key="select_all_btn2"):
804
  st.session_state.selected_files = list(range(len(files)))
805
- safe_rerun()
806
- if st.button("Clear Selection", key="clear_selection_btn2"):
807
  st.session_state.selected_files = []
808
- safe_rerun()
809
- selected_files = st.multiselect(
810
- "Select files to download",
811
- options=list(range(len(files))),
812
- default=st.session_state.selected_files,
813
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
814
- key="file_multiselect2"
815
- )
816
  st.session_state.selected_files = selected_files
817
  if selected_files:
818
  col1, col2, col3, col4 = st.columns(4)
819
  with col1:
820
- download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input2")
821
  with col2:
822
- create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox2")
823
  with col3:
824
- delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox2")
825
  with col4:
826
- upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox2")
827
- if st.button("Download Selected", key="download_btn2"):
828
  if not os.path.exists(download_dir):
829
  os.makedirs(download_dir)
830
-
831
  async def download_files():
832
  downloaded_paths = []
833
  progress_bar = st.progress(0)
@@ -838,7 +585,7 @@ def main():
838
  file_info = files[idx]
839
  status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
840
  progress_bar.progress(progress)
841
- path = await dm.download_file(file_info, download_dir, st.session_state.current_url)
842
  if path:
843
  downloaded_paths.append(path)
844
  status_text.empty()
@@ -853,13 +600,14 @@ def main():
853
  with open(zip_path, "rb") as f:
854
  zip_data = f.read()
855
  st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
856
- if upload_to_drive and st.session_state.get('google_creds'):
857
- with st.spinner("Uploading to Google Drive..."):
858
- drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
859
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
860
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
861
- else:
862
- st.error(drive_id)
 
863
  if delete_after:
864
  for path in downloaded:
865
  try:
@@ -872,163 +620,29 @@ def main():
872
  with open(path, "rb") as f:
873
  file_data = f.read()
874
  st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
875
-
876
  elif mode == "Bing Search":
877
  st.header("Bing Search Mode")
878
  query = st.text_input("Enter search query", key="search_query_input")
879
  num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
880
-
881
- # Check if deep search was requested
882
- if st.session_state.get('do_deep_search', False):
883
- url_to_search = st.session_state.get('deep_search_url')
884
- st.write(f"Running deep search on: {url_to_search}")
885
-
886
- async def perform_deep_search():
887
- async with DownloadManager(
888
- use_proxy=use_proxy,
889
- proxy=proxy
890
- ) as dm:
891
- files = await dm.deep_search(
892
- url=url_to_search,
893
- custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
894
- sublink_limit=max_sublinks,
895
- timeout=sublink_timeout
896
- )
897
- if files:
898
- st.session_state.discovered_files = files
899
- st.session_state.current_url = url_to_search
900
- st.session_state.selected_files = []
901
- else:
902
- st.warning("No files found on this page.")
903
-
904
- # Clear the deep search flag after execution
905
- st.session_state.do_deep_search = False
906
-
907
- asyncio.run(perform_deep_search())
908
-
909
  if st.button("Search", key="search_btn"):
910
  if query:
911
  async def run_search():
912
- async with DownloadManager(
913
- use_proxy=use_proxy,
914
- proxy=proxy,
915
- query=query,
916
- num_results=num_results
917
- ) as dm:
918
  with st.spinner("Searching..."):
919
  urls = await dm.search_bing()
920
  if urls:
921
- st.session_state.search_results = urls # Store URLs in session state
922
  st.success(f"Found {len(urls)} results!")
923
  for i, url in enumerate(urls, 1):
924
  with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
925
  if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
926
- st.session_state.deep_search_url = url # Store the URL to search
927
- st.session_state.do_deep_search = True # Flag to perform deep search
928
- safe_rerun() # Rerun to apply state change
929
  else:
930
  st.warning("No search results found.")
931
  asyncio.run(run_search())
932
-
933
- # Display search results if they exist
934
- if hasattr(st.session_state, 'search_results') and st.session_state.search_results:
935
- urls = st.session_state.search_results
936
- st.success(f"Found {len(urls)} results!")
937
- for i, url in enumerate(urls, 1):
938
- with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
939
- if st.button(f"Deep Search Result {i}", key=f"deep_search_result_saved_{i}"):
940
- st.session_state.deep_search_url = url
941
- st.session_state.do_deep_search = True
942
- safe_rerun()
943
-
944
- # If files were discovered in a previous search, show them
945
- if st.session_state.discovered_files:
946
- files = st.session_state.discovered_files
947
- st.success(f"Found {len(files)} files on {st.session_state.current_url}!")
948
-
949
- # File selection and download UI
950
- col1, col2 = st.columns([1, 4])
951
- with col1:
952
- if st.button("Select All", key="bing_select_all_btn"):
953
- st.session_state.selected_files = list(range(len(files)))
954
- safe_rerun()
955
- if st.button("Clear Selection", key="bing_clear_selection_btn"):
956
- st.session_state.selected_files = []
957
- safe_rerun()
958
-
959
- selected_files = st.multiselect(
960
- "Select files to download",
961
- options=list(range(len(files))),
962
- default=st.session_state.selected_files,
963
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
964
- key="bing_file_multiselect"
965
- )
966
- st.session_state.selected_files = selected_files
967
-
968
- if selected_files:
969
- col1, col2, col3, col4 = st.columns(4)
970
- with col1:
971
- download_dir = st.text_input("Download Directory", value="./downloads", key="bing_download_dir_input")
972
- with col2:
973
- create_zip = st.checkbox("Create ZIP file", value=True, key="bing_create_zip_checkbox")
974
- with col3:
975
- delete_after = st.checkbox("Delete after creating ZIP", key="bing_delete_after_checkbox")
976
- with col4:
977
- upload_to_drive = st.checkbox("Upload to Google Drive", key="bing_upload_drive_checkbox")
978
-
979
- if st.button("Download Selected", key="bing_download_btn"):
980
- if not os.path.exists(download_dir):
981
- os.makedirs(download_dir)
982
-
983
- async def download_files():
984
- downloaded_paths = []
985
- progress_bar = st.progress(0)
986
- status_text = st.empty()
987
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
988
- for i, idx in enumerate(selected_files):
989
- progress = (i + 1) / len(selected_files)
990
- file_info = files[idx]
991
- status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
992
- progress_bar.progress(progress)
993
- path = await dm.download_file(file_info, download_dir, st.session_state.current_url)
994
- if path:
995
- downloaded_paths.append(path)
996
- status_text.empty()
997
- progress_bar.empty()
998
- return downloaded_paths
999
-
1000
- downloaded = asyncio.run(download_files())
1001
-
1002
- if downloaded:
1003
- st.success(f"Successfully downloaded {len(downloaded)} files")
1004
- if create_zip:
1005
- zip_path = create_zip_file(downloaded, download_dir)
1006
- st.success(f"Created ZIP file: {zip_path}")
1007
- with open(zip_path, "rb") as f:
1008
- zip_data = f.read()
1009
- st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
1010
-
1011
- if upload_to_drive and st.session_state.get('google_creds'):
1012
- with st.spinner("Uploading to Google Drive..."):
1013
- drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
1014
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
1015
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
1016
- else:
1017
- st.error(drive_id)
1018
-
1019
- if delete_after:
1020
- for path in downloaded:
1021
- try:
1022
- os.remove(path)
1023
- except Exception as e:
1024
- st.warning(f"Could not delete {path}: {e}")
1025
- st.info("Deleted original files after ZIP creation")
1026
- else:
1027
- for path in downloaded:
1028
- with open(path, "rb") as f:
1029
- file_data = f.read()
1030
- st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
1031
-
1032
  else: # PDF Summarizer mode
1033
  if summarizer is None:
1034
  st.error("PDF summarization is not available due to model loading errors.")
@@ -1046,15 +660,10 @@ def main():
1046
  reader = PdfReader(temp_pdf.name)
1047
  text = " ".join([page.extract_text() or "" for page in reader.pages])
1048
  os.remove(temp_pdf.name)
1049
- limited_text = text[:3000]
1050
- summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
1051
- st.write("Summary:")
1052
- st.write(summary[0]['summary_text'])
1053
  except Exception as e:
1054
  st.error(f"Error summarizing PDF: {e}")
1055
 
1056
  if __name__ == "__main__":
1057
- try:
1058
- main()
1059
- except Exception as e:
1060
- st.error(f"An error occurred: {str(e)}")
 
26
  import googleapiclient.discovery
27
  import google.auth.transport.requests
28
  from async_timeout import timeout as async_timeout
29
+ import pandas as pd
30
+ from sentence_transformers import SentenceTransformer
31
+ from transformers import pipeline
32
+ import schedule
33
+ import threading
34
+ import time
35
+ import hashlib
36
+ from reportlab.lib.pagesizes import letter
37
+ from reportlab.pdfgen import canvas
38
+ from sklearn.cluster import KMeans
39
+ import numpy as np
40
+
41
  # -------------------- Logging Setup --------------------
42
  logging.basicConfig(
43
  filename='advanced_download_log.txt',
 
45
  format='%(asctime)s - %(levelname)s - %(message)s'
46
  )
47
  logger = logging.getLogger(__name__)
48
+
49
  GOOGLE_OAUTH_CONFIG = {
50
  "web": {
51
  "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
 
61
  # Playwright Setup
62
  def install_playwright_dependencies():
63
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
64
+ subprocess.run(['apt-get', 'update', '-y'], check=True)
65
+ packages = [
66
+ 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
67
+ 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
68
+ 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
69
+ ]
70
+ subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
71
+ subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
 
 
72
 
 
73
  install_playwright_dependencies()
74
 
75
  # Model Loading
76
  @st.cache_resource
77
  def load_models():
78
  try:
79
+ # Load spaCy model
80
  try:
81
  nlp = spacy.load("en_core_web_sm")
82
  except OSError:
 
84
  spacy.cli.download("en_core_web_sm")
85
  nlp = spacy.load("en_core_web_sm")
86
 
87
+ # Load SentenceTransformer
88
  try:
89
+ semantic_model = SentenceTransformer('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')
 
90
  except Exception as e:
91
  st.error(f"Error loading SentenceTransformer: {e}")
92
  semantic_model = None
93
 
94
+ # Load Transformers pipeline
95
  try:
96
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
97
  except Exception as e:
98
  st.error(f"Error loading Transformers: {e}")
99
  summarizer = None
100
 
101
  return nlp, semantic_model, summarizer
 
102
  except Exception as e:
103
  st.error(f"Error loading models: {e}")
104
  return None, None, None
105
 
106
+ nlp_model, semantic_model, summarizer = load_models()
 
 
107
 
108
  # Utility Functions
109
  def get_random_user_agent():
 
125
  def create_zip_file(file_paths, output_dir):
126
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
127
  zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
 
128
  with zipfile.ZipFile(zip_path, 'w') as zipf:
129
  for file_path in file_paths:
130
  zipf.write(file_path, os.path.basename(file_path))
 
131
  return zip_path
132
 
133
  # Google Drive Functions
 
163
  except Exception as e:
164
  return None, f"Error during token exchange: {e}"
165
 
166
+ def google_drive_upload(file_path, credentials, folder_id=None):
167
  try:
168
  drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
169
+ file_metadata = {'name': os.path.basename(file_path)}
170
+ if folder_id:
171
+ file_metadata['parents'] = [folder_id]
172
+ media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
173
  created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
174
  return created.get("id", "")
175
  except Exception as e:
176
  return f"Error uploading to Drive: {str(e)}"
177
+
178
+ def create_drive_folder(drive_service, name):
179
+ folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
180
+ folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
181
+ return folder.get('id')
182
+
183
  # DownloadManager Class
184
  class DownloadManager:
185
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
 
207
  }
208
  if self.use_proxy and self.proxy:
209
  opts["proxy"] = {"server": self.proxy}
 
210
  self.browser = await self.playwright.chromium.launch(**opts)
211
  self.context = await self.browser.new_context(user_agent=get_random_user_agent())
212
  self.page = await self.context.new_page()
 
229
  search_url = f"https://www.bing.com/search?q={self.query}"
230
  await self.page.goto(search_url, timeout=30000)
231
  await self.page.wait_for_load_state('networkidle')
 
 
232
  links = await self.page.query_selector_all("li.b_algo h2 a")
233
  for link in links[:self.num_results]:
234
  href = await link.get_attribute('href')
235
  if href:
236
  urls.append(href)
 
237
  return urls
238
  except Exception as e:
239
  logger.error(f"Error searching Bing: {e}")
 
304
  soup = BeautifulSoup(content, 'html.parser')
305
 
306
  default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
307
+ '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
308
+ '.pptx', '.odt', '.txt']
309
  all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
310
 
311
  parsed_base = urlparse(final_url)
 
314
  for a in soup.find_all('a', href=True):
315
  href = a['href'].strip()
316
 
 
317
  if '.php' in href.lower() or 'download' in href.lower():
318
+ full_url = href if href.startswith('http') else f"{base_url}{href}"
 
 
319
  real_url = await self.extract_real_download_url(full_url)
320
  if real_url and real_url != full_url:
321
  found_files.append({
 
326
  })
327
  continue
328
 
 
329
  if any(href.lower().endswith(ext) for ext in all_exts):
330
+ file_url = href if href.startswith('http') else f"{base_url}{href}"
 
 
 
331
  size_str = await self.get_file_size(file_url)
332
  meta = {}
333
  if file_url.lower().endswith('.pdf'):
334
  meta = await self.get_pdf_metadata(file_url)
 
335
  found_files.append({
336
  'url': file_url,
337
  'filename': os.path.basename(file_url.split('?')[0]),
 
347
  if match:
348
  file_id = match.group(1)
349
  break
 
350
  if file_id:
351
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
352
  filename = file_id
 
357
  mt = re.search(r'filename\*?="?([^";]+)', cd)
358
  if mt:
359
  filename = mt.group(1).strip('"').strip()
 
360
  found_files.append({
361
  'url': direct_url,
362
  'filename': filename,
 
366
  except Exception as e:
367
  logger.error(f"Error processing Google Drive link: {e}")
368
 
 
369
  seen_urls = set()
370
  unique_files = []
371
  for f in found_files:
372
  if f['url'] not in seen_urls:
373
  seen_urls.add(f['url'])
374
  unique_files.append(f)
 
375
  return unique_files
376
  except Exception as e:
377
  logger.error(f"Error extracting files from {url}: {e}")
 
386
  while os.path.exists(path):
387
  path = os.path.join(save_dir, f"{base}_{counter}{ext}")
388
  counter += 1
 
389
  os.makedirs(save_dir, exist_ok=True)
 
390
  try:
391
  if "drive.google.com" in file_url:
392
  import gdown
393
+ output = gdown.download(file_url, path, quiet=False)
394
+ if output:
395
+ return path
396
+ return None
 
 
 
397
  async with self.context.new_page() as page:
 
 
398
  headers = {
399
  'Accept': '*/*',
400
  'Accept-Encoding': 'gzip, deflate, br',
401
  'Referer': referer
402
  }
 
403
  response = await page.request.get(file_url, headers=headers, timeout=30000)
 
404
  if response.status == 200:
405
  content = await response.body()
 
 
406
  with open(path, 'wb') as f:
407
  f.write(content)
408
  return path
409
  else:
410
  logger.error(f"Download failed with status {response.status}: {file_url}")
411
  return None
 
412
  except Exception as e:
413
  logger.error(f"Error downloading {file_url}: {e}")
414
  return None
 
416
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
417
  if not custom_ext_list:
418
  custom_ext_list = []
 
419
  progress_text = st.empty()
420
  progress_bar = st.progress(0)
421
  file_count_text = st.empty()
 
422
  try:
 
423
  progress_text.text("Analyzing main page...")
424
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
425
  initial_count = len(main_files)
426
  file_count_text.text(f"Found {initial_count} files on main page")
 
 
427
  progress_text.text("Getting sublinks...")
428
  sublinks = await self.get_sublinks(url, sublink_limit)
429
  total_links = len(sublinks)
 
430
  progress_text.text(f"Found {total_links} sublinks to process")
 
431
  if not sublinks:
432
  progress_bar.progress(1.0)
433
  return main_files
 
 
434
  all_files = main_files
 
435
  for i, sublink in enumerate(sublinks, 1):
436
+ progress = i / total_links
437
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
438
  progress_bar.progress(progress)
 
439
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
440
  all_files.extend(sub_files)
 
 
441
  file_count_text.text(f"Found {len(all_files)} total files")
 
 
442
  seen_urls = set()
443
  unique_files = []
 
444
  for f in all_files:
445
  if f['url'] not in seen_urls:
446
  seen_urls.add(f['url'])
447
  unique_files.append(f)
 
448
  final_count = len(unique_files)
449
  progress_text.text(f"Deep search complete!")
450
  file_count_text.text(f"Found {final_count} unique files")
451
  progress_bar.progress(1.0)
 
452
  return unique_files
 
453
  except Exception as e:
454
  logger.error(f"Deep search error: {e}")
455
  progress_text.text(f"Error during deep search: {str(e)}")
456
  return []
457
  finally:
 
458
  await asyncio.sleep(2)
459
  if not st.session_state.get('keep_progress', False):
460
  progress_text.empty()
 
465
  await self.page.goto(url, timeout=30000)
466
  content = await self.page.content()
467
  soup = BeautifulSoup(content, 'html.parser')
 
468
  parsed_base = urlparse(url)
469
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
470
  links = set()
471
  for a in soup.find_all('a', href=True):
472
  href = a['href'].strip()
 
474
  links.add(href)
475
  elif href.startswith('/'):
476
  links.add(f"{base_url}{href}")
 
477
  return list(links)[:limit]
 
478
  except Exception as e:
479
  logger.error(f"Error getting sublinks: {e}")
480
  return []
481
 
482
+ # Utility Functions for New Features
483
+ def extract_keywords(text, n=5):
484
+ doc = nlp_model(text)
485
+ keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
486
+ return keywords
487
 
488
+ def analyze_sentiment(text):
489
+ sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
490
+ result = sentiment_analyzer(text[:512])[0]
491
+ return result['label'], result['score']
492
 
493
+ def get_file_hash(file_path):
494
+ hasher = hashlib.md5()
495
+ with open(file_path, 'rb') as f:
496
+ hasher.update(f.read())
497
+ return hasher.hexdigest()
498
+
499
+ # Main Function
500
  def main():
 
501
  if 'initialized' not in st.session_state:
502
  st.session_state.initialized = True
503
  st.session_state.discovered_files = []
504
  st.session_state.current_url = None
505
  st.session_state.google_creds = None
506
  st.session_state.selected_files = []
507
+ st.session_state.do_deep_search = False
508
+ st.session_state.deep_search_url = None
509
+ st.session_state.search_results = []
510
 
511
  st.title("Advanced File Downloader")
512
+
 
513
  with st.sidebar:
514
  mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
515
  with st.expander("Advanced Options", expanded=True):
516
+ custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
517
+ max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
518
+ sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
 
 
 
 
 
519
  use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
520
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
521
  with st.expander("Google Drive Integration", expanded=False):
 
527
  creds, msg = exchange_code_for_credentials(auth_code)
528
  st.session_state.google_creds = creds
529
  st.write(msg)
530
+
 
531
  if mode == "Manual URL":
532
  st.header("Manual URL Mode")
533
  url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
 
534
  col1, col2 = st.columns([3, 1])
535
  with col1:
536
  if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
537
  if url:
538
+ custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
539
+ valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
540
+ if custom_ext_list != valid_ext_list:
541
+ st.warning("Invalid extensions ignored. Use format like '.csv'.")
542
  async def run_deep_search():
543
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
544
+ files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
545
+ return files
 
 
546
  files = asyncio.run(run_deep_search())
547
  if files:
 
548
  st.session_state.discovered_files = files
549
  st.session_state.current_url = url
550
  st.success(f"Found {len(files)} files!")
 
 
 
 
 
 
 
 
 
 
551
  else:
552
  st.warning("No files found.")
553
+
 
554
  if st.session_state.discovered_files:
555
  files = st.session_state.discovered_files
556
  st.success(f"Found {len(files)} files!")
557
  col1, col2 = st.columns([1, 4])
558
  with col1:
559
+ if st.button("Select All", key="select_all_btn"):
560
  st.session_state.selected_files = list(range(len(files)))
561
+ if st.button("Clear Selection", key="clear_selection_btn"):
 
562
  st.session_state.selected_files = []
563
+ selected_files = st.multiselect("Select files to download", options=list(range(len(files))), default=st.session_state.selected_files, format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})", key="file_multiselect")
 
 
564
  st.session_state.selected_files = selected_files
565
  if selected_files:
566
  col1, col2, col3, col4 = st.columns(4)
567
  with col1:
568
+ download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
569
  with col2:
570
+ create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
571
  with col3:
572
+ delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
573
  with col4:
574
+ upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
575
+ if st.button("Download Selected", key="download_btn"):
576
  if not os.path.exists(download_dir):
577
  os.makedirs(download_dir)
 
578
  async def download_files():
579
  downloaded_paths = []
580
  progress_bar = st.progress(0)
 
585
  file_info = files[idx]
586
  status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
587
  progress_bar.progress(progress)
588
+ path = await dm.download_file(file_info, download_dir, url)
589
  if path:
590
  downloaded_paths.append(path)
591
  status_text.empty()
 
600
  with open(zip_path, "rb") as f:
601
  zip_data = f.read()
602
  st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
603
+ if upload_to_drive and st.session_state.google_creds:
604
+ drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
605
+ folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
606
+ drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
607
+ if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
608
+ st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
609
+ else:
610
+ st.error(drive_id)
611
  if delete_after:
612
  for path in downloaded:
613
  try:
 
620
  with open(path, "rb") as f:
621
  file_data = f.read()
622
  st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
623
+
624
  elif mode == "Bing Search":
625
  st.header("Bing Search Mode")
626
  query = st.text_input("Enter search query", key="search_query_input")
627
  num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
 
 
 
 
 
 
628
  if st.button("Search", key="search_btn"):
629
  if query:
630
  async def run_search():
631
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy, query=query, num_results=num_results) as dm:
 
 
 
 
 
632
  with st.spinner("Searching..."):
633
  urls = await dm.search_bing()
634
  if urls:
635
+ st.session_state.search_results = urls
636
  st.success(f"Found {len(urls)} results!")
637
  for i, url in enumerate(urls, 1):
638
  with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
639
  if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
640
+ st.session_state.deep_search_url = url
641
+ st.session_state.do_deep_search = True
 
642
  else:
643
  st.warning("No search results found.")
644
  asyncio.run(run_search())
645
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
  else: # PDF Summarizer mode
647
  if summarizer is None:
648
  st.error("PDF summarization is not available due to model loading errors.")
 
660
  reader = PdfReader(temp_pdf.name)
661
  text = " ".join([page.extract_text() or "" for page in reader.pages])
662
  os.remove(temp_pdf.name)
663
+ summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
664
+ st.write("Summary:", summary[0]['summary_text'])
 
 
665
  except Exception as e:
666
  st.error(f"Error summarizing PDF: {e}")
667
 
668
  if __name__ == "__main__":
669
+ main()