euler314 committed (verified)
Commit 4ce7f57 · Parent(s): 39a7a36

Update app.py

Files changed (1):
  1. app.py (+376, -273)
app.py CHANGED
@@ -217,6 +217,7 @@ class DownloadManager:
217
  self.browser = None
218
  self.context = None
219
  self.page = None
 
220
 
221
  async def __aenter__(self):
222
  self.playwright = await async_playwright().start()
@@ -250,6 +251,63 @@ class DownloadManager:
250
  if self.playwright:
251
  await self.playwright.stop()
252
 
253
  async def get_file_size(self, url):
254
  try:
255
  async with self.context.new_page() as page:
@@ -280,40 +338,32 @@ class DownloadManager:
280
  except Exception:
281
  return {}
282
 
283
- async def extract_real_download_url(self, url):
284
- try:
285
- async with self.context.new_page() as page:
286
- response = await page.goto(url, wait_until='networkidle', timeout=30000)
287
- if response and response.headers.get('location'):
288
- return response.headers['location']
289
- return page.url
290
- except Exception as e:
291
- logger.error(f"Error extracting real download URL: {e}")
292
- return url
293
-
294
  async def extract_downloadable_files(self, url, custom_ext_list):
295
  found_files = []
296
  try:
297
- response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
298
- if not response:
299
- return []
300
-
301
- final_url = self.page.url
302
- if '.php' in final_url or 'download' in final_url:
303
- real_url = await self.extract_real_download_url(final_url)
304
- if real_url != final_url:
305
- found_files.append({
306
- 'url': real_url,
307
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
308
- 'size': await self.get_file_size(real_url),
309
- 'metadata': {}
310
- })
311
- return found_files
312
-
313
- await self.page.wait_for_load_state('networkidle', timeout=30000)
314
  content = await self.page.content()
315
  soup = BeautifulSoup(content, 'html.parser')
316
 
 
317
  default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
318
  '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
319
  all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
@@ -321,70 +371,73 @@ class DownloadManager:
321
  parsed_base = urlparse(final_url)
322
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
323
 
324
  for a in soup.find_all('a', href=True):
325
- href = a['href'].strip()
326
 
327
- # Handle PHP scripts and redirects
328
- if '.php' in href.lower() or 'download' in href.lower():
329
- full_url = href if href.startswith('http') else (
330
- f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
331
- )
332
- real_url = await self.extract_real_download_url(full_url)
333
- if real_url and real_url != full_url:
334
- found_files.append({
335
- 'url': real_url,
336
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
337
- 'size': await self.get_file_size(real_url),
338
- 'metadata': {}
339
- })
340
- continue
341
-
342
- # Handle direct file links
343
- if any(href.lower().endswith(ext) for ext in all_exts):
344
- file_url = href if href.startswith('http') else (
345
- f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
346
- )
347
-
348
- size_str = await self.get_file_size(file_url)
349
- meta = {}
350
- if file_url.lower().endswith('.pdf'):
351
- meta = await self.get_pdf_metadata(file_url)
352
-
353
- found_files.append({
354
- 'url': file_url,
355
- 'filename': os.path.basename(file_url.split('?')[0]),
356
- 'size': size_str,
357
- 'metadata': meta
358
- })
359
-
360
- # Handle Google Drive links
361
- elif ("drive.google.com" in href) or ("docs.google.com" in href):
362
- file_id = None
363
- for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
364
- match = re.search(pattern, href)
365
- if match:
366
- file_id = match.group(1)
367
- break
368
-
369
- if file_id:
370
- direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
371
- filename = file_id
372
- try:
373
- response = await self.page.request.head(direct_url, timeout=15000)
374
- cd = response.headers.get("Content-Disposition", "")
375
- if cd:
376
- mt = re.search(r'filename\*?="?([^";]+)', cd)
377
- if mt:
378
- filename = mt.group(1).strip('"').strip()
379
-
380
  found_files.append({
381
- 'url': direct_url,
382
- 'filename': filename,
383
- 'size': await self.get_file_size(direct_url),
384
  'metadata': {}
385
  })
386
- except Exception as e:
387
- logger.error(f"Error processing Google Drive link: {e}")
388
 
389
  # Make results unique based on URLs
390
  seen_urls = set()
@@ -395,11 +448,11 @@ class DownloadManager:
395
  unique_files.append(f)
396
 
397
  return unique_files
 
398
  except Exception as e:
399
  logger.error(f"Error extracting files from {url}: {e}")
400
  return []
401
-
402
- async def download_file(self, file_info, save_dir, referer):
403
  file_url = file_info['url']
404
  fname = file_info['filename']
405
  path = os.path.join(save_dir, fname)
@@ -412,11 +465,14 @@ class DownloadManager:
412
  os.makedirs(save_dir, exist_ok=True)
413
 
414
  try:
415
- if "drive.google.com" in file_url:
416
  import gdown
417
  try:
418
  st.write(f"Downloading from Google Drive: {fname}")
419
- output = gdown.download(file_url, path, quiet=False)
420
  if output:
421
  return path
422
  return None
@@ -433,7 +489,7 @@ class DownloadManager:
433
  'Referer': referer
434
  }
435
 
436
- response = await page.request.get(file_url, headers=headers, timeout=30000)
437
 
438
  if response.status == 200:
439
  content = await response.body()
@@ -441,61 +497,86 @@ class DownloadManager:
441
  f.write(content)
442
  return path
443
  else:
444
- logger.error(f"Download failed with status {response.status}: {file_url}")
445
  return None
446
 
447
  except Exception as e:
448
  logger.error(f"Error downloading {file_url}: {e}")
449
  return None
450
 
451
- async def search_bing(self):
452
- if not self.query:
453
- return [], []
454
-
455
- search_query = self.query
456
- if "filetype:pdf" not in search_query.lower():
457
- search_query += " filetype:pdf"
458
-
459
- search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
460
-
461
- try:
462
- await self.page.goto(search_url, timeout=30000)
463
- await self.page.wait_for_selector('li.b_algo', timeout=30000)
464
-
465
- results = []
466
- elements = await self.page.query_selector_all('li.b_algo')
467
-
468
- for element in elements:
469
- link = await element.query_selector('h2 a')
470
- if link:
471
- url = await link.get_attribute('href')
472
- if url:
473
- results.append(url)
474
-
475
- return results[:self.num_results]
476
-
477
- except Exception as e:
478
- logger.error(f"Bing search error: {e}")
479
- return []
480
-
481
  async def get_sublinks(self, url, limit=100):
482
  try:
483
- await self.page.goto(url, timeout=30000)
484
  content = await self.page.content()
485
  soup = BeautifulSoup(content, 'html.parser')
486
 
487
- parsed_base = urlparse(url)
488
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
489
 
490
  links = set()
491
  for a in soup.find_all('a', href=True):
492
  href = a['href'].strip()
493
- if href.startswith('http'):
494
  links.add(href)
495
- elif href.startswith('/'):
496
- links.add(f"{base_url}{href}")
497
 
498
- return list(links)[:limit]
499
 
500
  except Exception as e:
501
  logger.error(f"Error getting sublinks: {e}")
@@ -510,15 +591,21 @@ class DownloadManager:
510
  file_count_text = st.empty()
511
 
512
  try:
513
  # Search main page
514
  progress_text.text("Analyzing main page...")
515
- main_files = await self.extract_downloadable_files(url, custom_ext_list)
516
  initial_count = len(main_files)
517
  file_count_text.text(f"Found {initial_count} files on main page")
518
 
519
  # Get and search sublinks
520
  progress_text.text("Getting sublinks...")
521
- sublinks = await self.get_sublinks(url, limit=sublink_limit)
522
  total_links = len(sublinks)
523
 
524
  progress_text.text(f"Found {total_links} sublinks to process")
@@ -542,7 +629,14 @@ class DownloadManager:
542
 
543
  # Set timeout for this sublink
544
  async with async_timeout.timeout(timeout):
545
- sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
546
  return sub_files
547
  except asyncio.TimeoutError:
548
  logger.warning(f"Timeout processing sublink: {sublink}")
@@ -603,160 +697,169 @@ def main():
603
 
604
  st.title("Advanced File Downloader")
605
 
606
- # Mode Selection
607
- mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
608
-
609
- # Advanced Options
610
- with st.expander("Advanced Options"):
611
- custom_extensions = st.text_input(
612
- "Custom File Extensions",
613
- placeholder=".csv, .txt, .epub",
614
- key="custom_ext_input"
615
- )
616
- max_sublinks = st.number_input(
617
- "Maximum Sublinks to Process",
618
- min_value=1,
619
- max_value=10000,
620
- value=100,
621
- step=50,
622
- help="Maximum number of sublinks to process from the main page",
623
- key="max_sublinks_input"
624
- )
625
- sublink_timeout = st.number_input(
626
- "Search Timeout (seconds per sublink)",
627
- min_value=1,
628
- max_value=3000,
629
- value=30,
630
- step=5,
631
- help="Maximum time to spend searching each sublink",
632
- key="timeout_input"
633
- )
634
- use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
635
- proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
636
 
637
- # Google Drive Integration
638
- with st.expander("Google Drive Integration"):
639
- if st.button("Start Google Sign-In", key="google_signin_btn"):
640
- auth_url = get_google_auth_url()
641
- st.markdown(f"[Click here to authorize]({auth_url})")
642
-
643
- auth_code = st.text_input("Enter authorization code", key="auth_code_input")
644
- if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
645
- creds, msg = exchange_code_for_credentials(auth_code)
646
- st.session_state.google_creds = creds
647
- st.write(msg)
648
 
 
649
  if mode == "Manual URL":
650
  st.header("Manual URL Mode")
651
  url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
652
 
653
- if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
654
- if url:
655
- async def run_deep_search():
656
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
657
- files = await dm.deep_search(
658
- url=url,
659
- custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
660
- sublink_limit=max_sublinks,
661
- timeout=sublink_timeout
662
- )
663
- if files:
664
- st.session_state.discovered_files = files
665
- st.session_state.current_url = url
666
- return files
667
-
668
- files = asyncio.run(run_deep_search())
669
- if files:
670
- st.success(f"Found {len(files)} files!")
671
-
672
- # Select All/Clear Selection buttons
673
- col1, col2 = st.columns([1, 4])
674
- with col1:
675
- if st.button("Select All", key="select_all_btn"):
676
- st.session_state.selected_files = list(range(len(files)))
677
- st.experimental_rerun()
678
- if st.button("Clear Selection", key="clear_selection_btn"):
679
- st.session_state.selected_files = []
680
- st.experimental_rerun()
681
-
682
- # File selection
683
- selected_files = st.multiselect(
684
- "Select files to download",
685
- options=list(range(len(files))),
686
- default=st.session_state.selected_files,
687
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
688
- key="file_multiselect"
689
- )
690
-
691
- # Update session state
692
- st.session_state.selected_files = selected_files
693
 
694
- if selected_files:
695
- col1, col2, col3, col4 = st.columns(4)
696
  with col1:
697
- download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
698
- with col2:
699
- create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
700
- with col3:
701
- delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
702
- with col4:
703
- upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
704
 
705
- if st.button("Download Selected", key="download_btn"):
706
- if not os.path.exists(download_dir):
707
- os.makedirs(download_dir)
708
 
709
- async def download_files():
710
- downloaded_paths = []
711
- progress_bar = st.progress(0)
712
- status_text = st.empty()
713
 
714
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
715
- for i, idx in enumerate(selected_files):
716
- progress = (i + 1) / len(selected_files)
717
- file_info = files[idx]
718
 
719
- status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
720
- progress_bar.progress(progress)
721
 
722
- path = await dm.download_file(
723
- file_info,
724
- download_dir,
725
- url
726
- )
727
- if path:
728
- downloaded_paths.append(path)
729
-
730
- status_text.empty()
731
- progress_bar.empty()
732
- return downloaded_paths
733
-
734
- downloaded = asyncio.run(download_files())
735
-
736
- if downloaded:
737
- st.success(f"Successfully downloaded {len(downloaded)} files")
738
-
739
- if create_zip or upload_to_drive:
740
- zip_path = create_zip_file(downloaded, download_dir)
741
- st.success(f"Created ZIP file: {zip_path}")
742
-
743
- if upload_to_drive and st.session_state.get('google_creds'):
744
- with st.spinner("Uploading to Google Drive..."):
745
- drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
746
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
747
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
748
- else:
749
- st.error(drive_id)
750
-
751
- if delete_after:
752
- for path in downloaded:
753
- try:
754
- os.remove(path)
755
- except Exception as e:
756
- st.warning(f"Could not delete {path}: {e}")
757
- st.info("Deleted original files after ZIP creation")
758
- else:
759
- st.warning("No files found.")
760
 
761
  # Display current files if they exist in session state
762
  elif st.session_state.discovered_files:
 
217
  self.browser = None
218
  self.context = None
219
  self.page = None
220
+ self.base_domains = set() # Store base domains and their variations
221
 
222
  async def __aenter__(self):
223
  self.playwright = await async_playwright().start()
 
251
  if self.playwright:
252
  await self.playwright.stop()
253
 
254
+ def get_base_domain(self, url):
255
+ """Extract base domain and add variations to self.base_domains"""
256
+ parsed = urlparse(url)
257
+ domain = parsed.netloc.split(':')[0] # Remove port if present
258
+
259
+ # Add the main domain and possible variations
260
+ base_parts = domain.split('.')
261
+ if len(base_parts) > 2:
262
+ main_domain = '.'.join(base_parts[-2:])
263
+ self.base_domains.add(main_domain)
264
+ # Add variations like files.domain.com for domain.com
265
+ self.base_domains.add(domain)
266
+
267
+ # Handle www and non-www versions
268
+ if base_parts[0] == 'www':
269
+ self.base_domains.add('.'.join(base_parts[1:]))
270
+ else:
271
+ self.base_domains.add(f"www.{domain}")
272
+ else:
273
+ self.base_domains.add(domain)
274
+
275
+ return domain
276
+
277
+ def is_related_domain(self, url):
278
+ """Check if URL belongs to any of the known domain variations"""
279
+ parsed = urlparse(url)
280
+ domain = parsed.netloc.split(':')[0]
281
+
282
+ # Check if this domain or any of its parts match our base domains
283
+ parts = domain.split('.')
284
+ for i in range(len(parts) - 1):
285
+ check_domain = '.'.join(parts[i:])
286
+ if check_domain in self.base_domains:
287
+ return True
288
+ return False
289
+
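The two helpers added above track a site's domain variations so files served from sibling subdomains still count as in-scope. A minimal standalone sketch of the same idea, using only the standard library (register_domain and is_related are illustrative names, not functions in app.py):

```python
# Illustrative sketch (not part of the commit): domain-variation matching.
from urllib.parse import urlparse

base_domains = set()

def register_domain(url: str) -> str:
    """Record a URL's host plus common variations (registrable domain, www/non-www)."""
    host = urlparse(url).netloc.split(':')[0]
    parts = host.split('.')
    base_domains.add(host)
    if len(parts) > 2:
        base_domains.add('.'.join(parts[-2:]))  # e.g. files.example.com -> example.com
    base_domains.add(host[4:] if host.startswith('www.') else f"www.{host}")
    return host

def is_related(url: str) -> bool:
    """True if the URL's host, or any parent domain of it, was registered."""
    host = urlparse(url).netloc.split(':')[0]
    parts = host.split('.')
    return any('.'.join(parts[i:]) in base_domains for i in range(len(parts) - 1))

register_domain("https://www.example.com/downloads/")
print(is_related("https://files.example.com/report.pdf"))  # True
print(is_related("https://other-site.org/file.zip"))       # False
```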
290
+ async def get_real_url(self, url):
291
+ """Follow redirects and get the final URL"""
292
+ try:
293
+ async with self.context.new_page() as page:
294
+ response = await page.goto(url, wait_until='networkidle', timeout=30000)
295
+ final_url = page.url
296
+
297
+ # Check for meta refresh redirects
298
+ content = await page.content()
299
+ soup = BeautifulSoup(content, 'html.parser')
300
+ meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
301
+ if meta_refresh:
302
+ content = meta_refresh.get('content', '')
303
+ if 'url=' in content.lower():
304
+ final_url = content.split('url=')[-1].strip("'").strip('"')
305
+
306
+ return final_url, response.headers if response else {}
307
+ except Exception as e:
308
+ logger.error(f"Error getting real URL for {url}: {e}")
309
+ return url, {}
310
+
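get_real_url follows both HTTP redirects (via Playwright) and HTML meta-refresh redirects. The sketch below shows just the meta-refresh parsing on already-fetched HTML; it assumes BeautifulSoup is installed and, unlike the committed split('url=') approach, resolves relative targets with urljoin:

```python
# Illustrative sketch (not part of the commit): extracting a meta-refresh target.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def meta_refresh_target(html: str, page_url: str):
    """Return the absolute meta-refresh target URL, or None if there is none."""
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find('meta', attrs={'http-equiv': lambda v: v and v.lower() == 'refresh'})
    if not tag:
        return None
    content = tag.get('content', '')
    # content typically looks like "5; url=/files/report.pdf"
    for part in content.split(';'):
        part = part.strip()
        if part.lower().startswith('url='):
            return urljoin(page_url, part[4:].strip('\'"'))
    return None

html = '<meta http-equiv="refresh" content="0; url=/files/report.pdf">'
print(meta_refresh_target(html, "https://example.com/download.php"))
# -> https://example.com/files/report.pdf
```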
311
  async def get_file_size(self, url):
312
  try:
313
  async with self.context.new_page() as page:
 
338
  except Exception:
339
  return {}
340
 
341
  async def extract_downloadable_files(self, url, custom_ext_list):
342
  found_files = []
343
  try:
344
+ # Follow redirects and get the final URL
345
+ final_url, headers = await self.get_real_url(url)
346
+
347
+ # Add this domain to our known domains
348
+ self.get_base_domain(final_url)
349
+
350
+ # Check if the URL itself is a file
351
+ content_type = headers.get('content-type', '').lower()
352
+ if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
353
+ found_files.append({
354
+ 'url': final_url,
355
+ 'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
356
+ 'size': await self.get_file_size(final_url),
357
+ 'metadata': {}
358
+ })
359
+ return found_files
360
+
361
+ # Load the page
362
+ await self.page.goto(final_url, timeout=30000, wait_until='networkidle')
363
  content = await self.page.content()
364
  soup = BeautifulSoup(content, 'html.parser')
365
 
366
+ # Define extensions to look for
367
  default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
368
  '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
369
  all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
 
371
  parsed_base = urlparse(final_url)
372
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
373
 
374
+ # Find all links including those in scripts and other elements
375
+ links = set()
376
+ # Regular links
377
  for a in soup.find_all('a', href=True):
378
+ links.add(a['href'])
379
+ # Script-embedded links
380
+ scripts = soup.find_all('script')
381
+ for script in scripts:
382
+ if script.string:
383
+ urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
384
+ links.update(urls)
385
+
386
+ for href in links:
387
+ href = href.strip()
388
 
389
+ # Skip empty or javascript links
390
+ if not href or href.startswith(('javascript:', '#', 'mailto:')):
391
+ continue
392
+
393
+ # Handle both direct file links and PHP/script downloads
394
+ if '.php' in href.lower() or 'download' in href.lower() or 'visit' in href.lower():
395
+ try:
396
+ # Convert to absolute URL if needed
397
+ if not href.startswith(('http://', 'https://')):
398
+ if href.startswith('/'):
399
+ href = base_url + href
400
+ else:
401
+ href = base_url + '/' + href
402
+
403
+ # Follow the link to get the real file
404
+ real_url, real_headers = await self.get_real_url(href)
405
+
406
+ # Check if it leads to a file
407
+ content_type = real_headers.get('content-type', '').lower()
408
+ if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
409
  found_files.append({
410
+ 'url': real_url,
411
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
412
+ 'size': await self.get_file_size(real_url),
413
  'metadata': {}
414
  })
415
+ except Exception as e:
416
+ logger.error(f"Error processing PHP/script link {href}: {e}")
417
+ continue
418
+
419
+ # Handle direct file links
420
+ elif any(href.lower().endswith(ext) for ext in all_exts):
421
+ # Convert to absolute URL if needed
422
+ if not href.startswith(('http://', 'https://')):
423
+ if href.startswith('/'):
424
+ href = base_url + href
425
+ else:
426
+ href = base_url + '/' + href
427
+
428
+ # Verify if it's from a related domain
429
+ if self.is_related_domain(href):
430
+ size_str = await self.get_file_size(href)
431
+ meta = {}
432
+ if href.lower().endswith('.pdf'):
433
+ meta = await self.get_pdf_metadata(href)
434
+
435
+ found_files.append({
436
+ 'url': href,
437
+ 'filename': os.path.basename(href.split('?')[0]),
438
+ 'size': size_str,
439
+ 'metadata': meta
440
+ })
441
 
442
  # Make results unique based on URLs
443
  seen_urls = set()
 
448
  unique_files.append(f)
449
 
450
  return unique_files
451
+
452
  except Exception as e:
453
  logger.error(f"Error extracting files from {url}: {e}")
454
  return []
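The reworked extract_downloadable_files gathers candidate links from both anchor hrefs and inline script text, then decides via content type or file extension whether each one is downloadable. A small self-contained sketch of the link-harvesting step, reusing the same regex on a made-up HTML snippet:

```python
# Illustrative sketch (not part of the commit): harvesting links from <a> tags
# and inline <script> blocks with the regex used above.
import re
from bs4 import BeautifulSoup

HTML = """
<a href="/docs/manual.pdf">Manual</a>
<script>
  var download_url="https://files.example.com/archive.zip";
  window.location.href="/download.php?id=42";
</script>
"""

soup = BeautifulSoup(HTML, 'html.parser')
candidates = {a['href'] for a in soup.find_all('a', href=True)}
for script in soup.find_all('script'):
    if script.string:
        candidates.update(
            re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string))

print(sorted(candidates))
# ['/docs/manual.pdf', '/download.php?id=42', 'https://files.example.com/archive.zip']
```

Note the regex only matches the literal prefixes href=, url=, link=, src= immediately followed by a quote, so assignments written with spaces around the equals sign are not picked up.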
455
+ async def download_file(self, file_info, save_dir, referer):
 
456
  file_url = file_info['url']
457
  fname = file_info['filename']
458
  path = os.path.join(save_dir, fname)
 
465
  os.makedirs(save_dir, exist_ok=True)
466
 
467
  try:
468
+ # Get the real URL first
469
+ real_url, _ = await self.get_real_url(file_url)
470
+
471
+ if "drive.google.com" in real_url:
472
  import gdown
473
  try:
474
  st.write(f"Downloading from Google Drive: {fname}")
475
+ output = gdown.download(real_url, path, quiet=False)
476
  if output:
477
  return path
478
  return None
 
489
  'Referer': referer
490
  }
491
 
492
+ response = await page.request.get(real_url, headers=headers, timeout=30000)
493
 
494
  if response.status == 200:
495
  content = await response.body()
 
497
  f.write(content)
498
  return path
499
  else:
500
+ logger.error(f"Download failed with status {response.status}: {real_url}")
501
  return None
502
 
503
  except Exception as e:
504
  logger.error(f"Error downloading {file_url}: {e}")
505
  return None
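download_file now resolves the real URL first and hands Google Drive links to gdown. A minimal sketch of that branch with a made-up file id (fetch_drive_file is an illustrative name, not a function in app.py):

```python
# Illustrative sketch (not part of the commit): the Google Drive branch via gdown.
import os
import gdown

def fetch_drive_file(file_id: str, save_dir: str, filename: str):
    """Download drive.google.com/uc?export=download&id=<file_id> into save_dir."""
    os.makedirs(save_dir, exist_ok=True)
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    path = os.path.join(save_dir, filename)
    # gdown.download returns the output path on success, None on failure
    return gdown.download(url, path, quiet=False)

# fetch_drive_file("1A2B3C4D5E", "./downloads", "report.pdf")  # made-up id
```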
506
 
507
  async def get_sublinks(self, url, limit=100):
508
  try:
509
+ # Get the real URL first
510
+ real_url, _ = await self.get_real_url(url)
511
+ await self.page.goto(real_url, timeout=30000)
512
+
513
+ # Wait for dynamic content
514
+ await self.page.wait_for_load_state('networkidle')
515
+
516
  content = await self.page.content()
517
  soup = BeautifulSoup(content, 'html.parser')
518
 
519
+ parsed_base = urlparse(real_url)
520
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
521
+ current_path = os.path.dirname(parsed_base.path)
522
 
523
  links = set()
524
+
525
+ # Find links from various sources
526
+ # 1. Regular links
527
  for a in soup.find_all('a', href=True):
528
  href = a['href'].strip()
529
+ if href and not href.startswith(('javascript:', '#', 'mailto:')):
530
  links.add(href)
531
+
532
+ # 2. Script-embedded links
533
+ scripts = soup.find_all('script')
534
+ for script in scripts:
535
+ if script.string:
536
+ urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
537
+ links.update(urls)
538
+
539
+ # 3. Form actions
540
+ forms = soup.find_all('form', action=True)
541
+ for form in forms:
542
+ links.add(form['action'])
543
+
544
+ # Process and clean links
545
+ clean_links = set()
546
+ for href in links:
547
+ try:
548
+ # Skip empty links
549
+ if not href.strip():
550
+ continue
551
+
552
+ # Convert to absolute URL
553
+ if href.startswith('http'):
554
+ full_url = href
555
+ elif href.startswith('//'):
556
+ full_url = parsed_base.scheme + ':' + href
557
+ elif href.startswith('/'):
558
+ full_url = base_url + href
559
+ else:
560
+ # Handle relative paths
561
+ if current_path and current_path != '/':
562
+ full_url = base_url + current_path + '/' + href
563
+ else:
564
+ full_url = base_url + '/' + href
565
+
566
+ # Clean the URL
567
+ full_url = full_url.split('#')[0] # Remove fragments
568
 
569
+ # Only add if it's a related domain
570
+ if self.is_related_domain(full_url):
571
+ clean_links.add(full_url)
572
+
573
+ except Exception as e:
574
+ logger.error(f"Error processing link {href}: {e}")
575
+ continue
576
+
577
+ # Sort links for consistency
578
+ sorted_links = sorted(list(clean_links))
579
+ return sorted_links[:limit]
580
 
581
  except Exception as e:
582
  logger.error(f"Error getting sublinks: {e}")
 
591
  file_count_text = st.empty()
592
 
593
  try:
594
+ # Initialize base domains with the original URL
595
+ self.get_base_domain(url)
596
+
597
+ # Get the real initial URL
598
+ real_url, _ = await self.get_real_url(url)
599
+
600
  # Search main page
601
  progress_text.text("Analyzing main page...")
602
+ main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
603
  initial_count = len(main_files)
604
  file_count_text.text(f"Found {initial_count} files on main page")
605
 
606
  # Get and search sublinks
607
  progress_text.text("Getting sublinks...")
608
+ sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
609
  total_links = len(sublinks)
610
 
611
  progress_text.text(f"Found {total_links} sublinks to process")
 
629
 
630
  # Set timeout for this sublink
631
  async with async_timeout.timeout(timeout):
632
+ # Get real URL before processing
633
+ real_sublink, _ = await self.get_real_url(sublink)
634
+ sub_files = await self.extract_downloadable_files(real_sublink, custom_ext_list)
635
+
636
+ if sub_files:
637
+ logger.info(f"Found {len(sub_files)} files at {real_sublink}")
638
+ st.write(f"Found {len(sub_files)} files at {real_sublink}")
639
+
640
  return sub_files
641
  except asyncio.TimeoutError:
642
  logger.warning(f"Timeout processing sublink: {sublink}")
 
697
 
698
  st.title("Advanced File Downloader")
699
 
700
+ # Sidebar
701
+ with st.sidebar:
702
+ # Mode Selection
703
+ mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
704
+
705
+ # Advanced Options
706
+ with st.expander("Advanced Options", expanded=True):
707
+ custom_extensions = st.text_input(
708
+ "Custom File Extensions",
709
+ placeholder=".csv, .txt, .epub",
710
+ key="custom_ext_input"
711
+ )
712
+ max_sublinks = st.number_input(
713
+ "Maximum Sublinks to Process",
714
+ min_value=1,
715
+ max_value=10000,
716
+ value=100,
717
+ step=50,
718
+ help="Maximum number of sublinks to process from the main page",
719
+ key="max_sublinks_input"
720
+ )
721
+ sublink_timeout = st.number_input(
722
+ "Search Timeout (seconds per sublink)",
723
+ min_value=1,
724
+ max_value=3000,
725
+ value=30,
726
+ step=5,
727
+ help="Maximum time to spend searching each sublink",
728
+ key="timeout_input"
729
+ )
730
+ use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
731
+ proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
732
 
733
+ # Google Drive Integration
734
+ with st.expander("Google Drive Integration", expanded=False):
735
+ if st.button("Start Google Sign-In", key="google_signin_btn"):
736
+ auth_url = get_google_auth_url()
737
+ st.markdown(f"[Click here to authorize]({auth_url})")
738
+
739
+ auth_code = st.text_input("Enter authorization code", key="auth_code_input")
740
+ if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
741
+ creds, msg = exchange_code_for_credentials(auth_code)
742
+ st.session_state.google_creds = creds
743
+ st.write(msg)
744
 
745
+ # Main content area
746
  if mode == "Manual URL":
747
  st.header("Manual URL Mode")
748
  url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
749
 
750
+ col1, col2 = st.columns([3, 1])
751
+ with col1:
752
+ if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
753
+ if url:
754
+ async def run_deep_search():
755
+ try:
756
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
757
+ files = await dm.deep_search(
758
+ url=url,
759
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
760
+ sublink_limit=int(max_sublinks),
761
+ timeout=int(sublink_timeout)
762
+ )
763
+ if files:
764
+ st.session_state.discovered_files = files
765
+ st.session_state.current_url = url
766
+ return files
767
+ except Exception as e:
768
+ st.error(f"Error during deep search: {str(e)}")
769
+ return None
770
 
771
+ files = asyncio.run(run_deep_search())
772
+ if files:
773
+ st.success(f"Found {len(files)} files!")
774
+
775
+ # Select All/Clear Selection buttons
776
+ col1, col2 = st.columns([1, 4])
777
  with col1:
778
+ if st.button("Select All", key="select_all_btn"):
779
+ st.session_state.selected_files = list(range(len(files)))
780
+ st.experimental_rerun()
781
+ if st.button("Clear Selection", key="clear_selection_btn"):
782
+ st.session_state.selected_files = []
783
+ st.experimental_rerun()
784
+
785
+ # File selection
786
+ selected_files = st.multiselect(
787
+ "Select files to download",
788
+ options=list(range(len(files))),
789
+ default=st.session_state.selected_files,
790
+ format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
791
+ key="file_multiselect"
792
+ )
793
+
794
+ # Update session state
795
+ st.session_state.selected_files = selected_files
796
 
797
+ if selected_files:
798
+ col1, col2, col3, col4 = st.columns(4)
799
+ with col1:
800
+ download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
801
+ with col2:
802
+ create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
803
+ with col3:
804
+ delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
805
+ with col4:
806
+ upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
807
+
808
+ if st.button("Download Selected", key="download_btn"):
809
+ if not os.path.exists(download_dir):
810
+ os.makedirs(download_dir)
811
+
812
+ async def download_files():
813
+ downloaded_paths = []
814
+ progress_bar = st.progress(0)
815
+ status_text = st.empty()
816
+
817
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
818
+ for i, idx in enumerate(selected_files):
819
+ progress = (i + 1) / len(selected_files)
820
+ file_info = files[idx]
821
+
822
+ status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
823
+ progress_bar.progress(progress)
824
+
825
+ path = await dm.download_file(
826
+ file_info,
827
+ download_dir,
828
+ url
829
+ )
830
+ if path:
831
+ downloaded_paths.append(path)
832
+
833
+ status_text.empty()
834
+ progress_bar.empty()
835
+ return downloaded_paths
836
 
837
+ downloaded = asyncio.run(download_files())
838
 
839
+ if downloaded:
840
+ st.success(f"Successfully downloaded {len(downloaded)} files")
841
+
842
+ if create_zip or upload_to_drive:
843
+ zip_path = create_zip_file(downloaded, download_dir)
844
+ st.success(f"Created ZIP file: {zip_path}")
845
 
846
+ if upload_to_drive and st.session_state.get('google_creds'):
847
+ with st.spinner("Uploading to Google Drive..."):
848
+ drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
849
+ if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
850
+ st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
851
+ else:
852
+ st.error(drive_id)
853
 
854
+ if delete_after:
855
+ for path in downloaded:
856
+ try:
857
+ os.remove(path)
858
+ except Exception as e:
859
+ st.warning(f"Could not delete {path}: {e}")
860
+ st.info("Deleted original files after ZIP creation")
861
+ else:
862
+ st.warning("No files found.")
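main() drives the async DownloadManager from synchronous Streamlit callbacks by wrapping each action in a coroutine and calling asyncio.run on a button click. A stripped-down sketch of that pattern (DummyManager is a hypothetical stand-in for DownloadManager):

```python
# Illustrative sketch (not part of the commit): asyncio.run inside a Streamlit callback.
import asyncio
import streamlit as st

class DummyManager:
    async def __aenter__(self):
        return self
    async def __aexit__(self, *exc):
        return False
    async def deep_search(self, url):
        await asyncio.sleep(0.1)            # pretend to crawl the site
        return [{"filename": "report.pdf", "size": "1.2 MB"}]

async def run_search(url):
    async with DummyManager() as dm:
        return await dm.deep_search(url)

url = st.text_input("Enter URL", "https://example.com")
if st.button("Deep Search"):
    files = asyncio.run(run_search(url))    # one event loop per click
    st.session_state["discovered_files"] = files
    st.success(f"Found {len(files)} files!")
```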
863
 
864
  # Display current files if they exist in session state
865
  elif st.session_state.discovered_files: