euler314 committed
Commit 3b03ee1 · verified · Parent(s): 942484e

Update app.py

Files changed (1)
  1. app.py +143 -1507
app.py CHANGED
@@ -284,1525 +284,148 @@ class DownloadManager:
284
  logger.error(f"Error extracting real download URL: {e}")
285
  return url
286
 
287
- async def extract_downloadable_files(self, url, custom_ext_list):
288
- found_files = []
289
- try:
290
- response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
291
- if not response:
292
- return []
293
-
294
- final_url = self.page.url
295
- if '.php' in final_url or 'download' in final_url:
296
- real_url = await self.extract_real_download_url(final_url)
297
- if real_url != final_url:
298
- found_files.append({
299
- 'url': real_url,
300
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
301
- 'size': await self.get_file_size(real_url),
302
- 'metadata': {}
303
- })
304
- return found_files
305
-
306
- await self.page.wait_for_load_state('networkidle', timeout=30000)
307
- content = await self.page.content()
308
- soup = BeautifulSoup(content, 'html.parser')
309
-
310
- default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
311
- '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
312
- '.pptx', '.odt', '.txt']
313
- all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
314
-
315
- parsed_base = urlparse(final_url)
316
- base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
317
- path_base = os.path.dirname(parsed_base.path)
318
-
319
- # Process all anchor tags
320
- for a in soup.find_all('a', href=True):
321
- href = a['href'].strip()
322
-
323
- if '.php' in href.lower() or 'download' in href.lower():
324
- full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
325
- real_url = await self.extract_real_download_url(full_url)
326
- if real_url and real_url != full_url:
327
- found_files.append({
328
- 'url': real_url,
329
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
330
- 'size': await self.get_file_size(real_url),
331
- 'metadata': {}
332
- })
333
- continue
334
-
335
- if any(href.lower().endswith(ext) for ext in all_exts):
336
- file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
337
- size_str = await self.get_file_size(file_url)
338
- meta = {}
339
- if file_url.lower().endswith('.pdf'):
340
- meta = await self.get_pdf_metadata(file_url)
341
- found_files.append({
342
- 'url': file_url,
343
- 'filename': os.path.basename(file_url.split('?')[0]),
344
- 'size': size_str,
345
- 'metadata': meta
346
- })
347
-
348
- # Handle Google Drive links
349
- elif ("drive.google.com" in href) or ("docs.google.com" in href):
350
- file_id = None
351
- for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
352
- match = re.search(pattern, href)
353
- if match:
354
- file_id = match.group(1)
355
- break
356
- if file_id:
357
- # Get file info to determine type and view-only status
358
- file_type, is_view_only = await self.get_google_drive_file_info(file_id)
359
-
360
- # Create a more informative filename based on info
361
- filename = f"gdrive_{file_id}"
362
- if file_type:
363
- filename = f"{filename}.{file_type}"
364
-
365
- size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
366
-
367
- found_files.append({
368
- 'url': href, # Use original URL
369
- 'filename': filename,
370
- 'size': size_str,
371
- 'metadata': {
372
- 'view_only': is_view_only,
373
- 'file_type': file_type,
374
- 'file_id': file_id
375
- }
376
- })
377
-
378
- # Also check for files in other elements (iframe, embed, object, etc.)
379
- other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
380
- for elem in other_elements:
381
- src = elem.get('src') or elem.get('data')
382
- if src and any(src.lower().endswith(ext) for ext in all_exts):
383
- file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
384
- size_str = await self.get_file_size(file_url)
385
- meta = {}
386
- if file_url.lower().endswith('.pdf'):
387
- meta = await self.get_pdf_metadata(file_url)
388
- found_files.append({
389
- 'url': file_url,
390
- 'filename': os.path.basename(file_url.split('?')[0]),
391
- 'size': size_str,
392
- 'metadata': meta
393
- })
394
-
395
- # Check for file links in onclick attributes
396
- onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
397
- for elem in onclick_elements:
398
- onclick = await elem.get_attribute('onclick')
399
- urls = re.findall(r'(https?://[^\'"]+)', onclick)
400
- for url_match in urls:
401
- if any(url_match.lower().endswith(ext) for ext in all_exts):
402
- size_str = await self.get_file_size(url_match)
403
- meta = {}
404
- if url_match.lower().endswith('.pdf'):
405
- meta = await self.get_pdf_metadata(url_match)
406
- found_files.append({
407
- 'url': url_match,
408
- 'filename': os.path.basename(url_match.split('?')[0]),
409
- 'size': size_str,
410
- 'metadata': meta
411
- })
412
-
413
- seen_urls = set()
414
- unique_files = []
415
- for f in found_files:
416
- if f['url'] not in seen_urls:
417
- seen_urls.add(f['url'])
418
- unique_files.append(f)
419
- return unique_files
420
- except Exception as e:
421
- logger.error(f"Error extracting files from {url}: {e}")
422
- return []
423
-
424
- async def download_file(self, file_info, save_dir, referer):
425
- file_url = file_info['url']
426
- fname = file_info['filename']
427
- path = os.path.join(save_dir, fname)
428
- base, ext = os.path.splitext(fname)
429
- counter = 1
430
- while os.path.exists(path):
431
- path = os.path.join(save_dir, f"{base}_{counter}{ext}")
432
- counter += 1
433
- os.makedirs(save_dir, exist_ok=True)
434
-
435
- try:
436
- # Special handling for Google Drive files
437
- if "drive.google.com" in file_url or "docs.google.com" in file_url:
438
- # Check if it's marked as view-only in metadata
439
- is_view_only = file_info.get('metadata', {}).get('view_only', False)
440
-
441
- # For view-only files, try our most robust approach first
442
- if is_view_only:
443
- logger.info(f"Attempting to download view-only file: {file_url}")
444
- result_path = await self.force_download_viewonly(file_info, path)
445
- if result_path:
446
- return result_path
447
-
448
- # If that failed, try the regular download approach
449
- logger.info("Primary method failed, trying fallback methods")
450
-
451
- # Try regular download methods
452
- success = await self.download_from_google_drive(file_url, path)
453
- if success:
454
- return path
455
-
456
- # If all methods failed for Google Drive, try one last approach
457
- logger.warning("All standard methods failed, attempting force download")
458
- result_path = await self.force_download_viewonly(file_info, path)
459
- return result_path if result_path else None
460
-
461
- # Original code for non-Google Drive downloads
462
- async with self.context.new_page() as page:
463
- headers = {
464
- 'Accept': '*/*',
465
- 'Accept-Encoding': 'gzip, deflate, br',
466
- 'Referer': referer
467
- }
468
- response = await page.request.get(file_url, headers=headers, timeout=30000)
469
- if response.status == 200:
470
- content = await response.body()
471
- with open(path, 'wb') as f:
472
- f.write(content)
473
- return path
474
- else:
475
- logger.error(f"Download failed with status {response.status}: {file_url}")
476
- return None
477
- except Exception as e:
478
- logger.error(f"Error downloading {file_url}: {e}")
479
- return None
480
-
481
- async def force_download_viewonly(self, file_info, save_path):
482
- """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
483
- try:
484
- # Extract file ID
485
- file_id = file_info.get('metadata', {}).get('file_id')
486
- if not file_id:
487
- url = file_info['url']
488
- for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
489
- match = re.search(pattern, url)
490
- if match:
491
- file_id = match.group(1)
492
- break
493
-
494
- if not file_id:
495
- logger.error("Could not extract file ID")
496
- return None
497
-
498
- file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
499
- base, ext = os.path.splitext(save_path)
500
- if not ext:
501
- save_path = f"{base}.{file_type}"
502
-
503
- logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
504
-
505
- # Create a dedicated browser instance with better resolution
506
- browser = await self.playwright.chromium.launch(
507
- headless=True,
508
- args=[
509
- '--no-sandbox',
510
- '--disable-setuid-sandbox',
511
- '--disable-dev-shm-usage',
512
- '--disable-web-security',
513
- '--disable-features=IsolateOrigins,site-per-process',
514
- '--disable-site-isolation-trials'
515
- ]
516
- )
517
-
518
- # Use higher resolution for better quality
519
- context = await browser.new_context(
520
- viewport={'width': 1600, 'height': 1200},
521
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
522
- device_scale_factor=2.0
523
- )
524
-
525
- page = await context.new_page()
526
-
527
- try:
528
- # Go to the file view page
529
- logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
530
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
531
- await page.wait_for_load_state('networkidle')
532
- await page.wait_for_timeout(5000) # Wait longer for everything to load
533
-
534
- # Create temp directory
535
- temp_dir = tempfile.mkdtemp()
536
-
537
- # Special handling for PDFs
538
- if file_type.lower() == 'pdf':
539
- # Check if there's a pagination control
540
- pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
541
-
542
- # Try multiple methods to extract total pages
543
- total_pages = await page.evaluate("""
544
- () => {
545
- // Method 1: Check page counter text
546
- const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
547
- const text = el.textContent || '';
548
- return /\\d+\\s*\\/\\s*\\d+/.test(text);
549
- });
550
-
551
- if (pageCounters.length > 0) {
552
- const text = pageCounters[0].textContent || '';
553
- const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
554
- if (match && match[2]) return parseInt(match[2]);
555
- }
556
-
557
- // Method 2: Check actual page elements
558
- const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
559
- if (pageElements.length > 0) return pageElements.length;
560
-
561
- // Method 3: Look for page thumbnails
562
- const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
563
- if (thumbnails.length > 0) return thumbnails.length;
564
-
565
- // Fallback: conservative guess based on UI
566
- return 50; // Safe default when we can't determine
567
- }
568
- """)
569
-
570
- logger.info(f"Detected {total_pages} pages in PDF")
571
-
572
- if total_pages <= 1:
573
- # Additional check - sometimes the page count detection fails
574
- # Let's double-check by looking for next/previous buttons
575
- next_button = await page.query_selector('button[aria-label="Next page"]')
576
- if next_button:
577
- disabled = await next_button.get_attribute('disabled')
578
- if not disabled:
579
- logger.info("Found next button that's not disabled, document has multiple pages")
580
- total_pages = 100 # Set a high number, we'll stop when we can't go further
581
-
582
- # If we still think it's a single page, use a more direct approach
583
- if total_pages <= 1:
584
- # Single page approach
585
- logger.info("Using single-page capture approach")
586
-
587
- # Take a screenshot of the current view (should be the full document or first page)
588
- screenshot_path = os.path.join(temp_dir, "page.png")
589
-
590
- # Try to screenshot just the document area if we can find it
591
- document_area = await page.query_selector('.drive-viewer-paginated-page')
592
- if document_area:
593
- await document_area.screenshot(path=screenshot_path)
594
- else:
595
- # Otherwise take a full screenshot
596
- await page.screenshot(path=screenshot_path)
597
-
598
- # Convert to PDF
599
- from PIL import Image
600
- from reportlab.pdfgen import canvas as pdf_canvas
601
-
602
- img = Image.open(screenshot_path)
603
- width, height = img.size
604
- c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
605
- c.drawImage(screenshot_path, 0, 0, width, height)
606
- c.save()
607
-
608
- os.remove(screenshot_path)
609
- os.rmdir(temp_dir)
610
-
611
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
612
- return save_path
613
- return None
614
-
615
- # Multi-page approach
616
- logger.info(f"Using multi-page capture approach for {total_pages} pages")
617
-
618
- # CRITICAL: We need to go to the first page first
619
- # Check if we need to reset to first page
620
- current_page_text = await page.evaluate("""
621
- () => {
622
- const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
623
- const text = el.textContent || '';
624
- return /\\d+\\s*\\/\\s*\\d+/.test(text);
625
- });
626
-
627
- if (pageCounters.length > 0) {
628
- return pageCounters[0].textContent || '';
629
- }
630
- return '';
631
- }
632
- """)
633
-
634
- current_page = 1
635
- if current_page_text:
636
- match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
637
- if match:
638
- current_page = int(match.group(1))
639
-
640
- # If we're not on page 1, go back to first page
641
- if current_page > 1:
642
- logger.info(f"Currently on page {current_page}, navigating back to page 1")
643
-
644
- # Look for an input field where we can directly set the page number
645
- page_input = await page.query_selector('input[aria-label="Page"]')
646
- if page_input:
647
- await page_input.fill("1")
648
- await page_input.press("Enter")
649
- await page.wait_for_timeout(1000)
650
- else:
651
- # Use prev button to go back to first page
652
- prev_button = await page.query_selector('button[aria-label="Previous page"]')
653
- if prev_button:
654
- # Keep clicking until we can't anymore
655
- for _ in range(current_page - 1):
656
- try:
657
- await prev_button.click()
658
- await page.wait_for_timeout(500)
659
- except Exception as e:
660
- logger.warning(f"Error clicking prev button: {e}")
661
- break
662
-
663
- # Capture each page
664
- screenshots = []
665
- page_num = 1
666
- max_tries = min(total_pages + 10, 200) # Set a reasonable limit
667
- next_button = await page.query_selector('button[aria-label="Next page"]')
668
-
669
- # Maximize the PDF view if possible
670
- await page.evaluate("""
671
- () => {
672
- // Try to find and click any "full page" or "maximize" buttons
673
- const fullViewButtons = Array.from(document.querySelectorAll('button'))
674
- .filter(b => b.textContent?.includes('Full') ||
675
- b.getAttribute('aria-label')?.includes('Full') ||
676
- b.getAttribute('aria-label')?.includes('fit page'));
677
- if (fullViewButtons.length > 0) {
678
- fullViewButtons[0].click();
679
- }
680
- }
681
- """)
682
-
683
- await page.wait_for_timeout(1000) # Wait for view to adjust
684
-
685
- while page_num <= max_tries:
686
- # Wait for the page to be fully loaded
687
- await page.wait_for_timeout(800)
688
-
689
- # Take a screenshot of the current page
690
- screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
691
-
692
- # Try different methods to identify and capture just the page content
693
- page_content = await page.query_selector('.drive-viewer-paginated-page')
694
- if page_content:
695
- # Found the specific page element
696
- await page_content.screenshot(path=screenshot_path)
697
- else:
698
- # Fall back to screenshot of visible viewport
699
- await page.screenshot(path=screenshot_path)
700
-
701
- screenshots.append(screenshot_path)
702
- logger.info(f"Captured page {page_num}")
703
-
704
- # Check if we have a disabled next button (reached the end)
705
- if next_button:
706
- is_disabled = await next_button.get_attribute('disabled')
707
- if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
708
- logger.info(f"Reached end of document after {page_num} pages")
709
- break
710
-
711
- # Click the next button
712
- try:
713
- await next_button.click()
714
- await page.wait_for_timeout(800) # Wait for page transition
715
- page_num += 1
716
- except Exception as e:
717
- logger.error(f"Error clicking next button: {e}")
718
- # Try to get a fresh reference to the button
719
- next_button = await page.query_selector('button[aria-label="Next page"]')
720
- if not next_button:
721
- logger.warning("Next button disappeared, assuming end of document")
722
- break
723
- else:
724
- # Try to find the next button again
725
- next_button = await page.query_selector('button[aria-label="Next page"]')
726
- if not next_button:
727
- logger.warning("Could not find next button, stopping navigation")
728
- break
729
-
730
- # Double-check if we've reached the expected total
731
- if page_num >= total_pages:
732
- logger.info(f"Reached expected total of {total_pages} pages")
733
- break
734
-
735
- # Combine screenshots into PDF
736
- logger.info(f"Creating PDF from {len(screenshots)} captured pages")
737
-
738
- from PIL import Image
739
- from reportlab.lib.pagesizes import letter
740
- from reportlab.pdfgen import canvas as pdf_canvas
741
-
742
- # Use the size of the first screenshot to set PDF dimensions
743
- if screenshots:
744
- try:
745
- img = Image.open(screenshots[0])
746
- width, height = img.size
747
-
748
- c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
749
-
750
- for screenshot in screenshots:
751
- try:
752
- if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
753
- img = Image.open(screenshot)
754
- c.drawImage(screenshot, 0, 0, width, height)
755
- c.showPage()
756
- except Exception as e:
757
- logger.error(f"Error adding page to PDF: {e}")
758
-
759
- c.save()
760
-
761
- # Clean up screenshots
762
- for screenshot in screenshots:
763
- if os.path.exists(screenshot):
764
- os.remove(screenshot)
765
-
766
- logger.info(f"Successfully created PDF with {len(screenshots)} pages")
767
- except Exception as e:
768
- logger.error(f"Error creating PDF: {e}")
769
- else:
770
- logger.error("No screenshots captured to create PDF")
771
- else:
772
- # Non-PDF file handling
773
- screenshot_path = os.path.join(temp_dir, "file.png")
774
- await page.screenshot(path=screenshot_path)
775
-
776
- if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
777
- # For document types, try to export directly
778
- await self.export_google_doc(file_id, file_type, save_path)
779
- else:
780
- # For other types, save the screenshot with appropriate extension
781
- shutil.copy(screenshot_path, save_path)
782
-
783
- os.remove(screenshot_path)
784
-
785
- # Clean up temp directory
786
- try:
787
- os.rmdir(temp_dir)
788
- except:
789
- pass
790
-
791
- # Close browser
792
- await browser.close()
793
-
794
- # Verify file exists and has content
795
- if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
796
- logger.info(f"Successfully downloaded file to {save_path}")
797
- return save_path
798
- else:
799
- logger.error(f"Generated file is too small or missing: {save_path}")
800
- return None
801
-
802
- except Exception as e:
803
- logger.error(f"Error during force download: {e}")
804
- if browser:
805
- await browser.close()
806
- return None
807
-
808
- except Exception as e:
809
- logger.error(f"Force download preparation failed: {e}")
810
- return None
811
-
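For reference, force_download_viewonly reads the Drive viewer's "current / total" page counter in a couple of places; isolated from the browser plumbing, that parsing is a single regex. A standalone sketch (the helper name parse_page_counter is illustrative, not part of app.py):

    import re

    def parse_page_counter(text: str):
        """Parse a Drive viewer counter such as '3 / 17' into (current, total), or None."""
        match = re.search(r'(\d+)\s*/\s*(\d+)', text)
        if not match:
            return None
        return int(match.group(1)), int(match.group(2))

    # parse_page_counter("3 / 17") -> (3, 17)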
812
- async def download_from_google_drive(self, url, save_path):
813
- """Enhanced method to download from Google Drive with multiple fallback approaches"""
814
- # Extract the file ID from different URL formats
815
- file_id = None
816
- url_patterns = [
817
- r'drive\.google\.com/file/d/([^/]+)',
818
- r'drive\.google\.com/open\?id=([^&]+)',
819
- r'docs\.google\.com/\w+/d/([^/]+)',
820
- r'id=([^&]+)',
821
- r'drive\.google\.com/uc\?id=([^&]+)',
822
- ]
823
-
824
- for pattern in url_patterns:
825
- match = re.search(pattern, url)
826
- if match:
827
- file_id = match.group(1)
828
- break
829
-
830
- if not file_id:
831
- logger.error(f"Could not extract file ID from URL: {url}")
832
- return False
833
-
834
- # Determine file type first (important for handling different file types)
835
- file_type, is_view_only = await self.get_google_drive_file_info(file_id)
836
- logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
837
-
838
- base, ext = os.path.splitext(save_path)
839
- if not ext and file_type:
840
- # Add the correct extension if missing
841
- save_path = f"{base}.{file_type}"
842
-
843
- # For view-only files, use specialized approaches
844
- if is_view_only:
845
- # Approach 1: For PDFs, use the JS method
846
- if file_type == 'pdf':
847
- success = await self.download_viewonly_pdf_with_js(file_id, save_path)
848
- if success:
849
- return True
850
-
851
- # Approach 2: For Google Docs, Sheets, etc., use export API
852
- if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
853
- success = await self.export_google_doc(file_id, file_type, save_path)
854
- if success:
855
- return True
856
-
857
- # Approach 3: Try the direct screenshot method for any view-only file
858
- success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
859
- if success:
860
- return True
861
-
862
- # Try standard approaches for non-view-only files
863
- try:
864
- # Try with gdown first
865
- import gdown
866
- output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
867
- if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
868
- with open(save_path, 'rb') as f:
869
- content = f.read(100) # Read first 100 bytes
870
- if b'<!DOCTYPE html>' not in content: # Check not HTML error page
871
- logger.info(f"Successfully downloaded with gdown: {url}")
872
- return True
873
- except Exception as e:
874
- logger.warning(f"gdown download failed: {e}")
875
-
876
- # Try with requests and session cookies
877
- try:
878
- session = requests.Session()
879
- session.headers.update({'User-Agent': get_random_user_agent()})
880
-
881
- # Visit the page first to get cookies
882
- session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
883
-
884
- # Try download
885
- url = f"https://drive.google.com/uc?id={file_id}&export=download"
886
- response = session.get(url, stream=True, timeout=30)
887
-
888
- # Check for confirmation token
889
- confirmation_token = None
890
- for k, v in response.cookies.items():
891
- if k.startswith('download_warning'):
892
- confirmation_token = v
893
- break
894
-
895
- # Use confirmation token if found
896
- if confirmation_token:
897
- url = f"{url}&confirm={confirmation_token}"
898
- response = session.get(url, stream=True, timeout=60)
899
-
900
- # Check if we're getting HTML instead of the file
901
- content_type = response.headers.get('Content-Type', '')
902
- if 'text/html' in content_type:
903
- logger.warning("Received HTML instead of file - likely download restriction")
904
- else:
905
- with open(save_path, 'wb') as f:
906
- for chunk in response.iter_content(chunk_size=1024*1024):
907
- if chunk:
908
- f.write(chunk)
909
-
910
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
911
- with open(save_path, 'rb') as f:
912
- content = f.read(100)
913
- if b'<!DOCTYPE html>' not in content:
914
- logger.info("Successfully downloaded with requests session")
915
- return True
916
- except Exception as e:
917
- logger.warning(f"Requests session download failed: {e}")
918
-
919
- logger.warning("Standard download methods failed")
920
- return False
921
-
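The file-ID extraction at the top of download_from_google_drive can be exercised on its own; a minimal synchronous sketch reusing the same regex patterns shown in the diff (the helper name extract_drive_file_id is illustrative, not part of app.py):

    import re

    # Candidate URL shapes, mirroring the patterns used in download_from_google_drive
    _DRIVE_ID_PATTERNS = [
        r'drive\.google\.com/file/d/([^/]+)',
        r'drive\.google\.com/open\?id=([^&]+)',
        r'docs\.google\.com/\w+/d/([^/]+)',
        r'id=([^&]+)',
        r'drive\.google\.com/uc\?id=([^&]+)',
    ]

    def extract_drive_file_id(url: str):
        """Return the Google Drive file ID embedded in url, or None if no pattern matches."""
        for pattern in _DRIVE_ID_PATTERNS:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    # extract_drive_file_id("https://drive.google.com/file/d/abc123/view") -> "abc123"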
922
- async def download_viewonly_pdf_with_js(self, file_id, save_path):
923
- """Improved method that replicates the manual process for downloading view-only PDFs"""
924
- try:
925
- # Create a fresh browser context with extended timeout
926
- browser = await self.playwright.chromium.launch(
927
- headless=True,
928
- args=[
929
- '--no-sandbox',
930
- '--disable-setuid-sandbox',
931
- '--disable-dev-shm-usage',
932
- '--disable-web-security'
933
- ]
934
- )
935
-
936
- # Use high DPI for better quality
937
- context = await browser.new_context(
938
- viewport={'width': 1600, 'height': 1200},
939
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
940
- device_scale_factor=2.0,
941
- timeout=120000 # Longer timeout
942
- )
943
-
944
- page = await context.new_page()
945
-
946
- try:
947
- logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
948
-
949
- # Step 1: Navigate to the PDF and wait for it to load fully
950
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
951
- await page.wait_for_load_state('networkidle')
952
- await page.wait_for_timeout(3000) # Additional wait for JavaScript to initialize
953
-
954
- # Check if we have a PDF viewer
955
- viewer_loaded = await page.query_selector('.drive-viewer-paginated-scrollable, .drive-viewer-paginated-page')
956
- if not viewer_loaded:
957
- logger.warning("PDF viewer not detected. This might not be a PDF or might be using a different viewer.")
958
- # Continue anyway, as it might just be a different CSS class
959
-
960
- # Step 2: Scroll through the entire document to ensure all pages are loaded
961
- logger.info("Scrolling through document to load all pages into cache...")
962
-
963
- # This is CRITICAL - scroll all the way down to ensure all pages are loaded and cached
964
- scroll_success = await page.evaluate("""
965
- async function scrollThroughEntireDocument() {
966
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
967
-
968
- // Try multiple container selectors that might exist in Google Drive
969
- const container = document.querySelector('.drive-viewer-paginated-scrollable') ||
970
- document.querySelector('.drive-viewer-container');
971
-
972
- if (!container) {
973
- console.log('No scroll container found');
974
- return false;
975
- }
976
-
977
- // Get total height to scroll
978
- const totalHeight = container.scrollHeight;
979
- const viewportHeight = container.clientHeight;
980
- console.log(`Document height: ${totalHeight}px, Viewport: ${viewportHeight}px`);
981
-
982
- // First scroll quickly to the bottom to trigger loading all content
983
- container.scrollTo(0, totalHeight);
984
- await delay(2000);
985
-
986
- // Then scroll gradually to ensure everything is properly loaded
987
- const scrollSteps = 20; // Number of steps to divide the scroll
988
- const stepSize = totalHeight / scrollSteps;
989
-
990
- // Scroll down in steps
991
- for (let i = 0; i < scrollSteps; i++) {
992
- const targetPos = i * stepSize;
993
- container.scrollTo(0, targetPos);
994
- console.log(`Scrolled to ${targetPos}px`);
995
- await delay(300); // Wait between scrolls
996
- }
997
-
998
- // Final scroll to the very bottom
999
- container.scrollTo(0, totalHeight);
1000
- await delay(1500);
1001
-
1002
- // Scroll back to top for PDF creation
1003
- container.scrollTo(0, 0);
1004
- await delay(1000);
1005
-
1006
- return true;
1007
- }
1008
- return scrollThroughEntireDocument();
1009
- """)
1010
-
1011
- if not scroll_success:
1012
- logger.warning("Scrolling may not have completed successfully. Will try to download anyway.")
1013
-
1014
- # Step 3: Wait to ensure all content is properly loaded after scrolling
1015
- await page.wait_for_timeout(2000)
1016
-
1017
- # Step 4: Execute the jsPDF script, similar to the manual process
1018
- logger.info("Executing jsPDF script to create and download PDF...")
1019
-
1020
- pdf_result = await page.evaluate("""
1021
- async function downloadPDFWithJsPDF() {
1022
- try {
1023
- // Create and load jsPDF script
1024
- return new Promise((resolve, reject) => {
1025
- let jspdf = document.createElement("script");
1026
- jspdf.onload = function() {
1027
- try {
1028
- // This is the core PDF creation logic
1029
- let pdf = new jsPDF();
1030
- let elements = document.getElementsByTagName("img");
1031
- let pageCount = 0;
1032
-
1033
- // First collect and sort the images
1034
- let validImages = [];
1035
- for (let i = 0; i < elements.length; i++) {
1036
- let img = elements[i];
1037
- // Only include blob images (PDF page images)
1038
- if (!/^blob:/.test(img.src)) {
1039
- continue;
1040
- }
1041
- // Exclude small images (usually icons)
1042
- if (img.width < 100 || img.height < 100) {
1043
- continue;
1044
- }
1045
- validImages.push(img);
1046
- }
1047
-
1048
- // Sort by position from top to bottom
1049
- validImages.sort((a, b) => {
1050
- let rectA = a.getBoundingClientRect();
1051
- let rectB = b.getBoundingClientRect();
1052
- return rectA.top - rectB.top;
1053
- });
1054
-
1055
- console.log(`Found ${validImages.length} valid page images`);
1056
- if (validImages.length === 0) {
1057
- reject("No valid PDF page images found");
1058
- return;
1059
- }
1060
-
1061
- // Process each image
1062
- for (let i = 0; i < validImages.length; i++) {
1063
- let img = validImages[i];
1064
-
1065
- // Create canvas and draw image
1066
- let canvasElement = document.createElement('canvas');
1067
- let con = canvasElement.getContext('2d');
1068
- canvasElement.width = img.width;
1069
- canvasElement.height = img.height;
1070
-
1071
- try {
1072
- // Draw the image to canvas
1073
- con.drawImage(img, 0, 0, img.width, img.height);
1074
-
1075
- // Convert to JPEG
1076
- let imgData = canvasElement.toDataURL("image/jpeg", 1.0);
1077
-
1078
- // Add a new page for each page after the first
1079
- if (pageCount > 0) {
1080
- pdf.addPage();
1081
- }
1082
-
1083
- // Add image to PDF
1084
- pdf.addImage(imgData, 'JPEG', 0, 0, pdf.internal.pageSize.getWidth(), pdf.internal.pageSize.getHeight());
1085
- pageCount++;
1086
- } catch (e) {
1087
- console.error("Error processing image:", e);
1088
- }
1089
- }
1090
-
1091
- if (pageCount === 0) {
1092
- reject("Failed to add any pages to PDF");
1093
- return;
1094
- }
1095
-
1096
- // Return PDF as data URL
1097
- let pdfOutput = pdf.output('datauristring');
1098
- resolve({
1099
- success: true,
1100
- data: pdfOutput,
1101
- pageCount: pageCount
1102
- });
1103
- } catch (e) {
1104
- console.error("Error in PDF creation:", e);
1105
- reject("Error creating PDF: " + e.message);
1106
- }
1107
- };
1108
-
1109
- jspdf.onerror = function() {
1110
- reject("Failed to load jsPDF library");
1111
- };
1112
-
1113
- // Use a reliable CDN for jsPDF
1114
- jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.3.2/jspdf.min.js';
1115
- document.body.appendChild(jspdf);
1116
- });
1117
- } catch (e) {
1118
- console.error("Overall error:", e);
1119
- return { success: false, error: e.message };
1120
- }
1121
- }
1122
-
1123
- return downloadPDFWithJsPDF();
1124
- """)
1125
-
1126
- # Step 5: Process the result
1127
- if not pdf_result or not isinstance(pdf_result, dict) or not pdf_result.get('success'):
1128
- error_msg = pdf_result.get('error') if isinstance(pdf_result, dict) else "Unknown error"
1129
- logger.error(f"Failed to create PDF: {error_msg}")
1130
- return False
1131
-
1132
- # Extract base64 data
1133
- pdf_data = pdf_result.get('data')
1134
- if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
1135
- logger.error("Invalid PDF data returned")
1136
- return False
1137
-
1138
- # Save the PDF
1139
- try:
1140
- base64_data = pdf_data.replace('data:application/pdf;base64,', '')
1141
- pdf_bytes = base64.b64decode(base64_data)
1142
-
1143
- with open(save_path, 'wb') as f:
1144
- f.write(pdf_bytes)
1145
-
1146
- page_count = pdf_result.get('pageCount', 0)
1147
- logger.info(f"Successfully saved PDF with {page_count} pages to {save_path}")
1148
-
1149
- # Verify file
1150
- if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
1151
- return True
1152
- else:
1153
- logger.error("Generated PDF file is too small or empty")
1154
- return False
1155
- except Exception as e:
1156
- logger.error(f"Error saving PDF file: {e}")
1157
- return False
1158
-
1159
- finally:
1160
- await browser.close()
1161
-
1162
- except Exception as e:
1163
- logger.error(f"Error in viewonly PDF download process: {e}")
1164
- return False
1165
-
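The jsPDF route above ultimately hands a 'data:application/pdf;base64,...' URI back from page.evaluate, so saving it reduces to a base64 decode. A sketch of just that final step (standalone helper, name is illustrative):

    import base64

    def save_pdf_data_uri(pdf_data: str, save_path: str) -> bool:
        """Persist a 'data:application/pdf;base64,...' string produced in the browser."""
        prefix = 'data:application/pdf;base64,'
        if not pdf_data.startswith(prefix):
            return False
        pdf_bytes = base64.b64decode(pdf_data[len(prefix):])
        with open(save_path, 'wb') as f:
            f.write(pdf_bytes)
        # Treat very small outputs as failures, as the method above does
        return len(pdf_bytes) > 1000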
1166
- async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
1167
- """Download any view-only file by taking screenshots"""
1168
- try:
1169
- async with self.context.new_page() as page:
1170
- # Set high-resolution viewport
1171
- await page.set_viewport_size({"width": 1600, "height": 1200})
1172
-
1173
- # Navigate to the file
1174
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
1175
-
1176
- # Make sure the file is loaded
1177
- await page.wait_for_load_state('networkidle')
1178
- await page.wait_for_timeout(3000) # Extra time for rendering
1179
-
1180
- # Create directory for screenshots if multiple pages
1181
- base_dir = os.path.dirname(save_path)
1182
- base_name = os.path.splitext(os.path.basename(save_path))[0]
1183
- screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
1184
- os.makedirs(screenshots_dir, exist_ok=True)
1185
-
1186
- # Check if it's a multi-page document
1187
- is_multi_page = await page.evaluate("""
1188
- () => {
1189
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1190
- return pages.length > 1;
1191
- }
1192
- """)
1193
-
1194
- if is_multi_page and file_type == 'pdf':
1195
- # For multi-page PDFs, take screenshots of each page
1196
- page_count = await page.evaluate("""
1197
- async () => {
1198
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
1199
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1200
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
1201
-
1202
- if (!container || pages.length === 0) return 0;
1203
-
1204
- // Scroll through to make sure all pages are loaded
1205
- const scrollHeight = container.scrollHeight;
1206
- const viewportHeight = container.clientHeight;
1207
- const scrollStep = viewportHeight;
1208
-
1209
- for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
1210
- container.scrollTo(0, scrollPos);
1211
- await delay(300);
1212
- }
1213
-
1214
- // Scroll back to top
1215
- container.scrollTo(0, 0);
1216
- await delay(300);
1217
-
1218
- return pages.length;
1219
- }
1220
- """)
1221
-
1222
- logger.info(f"Found {page_count} pages in document")
1223
-
1224
- # Take screenshots of each page
1225
- screenshots = []
1226
- for i in range(page_count):
1227
- # Scroll to page
1228
- await page.evaluate(f"""
1229
- async () => {{
1230
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
1231
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1232
- if (pages.length <= {i}) return false;
1233
-
1234
- pages[{i}].scrollIntoView();
1235
- await delay(500);
1236
- return true;
1237
- }}
1238
- """)
1239
-
1240
- # Take screenshot
1241
- screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
1242
- await page.screenshot(path=screenshot_path, clip={
1243
- 'x': 0,
1244
- 'y': 0,
1245
- 'width': 1600,
1246
- 'height': 1200
1247
- })
1248
- screenshots.append(screenshot_path)
1249
-
1250
- # Combine screenshots into PDF
1251
- from PIL import Image
1252
- from reportlab.pdfgen import canvas
1253
-
1254
- c = canvas.Canvas(save_path)
1255
- for screenshot in screenshots:
1256
- img = Image.open(screenshot)
1257
- width, height = img.size
1258
-
1259
- # Add page to PDF
1260
- c.setPageSize((width, height))
1261
- c.drawImage(screenshot, 0, 0, width, height)
1262
- c.showPage()
1263
-
1264
- c.save()
1265
-
1266
- # Clean up screenshots
1267
- for screenshot in screenshots:
1268
- os.remove(screenshot)
1269
- os.rmdir(screenshots_dir)
1270
-
1271
- return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1272
- else:
1273
- # For single-page or non-PDF files, just take one screenshot
1274
- screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
1275
- await page.screenshot(path=screenshot_path, fullPage=True)
1276
-
1277
- # Convert to requested format if needed
1278
- if file_type == 'pdf':
1279
- from PIL import Image
1280
- from reportlab.pdfgen import canvas
1281
-
1282
- # Create PDF from screenshot
1283
- img = Image.open(screenshot_path)
1284
- width, height = img.size
1285
-
1286
- c = canvas.Canvas(save_path, pagesize=(width, height))
1287
- c.drawImage(screenshot_path, 0, 0, width, height)
1288
- c.save()
1289
- else:
1290
- # Just copy the screenshot to the destination with proper extension
1291
- shutil.copy(screenshot_path, save_path)
1292
-
1293
- # Clean up
1294
- os.remove(screenshot_path)
1295
- os.rmdir(screenshots_dir)
1296
-
1297
- return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1298
-
1299
- except Exception as e:
1300
- logger.error(f"Error taking screenshots: {e}")
1301
- return False
1302
-
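The screenshot-to-PDF assembly above goes through reportlab; the same result can be had with Pillow alone. A sketch of that alternative (not what app.py does, just the equivalent idea):

    from PIL import Image

    def screenshots_to_pdf(screenshot_paths, save_path):
        """Merge page screenshots (PNG) into one PDF, one image per page."""
        images = [Image.open(p).convert('RGB') for p in screenshot_paths]
        if not images:
            return False
        first, rest = images[0], images[1:]
        # Pillow writes a multi-page PDF when save_all=True and append_images is given
        first.save(save_path, "PDF", save_all=True, append_images=rest)
        return True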
1303
- async def export_google_doc(self, file_id, file_type, save_path):
1304
- """Export Google Docs/Sheets/Slides to downloadable formats"""
1305
- try:
1306
- # Map file types to export formats
1307
- export_formats = {
1308
- 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
1309
- 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
1310
- 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
1311
- 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
1312
- 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
1313
- 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
1314
- 'pdf': 'application/pdf',
1315
- }
1316
-
1317
- export_format = export_formats.get(file_type, 'application/pdf')
1318
- export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
1319
-
1320
- if 'sheet' in file_type or 'xlsx' in file_type:
1321
- export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
1322
- elif 'ppt' in file_type or 'presentation' in file_type:
1323
- export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
1324
- elif file_type == 'pdf':
1325
- export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
1326
-
1327
- async with self.context.new_page() as page:
1328
- # Get cookies from the main view page first
1329
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
1330
-
1331
- # Now try the export
1332
- response = await page.goto(export_url, wait_until='networkidle')
1333
-
1334
- if response.status == 200:
1335
- content = await response.body()
1336
- with open(save_path, 'wb') as f:
1337
- f.write(content)
1338
- return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1339
- else:
1340
- logger.warning(f"Export failed with status {response.status}")
1341
- return False
1342
-
1343
- except Exception as e:
1344
- logger.error(f"Error exporting Google Doc: {e}")
1345
- return False
1346
-
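The export endpoints used above follow a simple per-product pattern; a compact sketch of just the URL construction, mirroring export_google_doc (the helper name is illustrative):

    def build_google_export_url(file_id: str, file_type: str) -> str:
        """Return the Docs/Sheets/Slides export URL for a given file type."""
        if file_type in ('sheet', 'xlsx'):
            return f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
        if file_type in ('ppt', 'pptx', 'presentation'):
            return f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
        # Documents (and the PDF fallback) go through the document endpoint
        fmt = 'pdf' if file_type == 'pdf' else file_type
        return f"https://docs.google.com/document/d/{file_id}/export?format={fmt}"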
1347
- async def get_google_drive_file_info(self, file_id):
1348
- """Get file type and view-only status from Google Drive"""
1349
- file_type = None
1350
- is_view_only = False
1351
-
1352
- try:
1353
- async with self.context.new_page() as page:
1354
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
1355
-
1356
- # Check if view-only
1357
- view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
1358
- is_view_only = view_only_text is not None
1359
-
1360
- # Check for Google Docs viewer
1361
- gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
1362
- gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
1363
- gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
1364
-
1365
- if gdocs_viewer:
1366
- file_type = 'docx'
1367
- elif gsheets_viewer:
1368
- file_type = 'xlsx'
1369
- elif gslides_viewer:
1370
- file_type = 'pptx'
1371
- else:
1372
- # Check for PDF viewer
1373
- pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
1374
- if pdf_viewer:
1375
- file_type = 'pdf'
1376
- else:
1377
- # Check for image viewer
1378
- img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
1379
- if img_viewer:
1380
- # Get image type from src
1381
- img_src = await img_viewer.get_attribute('src')
1382
- if 'jpg' in img_src or 'jpeg' in img_src:
1383
- file_type = 'jpg'
1384
- elif 'png' in img_src:
1385
- file_type = 'png'
1386
- else:
1387
- file_type = 'jpg' # Default to jpg
1388
- else:
1389
- # Generic file type fallback
1390
- file_type = 'pdf' # Default to PDF
1391
-
1392
- # If still no type, check filename
1393
- if not file_type:
1394
- title_element = await page.query_selector('div[role="heading"]')
1395
- if title_element:
1396
- title = await title_element.text_content()
1397
- if title:
1398
- ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
1399
- if ext_match:
1400
- file_type = ext_match.group(1).lower()
1401
-
1402
- except Exception as e:
1403
- logger.error(f"Error getting Google Drive file info: {e}")
1404
- file_type = 'pdf' # Default to PDF if we can't determine
1405
-
1406
- return file_type, is_view_only
1407
-
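The last-resort fallback in get_google_drive_file_info infers the type from the viewer's title; on its own, that check is a suffix regex. A small sketch assuming the title string is already in hand:

    import re

    def file_type_from_title(title: str, default: str = 'pdf') -> str:
        """Guess a file extension from a Drive viewer title like 'report.docx'."""
        match = re.search(r'\.([a-zA-Z0-9]+)$', title.strip())
        return match.group(1).lower() if match else default

    # file_type_from_title("exam_2021.PDF") -> "pdf"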
1408
- async def get_sublinks(self, url, limit=10000):
1409
- """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
1410
- links = set()
1411
- try:
1412
- logger.info(f"Fetching sublinks from: {url}")
1413
-
1414
- # Go to page and wait for full load
1415
- await self.page.goto(url, timeout=30000, wait_until='networkidle')
1416
-
1417
- # Get base URL for resolving relative links
1418
- parsed_base = urlparse(url)
1419
- base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1420
- path_base = os.path.dirname(parsed_base.path)
1421
-
1422
- # Check if page has ASP.NET elements which might need special handling
1423
- is_aspnet = await self.page.evaluate('''
1424
- () => {
1425
- return document.querySelector('form#aspnetForm') !== null ||
1426
- document.querySelector('input[name="__VIEWSTATE"]') !== null;
1427
- }
1428
- ''')
1429
-
1430
- if is_aspnet:
1431
- logger.info("Detected ASP.NET page, using enhanced extraction method")
1432
-
1433
- # Try to interact with ASP.NET controls that might reveal more links
1434
- # Look for dropdowns, buttons, and grid elements
1435
- dropdowns = await self.page.query_selector_all('select')
1436
- buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
1437
-
1438
- # Try interacting with dropdowns first
1439
- for dropdown in dropdowns:
1440
- try:
1441
- # Get all options
1442
- options = await self.page.evaluate('''
1443
- (dropdown) => {
1444
- return Array.from(dropdown.options).map(o => o.value);
1445
- }
1446
- ''', dropdown)
1447
-
1448
- # Try selecting each option
1449
- for option in options:
1450
- if option:
1451
- await dropdown.select_option(value=option)
1452
- await self.page.wait_for_timeout(1000)
1453
- await self.page.wait_for_load_state('networkidle', timeout=5000)
1454
-
1455
- # Extract any new links that appeared
1456
- await self.extract_all_link_types(links, base_url, path_base)
1457
- except Exception as e:
1458
- logger.warning(f"Error interacting with dropdown: {e}")
1459
-
1460
- # Try clicking buttons (but avoid dangerous ones like "delete")
1461
- safe_buttons = []
1462
- for button in buttons:
1463
- button_text = await button.text_content() or ""
1464
- button_value = await button.get_attribute("value") or ""
1465
- button_id = await button.get_attribute("id") or ""
1466
- combined_text = (button_text + button_value + button_id).lower()
1467
-
1468
- # Skip potentially destructive buttons
1469
- if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
1470
- continue
1471
-
1472
- # Prioritize buttons that might show more content
1473
- if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
1474
- safe_buttons.append(button)
1475
-
1476
- # Click the safe buttons
1477
- for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks
1478
- try:
1479
- await button.click()
1480
- await self.page.wait_for_timeout(1000)
1481
- await self.page.wait_for_load_state('networkidle', timeout=5000)
1482
-
1483
- # Extract any new links that appeared
1484
- await self.extract_all_link_types(links, base_url, path_base)
1485
- except Exception as e:
1486
- logger.warning(f"Error clicking button: {e}")
1487
-
1488
- # Extract links from the initial page state
1489
- await self.extract_all_link_types(links, base_url, path_base)
1490
-
1491
- # Look specifically for links inside grid/table views which are common in ASP.NET applications
1492
- grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
1493
- for cell in grid_cells:
1494
- try:
1495
- href = await cell.get_attribute('href')
1496
- if href:
1497
- full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1498
- links.add(full_url)
1499
- except Exception as e:
1500
- logger.warning(f"Error extracting grid link: {e}")
1501
-
1502
- # Extract links from onclick attributes and javascript:__doPostBack calls
1503
- postback_links = await self.page.evaluate('''
1504
- () => {
1505
- const results = [];
1506
- // Find elements with onclick containing __doPostBack
1507
- const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
1508
- for (const el of elements) {
1509
- // Extract the postback target
1510
- const onclick = el.getAttribute('onclick') || '';
1511
- const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
1512
- if (match && match[1]) {
1513
- // Get the visible text to use as description
1514
- const text = el.innerText || el.textContent || 'Link';
1515
- results.push({
1516
- id: match[1],
1517
- text: text.trim()
1518
- });
1519
- }
1520
- }
1521
- return results;
1522
- }
1523
- ''')
1524
-
1525
- # Try interacting with some of the postback links
1526
- for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions
1527
- try:
1528
- logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
1529
- await self.page.evaluate(f'''
1530
- () => {{
1531
- if (typeof __doPostBack === 'function') {{
1532
- __doPostBack('{postback["id"]}', '');
1533
- }}
1534
- }}
1535
- ''')
1536
- await self.page.wait_for_timeout(1500)
1537
- await self.page.wait_for_load_state('networkidle', timeout=5000)
1538
-
1539
- # Extract any new links that appeared
1540
- await self.extract_all_link_types(links, base_url, path_base)
1541
- except Exception as e:
1542
- logger.warning(f"Error with postback: {e}")
1543
-
1544
- logger.info(f"Found {len(links)} sublinks")
1545
- return list(links)[:limit]
1546
-
1547
- except Exception as e:
1548
- logger.error(f"Error getting sublinks from {url}: {e}")
1549
- return list(links)[:limit] # Return what we have so far
1550
-
1551
- async def extract_all_link_types(self, links_set, base_url, path_base):
1552
- """Extract all types of links from the current page"""
1553
- # Get all <a> tag links
1554
- a_links = await self.page.query_selector_all('a[href]')
1555
- for a in a_links:
1556
- try:
1557
- href = await a.get_attribute('href')
1558
- if href and not href.startswith('javascript:') and not href.startswith('#'):
1559
- full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1560
- links_set.add(full_url)
1561
- except Exception:
1562
- pass
1563
-
1564
- # Get iframe sources
1565
- iframes = await self.page.query_selector_all('iframe[src]')
1566
- for iframe in iframes:
1567
- try:
1568
- src = await iframe.get_attribute('src')
1569
- if src and not src.startswith('javascript:') and not src.startswith('about:'):
1570
- full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
1571
- links_set.add(full_url)
1572
- except Exception:
1573
- pass
1574
-
1575
- # Get links from onclick attributes that reference URLs
1576
- onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
1577
- for el in onclick_elements:
1578
- try:
1579
- onclick = await el.get_attribute('onclick')
1580
- urls = re.findall(r'(https?://[^\'"]+)', onclick)
1581
- for url in urls:
1582
- links_set.add(url)
1583
- except Exception:
1584
- pass
1585
-
1586
- # Look for URLs in data-* attributes
1587
- data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
1588
- for el in data_elements:
1589
- for attr in ['data-url', 'data-href', 'data-src']:
1590
- try:
1591
- value = await el.get_attribute(attr)
1592
- if value and not value.startswith('javascript:'):
1593
- full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
1594
- links_set.add(full_url)
1595
- except Exception:
1596
- pass
1597
-
1598
- # Look for special anchor links that might not have href attributes
1599
- special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
1600
- for anchor in special_anchors:
1601
- try:
1602
- href = await anchor.get_attribute('href')
1603
- if href and not href.startswith('javascript:') and not href.startswith('#'):
1604
- full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1605
- links_set.add(full_url)
1606
- except Exception:
1607
- pass
1608
-
1609
- def resolve_relative_url(self, relative_url, base_url, path_base):
1610
- """Properly resolve relative URLs considering multiple formats"""
1611
- if relative_url.startswith('/'):
1612
- # Absolute path relative to domain
1613
- return f"{base_url}{relative_url}"
1614
- elif relative_url.startswith('./'):
1615
- # Explicit relative path
1616
- return f"{base_url}{path_base}/{relative_url[2:]}"
1617
- elif relative_url.startswith('../'):
1618
- # Parent directory
1619
- parent_path = '/'.join(path_base.split('/')[:-1])
1620
- return f"{base_url}{parent_path}/{relative_url[3:]}"
1621
- else:
1622
- # Regular relative path
1623
- return f"{base_url}{path_base}/{relative_url}"
1624
-
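resolve_relative_url hand-rolls the path arithmetic; the standard library covers the same cases, including '../' chains. A sketch of the urljoin-based equivalent (an alternative, not what app.py ships):

    from urllib.parse import urljoin

    def resolve_relative_url(relative_url: str, page_url: str) -> str:
        """Resolve an href/src value against the page it was found on."""
        # urljoin handles '/abs', './x', '../x', and bare relative paths uniformly
        return urljoin(page_url, relative_url)

    # resolve_relative_url("../files/exam.pdf", "https://example.edu/dept/courses/index.html")
    # -> "https://example.edu/dept/files/exam.pdf"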
1625
- async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
1626
- if not custom_ext_list:
1627
- custom_ext_list = []
1628
- progress_text = st.empty()
1629
- progress_bar = st.progress(0)
1630
- file_count_text = st.empty()
1631
-
1632
  try:
1633
- progress_text.text("Analyzing main page...")
1634
- # Special handling for ASP.NET pages
1635
- is_aspnet = False
1636
- try:
1637
- await self.page.goto(url, timeout=30000, wait_until='networkidle')
1638
- is_aspnet = await self.page.evaluate('''
1639
- () => {
1640
- return document.querySelector('form#aspnetForm') !== null ||
1641
- document.querySelector('input[name="__VIEWSTATE"]') !== null;
1642
- }
1643
- ''')
1644
- except Exception:
1645
- pass
1646
 
1647
- # Extract files from main page
1648
- main_files = await self.extract_downloadable_files(url, custom_ext_list)
1649
- initial_count = len(main_files)
1650
- file_count_text.text(f"Found {initial_count} files on main page")
1651
 
1652
- # Get sublinks with enhanced method
1653
- progress_text.text("Getting sublinks...")
1654
- sublinks = await self.get_sublinks(url, sublink_limit)
1655
- total_links = len(sublinks)
1656
- progress_text.text(f"Found {total_links} sublinks to process")
1657
 
1658
- if not sublinks:
1659
- progress_bar.progress(1.0)
1660
- return main_files
1661
 
1662
- # Process each sublink
1663
- all_files = main_files
1664
- for i, sublink in enumerate(sublinks, 1):
1665
- progress = i / total_links
1666
- progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
1667
- progress_bar.progress(progress)
 
1668
 
1669
- try:
1670
- # Use a longer timeout for ASP.NET pages which can be slower
1671
- sub_timeout = timeout * 2 if is_aspnet else timeout
1672
-
1673
- # Extract files from sublink with appropriate timeout
1674
- async with async_timeout(sub_timeout):
1675
- sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
1676
- all_files.extend(sub_files)
1677
- file_count_text.text(f"Found {len(all_files)} total files")
1678
- except Exception as e:
1679
- logger.warning(f"Error processing sublink {sublink}: {e}")
1680
 
1681
- # Deduplicate files
1682
- seen_urls = set()
1683
- unique_files = []
1684
- for f in all_files:
1685
- if f['url'] not in seen_urls:
1686
- seen_urls.add(f['url'])
1687
- unique_files.append(f)
 
1688
 
1689
- final_count = len(unique_files)
1690
- progress_text.text(f"Deep search complete!")
1691
- file_count_text.text(f"Found {final_count} unique files")
1692
- progress_bar.progress(1.0)
1693
- return unique_files
1694
-
1695
- except Exception as e:
1696
- logger.error(f"Deep search error: {e}")
1697
- progress_text.text(f"Error during deep search: {str(e)}")
1698
- return []
1699
-
1700
- finally:
1701
- await asyncio.sleep(2)
1702
- if not st.session_state.get('keep_progress', False):
1703
- progress_text.empty()
1704
- progress_bar.empty()class DownloadManager:
1705
- def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
1706
- self.use_proxy = use_proxy
1707
- self.proxy = proxy
1708
- self.query = query
1709
- self.num_results = num_results
1710
- self.playwright = None
1711
- self.browser = None
1712
- self.context = None
1713
- self.page = None
1714
-
1715
- async def __aenter__(self):
1716
- self.playwright = await async_playwright().start()
1717
- opts = {
1718
- "headless": True,
1719
- "args": [
1720
- '--no-sandbox',
1721
- '--disable-setuid-sandbox',
1722
- '--disable-dev-shm-usage',
1723
- '--disable-gpu',
1724
- '--no-zygote',
1725
- '--single-process'
1726
- ]
1727
- }
1728
- if self.use_proxy and self.proxy:
1729
- opts["proxy"] = {"server": self.proxy}
1730
- self.browser = await self.playwright.chromium.launch(**opts)
1731
- self.context = await self.browser.new_context(user_agent=get_random_user_agent())
1732
- self.page = await self.context.new_page()
1733
- await self.page.set_extra_http_headers({
1734
- 'Accept-Language': 'en-US,en;q=0.9',
1735
- 'Accept-Encoding': 'gzip, deflate, br',
1736
- 'Referer': 'https://www.bing.com/'
1737
- })
1738
- return self
1739
-
1740
- async def __aexit__(self, exc_type, exc_val, exc_tb):
1741
- if self.browser:
1742
- await self.browser.close()
1743
- if self.playwright:
1744
- await self.playwright.stop()
1745
-
1746
- async def search_bing(self):
1747
- urls = []
1748
- try:
1749
- search_url = f"https://www.bing.com/search?q={self.query}"
1750
- await self.page.goto(search_url, timeout=30000)
1751
- await self.page.wait_for_load_state('networkidle')
1752
- links = await self.page.query_selector_all("li.b_algo h2 a")
1753
- for link in links[:self.num_results]:
1754
- href = await link.get_attribute('href')
1755
- if href:
1756
- urls.append(href)
1757
- return urls
1758
  except Exception as e:
1759
- logger.error(f"Error searching Bing: {e}")
1760
  return []
1761
 
1762
- async def get_file_size(self, url):
1763
- try:
1764
- async with self.context.new_page() as page:
1765
- response = await page.request.head(url, timeout=15000)
1766
- length = response.headers.get('Content-Length', None)
1767
- if length:
1768
- return sizeof_fmt(int(length))
1769
- else:
1770
- return "Unknown Size"
1771
- except Exception:
1772
- return "Unknown Size"
1773
-
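get_file_size leans on a sizeof_fmt helper defined elsewhere in app.py; the conventional implementation of that helper looks like this (a sketch of the usual recipe, not necessarily the exact code in the file):

    def sizeof_fmt(num: float, suffix: str = "B") -> str:
        """Render a byte count as a human-readable string, e.g. 2048 -> '2.0 KiB'."""
        for unit in ("", "Ki", "Mi", "Gi", "Ti"):
            if abs(num) < 1024.0:
                return f"{num:.1f} {unit}{suffix}"
            num /= 1024.0
        return f"{num:.1f} Pi{suffix}"

    # sizeof_fmt(1536) -> '1.5 KiB'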
1774
- async def get_pdf_metadata(self, url):
1775
- try:
1776
- async with self.context.new_page() as page:
1777
- resp = await page.request.get(url, timeout=15000)
1778
- if resp.ok:
1779
- content = await resp.body()
1780
- pdf = BytesIO(content)
1781
- reader = PdfReader(pdf)
1782
- return {
1783
- 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
1784
- 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
1785
- 'Pages': len(reader.pages),
1786
- }
1787
- else:
1788
- return {}
1789
- except Exception:
1790
- return {}
1791
-
1792
- async def extract_real_download_url(self, url):
1793
- try:
1794
- async with self.context.new_page() as page:
1795
- response = await page.goto(url, wait_until='networkidle', timeout=30000)
1796
- if response and response.headers.get('location'):
1797
- return response.headers['location']
1798
- return page.url
1799
- except Exception as e:
1800
- logger.error(f"Error extracting real download URL: {e}")
1801
- return url
1802
-
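The removed block above is a second, duplicated DownloadManager definition; the class defined earlier in app.py, with the same constructor and methods, is the one that remains in use. For orientation, a minimal usage sketch of that remaining class as an async context manager; this is an illustrative example rather than code from the commit, and the URL is a placeholder:

# Illustrative usage sketch (placeholder URL; relies on the DownloadManager kept in app.py).
import asyncio

async def demo():
    async with DownloadManager(use_proxy=False, query="past exam papers", num_results=5) as dm:
        files = await dm.extract_downloadable_files("https://example.com", custom_ext_list=[])
        for f in files:
            print(f["filename"], f["size"], f["url"])

# asyncio.run(demo())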
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []
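The 'if not response' guard above matters because Playwright's page.goto() can return None (for example on anchor-only navigations), leaving no response object to inspect. A self-contained sketch of the same pattern using only Playwright's public async API; the function name and URL are illustrative:

# Standalone sketch of the navigation guard (illustrative names; requires playwright to be installed).
import asyncio
from playwright.async_api import async_playwright

async def fetch_html(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # goto() may return None (e.g. anchor-only navigations), so check before using it.
            response = await page.goto(url, timeout=30000, wait_until="networkidle")
            if not response or not response.ok:
                return None
            return await page.content()
        finally:
            await browser.close()

# asyncio.run(fetch_html("https://example.com"))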
@@ -2862,7 +1485,20 @@ class DownloadManager:
         try:
             logger.info(f"Fetching sublinks from: {url}")
 
-            # Go to page and wait for full load
             await self.page.goto(url, timeout=30000, wait_until='networkidle')
 
             # Get base URL for resolving relative links
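The 'Get base URL' comment above refers to the standard urllib.parse pattern used throughout this file to resolve relative hrefs; a small self-contained illustration with example values:

# Illustrative only: deriving a base URL and resolving relative hrefs (example values).
from urllib.parse import urljoin, urlparse

page_url = "https://example.edu/exams/list.php?year=2024"   # hypothetical page
parsed = urlparse(page_url)
base_url = f"{parsed.scheme}://{parsed.netloc}"              # "https://example.edu"

print(urljoin(page_url, "paper1.pdf"))        # https://example.edu/exams/paper1.pdf
print(urljoin(base_url, "/docs/paper2.pdf"))  # https://example.edu/docs/paper2.pdf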
@@ -3152,7 +1788,7 @@ class DownloadManager:
             await asyncio.sleep(2)
             if not st.session_state.get('keep_progress', False):
                 progress_text.empty()
-                progress_bar.empty()
 # Utility Functions for New Features
 def extract_keywords(text, n=5):
     doc = nlp_model(text)
             logger.error(f"Error extracting real download URL: {e}")
             return url
 
+    async def get_edu_exam_links(self, url):
+        """Specialized method for educational exam websites that follow a common pattern."""
         try:
+            logger.info(f"Fetching exam links from {url}")
+            links = set()
+
+            # Use requests for a faster initial scan
+            import requests
+            from bs4 import BeautifulSoup
+            from urllib.parse import urljoin, urlparse
+
+            headers = {"User-Agent": get_random_user_agent()}
+            response = requests.get(url, headers=headers, timeout=30)
+
+            if response.status_code != 200:
+                logger.warning(f"Failed to fetch page: {response.status_code}")
+                return []
+
+            # Parse with BeautifulSoup first for efficiency
+            soup = BeautifulSoup(response.text, "html.parser")
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            # Look for all links
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                full_url = urljoin(url, href)
+
+                # Special patterns for exam sites
+                for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+                                "/test/", "/download/", "/files/", "/assignments/"]:
+                    if pattern in full_url.lower():
+                        links.add(full_url)
+                        break
+
+            # If we didn't find many links with direct approach, use Playwright for more thorough extraction
+            if len(links) < 5:
+                logger.info("Using browser for enhanced link extraction")
+                await self.page.goto(url, timeout=30000, wait_until='networkidle')
+
+                # Check for ASP.NET specific elements that might contain exam links
+                grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
+                if grid_elements:
+                    for grid in grid_elements:
+                        grid_links = await grid.query_selector_all('a[href]')
+                        for a in grid_links:
+                            href = await a.get_attribute('href')
+                            if href:
+                                full_url = href if href.startswith('http') else urljoin(url, href)
+                                links.add(full_url)
+
+                # Try clicking any controls that might reveal more exam links
+                show_buttons = await self.page.query_selector_all('input[type="button"], button')
+                for button in show_buttons:
+                    button_text = await button.text_content() or ""
+                    button_value = await button.get_attribute("value") or ""
+                    if any(keyword in (button_text + button_value).lower() for keyword in
+                           ["show", "view", "display", "list", "exam", "paper", "test"]):
+                        try:
+                            await button.click()
+                            await self.page.wait_for_timeout(1000)
+                            await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                            # Get any new links that appeared
+                            new_links = await self.page.query_selector_all('a[href]')
+                            for a in new_links:
+                                href = await a.get_attribute('href')
+                                if href:
+                                    full_url = href if href.startswith('http') else urljoin(url, href)
+                                    links.add(full_url)
+                        except Exception as e:
+                            logger.warning(f"Error clicking button: {e}")
+
+            # Filter links to likely contain exam documents
+            filtered_links = []
+            for link in links:
+                # Common file extensions for exam documents
+                if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']):
+                    filtered_links.append(link)
+                    continue
+
+                # Common paths for exam documents
+                if any(pattern in link.lower() for pattern in [
+                    "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
+                    "/pastpapers/", "/questionpapers/", "/tests/"
+                ]):
+                    filtered_links.append(link)
+
+            logger.info(f"Found {len(filtered_links)} potential exam document links")
+            return filtered_links
 
         except Exception as e:
+            logger.error(f"Error getting exam links: {e}")
             return []
 
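A short illustration of how the new get_edu_exam_links helper is meant to be called; the URL is a placeholder, and the call assumes the DownloadManager context so that self.page exists:

# Illustrative call of the new helper (placeholder URL).
import asyncio

async def list_exam_links():
    async with DownloadManager() as dm:
        links = await dm.get_edu_exam_links("https://example.edu/eduexp/docs/pastexam.php")
        for link in links:
            print(link)

# asyncio.run(list_exam_links())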
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
+            # Special handling for educational exam sites
+            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
+                                                       ["exam", "test", "pastpaper", "eduexp"]):
+                logger.info("Using specialized handler for educational exam site")
+
+                # Get direct links to exam files
+                exam_links = await self.get_edu_exam_links(url)
+
+                for link in exam_links:
+                    # Try to resolve any redirection
+                    real_url = await self.extract_real_download_url(link)
+                    filename = os.path.basename(urlparse(real_url).path)
+
+                    # If filename is URL encoded (common with Chinese/international sites)
+                    if '%' in filename:
+                        try:
+                            from urllib.parse import unquote
+                            filename = unquote(filename)
+                        except Exception:
+                            pass
+
+                    # Get file size
+                    size_str = await self.get_file_size(real_url)
+
+                    # Get metadata for PDFs
+                    meta = {}
+                    if real_url.lower().endswith('.pdf'):
+                        try:
+                            meta = await self.get_pdf_metadata(real_url)
+                        except Exception:
+                            pass
+
+                    found_files.append({
+                        'url': real_url,
+                        'filename': filename,
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                # If we found exam files with the specialized method, return them
+                if found_files:
+                    return found_files
+
+            # Standard extraction method if specialized method didn't find files
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []
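The unquote() call in the block above is what turns percent-encoded filenames from international sites back into readable names. A tiny standalone illustration with a made-up encoded value:

# Illustrative only: decoding a percent-encoded filename (sample value, not taken from a real site).
from urllib.parse import unquote

encoded = "%E6%9C%9F%E6%9C%AB%E8%80%83.pdf"
print(unquote(encoded))  # -> 期末考.pdf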
 
         try:
             logger.info(f"Fetching sublinks from: {url}")
 
+            # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
+            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
+                                                       ["exam", "test", "pastpaper", "eduexp"]):
+                logger.info("Using specialized exam site sublink extraction")
+                edu_links = await self.get_edu_exam_links(url)
+                for link in edu_links:
+                    links.add(link)
+
+                # If we found a good number of links with the specialized method, return them
+                if len(links) > 5:
+                    logger.info(f"Found {len(links)} sublinks with specialized method")
+                    return list(links)[:limit]
+
+            # Standard sublink extraction for all sites
             await self.page.goto(url, timeout=30000, wait_until='networkidle')
 
             # Get base URL for resolving relative links
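The same exam-site keyword check now appears in both extract_downloadable_files and the sublink extraction above. A small predicate could factor it out; the sketch below is a hypothetical refactor, and is_edu_exam_site is not a function in this commit:

# Hypothetical refactor sketch; is_edu_exam_site does not exist in app.py.
EXAM_KEYWORDS = ("exam", "test", "pastpaper", "eduexp")

def is_edu_exam_site(url):
    """True for phsms.cloud.ncnu.edu.tw or any URL containing an exam-related keyword."""
    lowered = url.lower()
    return "phsms.cloud.ncnu.edu.tw" in lowered or any(k in lowered for k in EXAM_KEYWORDS)

# is_edu_exam_site("https://phsms.cloud.ncnu.edu.tw/eduexp/docs/pastexam.php")  -> True
# is_edu_exam_site("https://example.com/blog")                                  -> False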
 
             await asyncio.sleep(2)
             if not st.session_state.get('keep_progress', False):
                 progress_text.empty()
+                progress_bar.empty()
 # Utility Functions for New Features
 def extract_keywords(text, n=5):
     doc = nlp_model(text)
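extract_keywords() is shown only up to its first line here because the diff context is truncated. For readers unfamiliar with the pattern, a minimal sketch of what a spaCy-based keyword helper with this signature might look like, assuming nlp_model is a loaded spaCy pipeline; this is not the implementation in app.py:

# Illustrative sketch only; assumes something like nlp_model = spacy.load("en_core_web_sm").
from collections import Counter

def extract_keywords_sketch(text, n=5):
    doc = nlp_model(text)
    candidates = [
        token.lemma_.lower()
        for token in doc
        if token.pos_ in ("NOUN", "PROPN") and token.is_alpha and not token.is_stop
    ]
    return [word for word, _ in Counter(candidates).most_common(n)]

# extract_keywords_sketch("Past exam papers for the physics midterm and final exam", n=3)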