euler314 committed on
Commit
b9d5bbe
·
verified ·
1 Parent(s): dca120b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -300
app.py CHANGED
@@ -442,377 +442,326 @@ class DownloadManager:
442
  return None
443
 
444
  async def force_download_viewonly(self, file_info, save_path):
445
- """Last-resort method to download view-only Google Drive files - improved for multi-page PDFs"""
446
  try:
447
- # Extract file ID from URL
448
- file_id = None
449
- url = file_info['url']
450
- for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
451
- match = re.search(pattern, url)
452
- if match:
453
- file_id = match.group(1)
454
- break
 
455
 
456
  if not file_id:
457
  logger.error("Could not extract file ID")
458
  return None
459
 
460
- logger.info(f"Force downloading view-only file with ID: {file_id}")
461
-
462
- # Make sure we have the proper file extension
463
  base, ext = os.path.splitext(save_path)
464
  if not ext:
465
- # Determine file type from metadata or set default to PDF
466
- file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
467
  save_path = f"{base}.{file_type}"
468
 
469
- # Launch a new browser context with higher resolution
 
 
470
  browser = await self.playwright.chromium.launch(
471
  headless=True,
472
- args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security']
 
 
 
 
 
 
 
473
  )
474
 
475
- # Use a larger viewport for better quality
476
  context = await browser.new_context(
477
- viewport={'width': 1920, 'height': 1080},
478
- user_agent=get_random_user_agent(),
479
- device_scale_factor=2.0 # Higher resolution for better quality
480
  )
481
 
482
  page = await context.new_page()
483
 
484
- # Navigate to the file
485
  try:
486
- logger.info(f"Opening view-only file: https://drive.google.com/file/d/{file_id}/view")
487
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
488
- wait_until='networkidle',
489
- timeout=90000) # Longer timeout for large PDFs
490
-
491
- # Wait for content to load fully
492
- await page.wait_for_timeout(5000)
493
-
494
- # Detect if it's a PDF
495
- is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
496
 
497
- if is_pdf:
498
- # For PDFs: Multi-page capture approach
499
- logger.info("Detected PDF, using multi-page capture approach")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
- # First, try to find the viewer container
502
- viewer_container = await page.query_selector('.drive-viewer-paginated-scrollable')
503
 
504
- if not viewer_container:
505
- logger.warning("Could not find standard PDF viewer container, trying alternatives")
506
- viewer_container = await page.query_selector('.drive-viewer-content') or \
507
- await page.query_selector('#drive-pdf-viewer') or \
508
- await page.query_selector('.drive-viewer')
 
 
 
 
509
 
510
- if not viewer_container:
511
- # Take a single screenshot as fallback
512
- logger.warning("Could not find any PDF viewer container, using fallback")
513
- screenshot_path = os.path.join(tempfile.gettempdir(), "gdrive_pdf_fallback.png")
514
- await page.screenshot(path=screenshot_path, full_page=True)
 
 
 
 
 
 
 
 
 
 
515
 
516
  # Convert to PDF
517
  from PIL import Image
518
  from reportlab.pdfgen import canvas as pdf_canvas
 
519
  img = Image.open(screenshot_path)
520
  width, height = img.size
521
  c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
522
  c.drawImage(screenshot_path, 0, 0, width, height)
523
  c.save()
 
524
  os.remove(screenshot_path)
525
- return save_path
 
 
 
 
526
 
527
- # Scroll through to load all pages first
528
- logger.info("Pre-loading all PDF pages...")
529
- await page.evaluate("""
530
- async function preloadAllPages() {
531
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
532
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
533
- if (!container) return;
534
-
535
- // Scroll to bottom to force all pages to load
536
- const initialScroll = container.scrollTop;
537
- container.scrollTo(0, container.scrollHeight);
538
- await delay(3000); // Wait for loading
539
-
540
- // Scroll back to top
541
- container.scrollTo(0, 0);
542
- await delay(1000);
543
- }
544
- return preloadAllPages();
545
- """)
546
 
547
- # Count visible pages - critical step that needs to be fixed
548
- page_count = await page.evaluate("""
549
- () => {
550
- // Try multiple selectors for pages
551
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
552
- if (pages.length > 0) return pages.length;
553
-
554
- // Alternative selectors if standard one fails
555
- const altPages = document.querySelectorAll('.drive-viewer-page');
556
- if (altPages.length > 0) return altPages.length;
557
-
558
- // Try to find page numbers in navigation
559
- const pageNav = document.querySelector('.drive-viewer-paginated-counter');
560
- if (pageNav) {
561
- const text = pageNav.textContent || '';
562
- const match = text.match(/(\d+)\s*\/\s*(\d+)/);
563
- if (match && match[2]) return parseInt(match[2]);
564
- }
565
-
566
- return 0; // Fallback
567
  }
 
 
568
  """)
569
 
570
- # If no pages found but we know it's a PDF, manually check for page counter
571
- if page_count == 0:
572
- # Try to find the page counter text and extract total pages
573
- page_counter_text = await page.evaluate("""
574
- () => {
575
- const elements = Array.from(document.querySelectorAll('*'));
576
- for (const el of elements) {
577
- const text = el.textContent || '';
578
- if (text.match(/\d+\s*\/\s*\d+/)) return text;
579
- }
580
- return '';
581
- }
582
- """)
583
 
584
- if page_counter_text:
585
- match = re.search(r'(\d+)\s*\/\s*(\d+)', page_counter_text)
586
- if match and match.group(2):
587
- page_count = int(match.group(2))
588
- logger.info(f"Detected {page_count} pages from page counter")
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
- # If we still have no page count, default to a reasonable number
591
- if page_count == 0:
592
- logger.warning("Could not detect page count, defaulting to 50 pages to be safe")
593
- page_count = 50 # Try to capture up to 50 pages by default
 
594
 
595
- logger.info(f"Found {page_count} pages in PDF")
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
- # Create a temporary directory for screenshots
598
- temp_dir = tempfile.mkdtemp()
599
- screenshots = []
600
 
601
- # Function to scroll to a specific page and take a screenshot
602
- async def capture_page(page_num):
603
- # Scroll to the page
604
- success = await page.evaluate(f"""
605
- async function scrollToPage(pageNum) {{
606
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
607
-
608
- // Try multiple selectors for pages
609
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
610
- if (pages.length > 0 && pageNum < pages.length) {{
611
- pages[pageNum].scrollIntoView({{behavior: 'instant', block: 'center'}});
612
- await delay(500);
613
- return true;
614
- }}
615
-
616
- // Alternative: try to use page navigation buttons
617
- const pageInput = document.querySelector('input[aria-label="Page"]');
618
- if (pageInput) {{
619
- // Set page number in input
620
- const nativeInputValueSetter = Object.getOwnPropertyDescriptor(window.HTMLInputElement.prototype, "value").set;
621
- nativeInputValueSetter.call(pageInput, {page_num + 1});
622
-
623
- // Dispatch events
624
- const ev1 = new Event('input', {{ bubbles: true }});
625
- const ev2 = new Event('change', {{ bubbles: true }});
626
- pageInput.dispatchEvent(ev1);
627
- pageInput.dispatchEvent(ev2);
628
-
629
- // Press Enter to navigate
630
- const keyEvent = new KeyboardEvent('keydown', {{
631
- key: 'Enter',
632
- code: 'Enter',
633
- keyCode: 13,
634
- which: 13,
635
- bubbles: true
636
- }});
637
- pageInput.dispatchEvent(keyEvent);
638
-
639
- await delay(1000); // Wait for navigation
640
- return true;
641
- }}
642
-
643
- // Alternative: use page selector dropdown if available
644
- const pageSelector = document.querySelector('.drive-viewer-paginated-page-selector');
645
- if (pageSelector) {{
646
- pageSelector.click();
647
- await delay(300);
648
-
649
- // Find and click the specific page option
650
- const options = document.querySelectorAll('.drive-viewer-paginated-page-selector-option');
651
- if (options.length > pageNum) {{
652
- options[pageNum].click();
653
- await delay(1000);
654
- return true;
655
- }}
656
- }}
657
-
658
- return false;
659
- }}
660
- return scrollToPage({page_num});
661
- """)
662
-
663
- if not success:
664
- # Alternative: Try using the page navigation buttons
665
- logger.info(f"Using alternative navigation for page {page_num + 1}")
666
-
667
- # Find navigation buttons
668
- next_button = await page.query_selector('button[aria-label="Next page"]')
669
- prev_button = await page.query_selector('button[aria-label="Previous page"]')
670
-
671
- # If we're not on the first page, go back to first page
672
- if page_num == 0 and prev_button:
673
- for _ in range(50): # Limit to avoid infinite loop
674
- is_disabled = await prev_button.get_attribute('disabled')
675
- if is_disabled:
676
- break
677
- await prev_button.click()
678
- await page.wait_for_timeout(300)
679
-
680
- # Now navigate forward to desired page
681
- if page_num > 0 and next_button:
682
- for _ in range(page_num):
683
- await next_button.click()
684
- await page.wait_for_timeout(500)
685
-
686
- # Wait for the page content to load
687
- await page.wait_for_timeout(1000)
688
 
689
- # Wait for page to stabilize
690
- await page.wait_for_timeout(500)
691
 
692
- # Take the screenshot
693
- screenshot_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
 
 
 
 
 
 
694
 
695
- # Determine what to screenshot based on the viewer
696
- current_page_element = await page.evaluate("""
697
- () => {
698
- // First try getting the current visible page
699
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
700
- for (const page of pages) {
701
- const rect = page.getBoundingClientRect();
702
- if (rect.top < window.innerHeight && rect.bottom > 0) {
703
- return {
704
- x: Math.max(0, rect.left),
705
- y: Math.max(0, rect.top),
706
- width: Math.min(window.innerWidth, rect.width),
707
- height: Math.min(window.innerHeight, rect.bottom - rect.top)
708
- };
709
- }
710
- }
711
-
712
- // Fallback: try to find the container
713
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
714
- if (container) {
715
- const rect = container.getBoundingClientRect();
716
- return {
717
- x: Math.max(0, rect.left),
718
- y: Math.max(0, rect.top),
719
- width: Math.min(window.innerWidth, rect.width),
720
- height: Math.min(window.innerHeight, rect.bottom - rect.top)
721
- };
722
- }
723
-
724
- // Last resort: screenshot the visible area
725
- return null;
726
- }
727
- """)
728
 
729
- if current_page_element:
730
- # Screenshot the specific page element
731
- await page.screenshot(path=screenshot_path, clip=current_page_element)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
732
  else:
733
- # Screenshot the entire visible area
734
- await page.screenshot(path=screenshot_path)
 
 
 
735
 
736
- return screenshot_path
 
 
 
737
 
738
- # Capture all pages
739
- for i in range(page_count):
740
- logger.info(f"Capturing page {i+1} of {page_count}")
741
- screenshot_path = await capture_page(i)
742
- screenshots.append(screenshot_path)
743
- # Add progress indicator
744
- if (i+1) % 5 == 0 or i+1 == page_count:
745
- logger.info(f"Progress: {i+1}/{page_count} pages captured")
746
 
747
- # Combine screenshots into a PDF
748
  from PIL import Image
749
  from reportlab.lib.pagesizes import letter
750
  from reportlab.pdfgen import canvas as pdf_canvas
751
 
752
- logger.info(f"Combining {len(screenshots)} screenshots into PDF")
753
-
754
- # Use the first image dimensions to set PDF size if available
755
  if screenshots:
756
- img = Image.open(screenshots[0])
757
- img_width, img_height = img.size
758
- c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
759
-
760
- for screenshot in screenshots:
761
- # Check if file exists and has content
762
- if os.path.exists(screenshot) and os.path.getsize(screenshot) > 0:
763
- img = Image.open(screenshot)
764
- c.drawImage(screenshot, 0, 0, img_width, img_height)
765
- c.showPage()
766
-
767
- c.save()
768
-
769
- # Clean up screenshots
770
- for screenshot in screenshots:
771
- if os.path.exists(screenshot):
772
- os.remove(screenshot)
773
- os.rmdir(temp_dir)
774
-
775
- # Verify the PDF was created successfully
776
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
777
- logger.info(f"Successfully created PDF with {len(screenshots)} pages")
778
- return save_path
 
 
779
  else:
780
- logger.error("Failed to create PDF from screenshots")
781
- return None
782
  else:
783
- # For non-PDF files: take a single screenshot
784
- logger.info("Non-PDF file detected, taking single screenshot")
785
- screenshot_path = os.path.join(tempfile.gettempdir(), "screenshot.png")
786
- await page.screenshot(path=screenshot_path, full_page=True)
787
 
788
- # Convert to requested format if needed
789
- if save_path.lower().endswith('.pdf'):
790
- # Convert to PDF
791
- from PIL import Image
792
- from reportlab.pdfgen import canvas as pdf_canvas
793
- img = Image.open(screenshot_path)
794
- width, height = img.size
795
- c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
796
- c.drawImage(screenshot_path, 0, 0, width, height)
797
- c.save()
798
  else:
799
- # Just copy the screenshot with the appropriate extension
800
  shutil.copy(screenshot_path, save_path)
801
 
802
- # Clean up
803
  os.remove(screenshot_path)
804
 
 
 
 
 
 
 
805
  # Close browser
806
  await browser.close()
807
 
808
- # Verify file exists and is not empty
809
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
810
  logger.info(f"Successfully downloaded file to {save_path}")
811
  return save_path
812
  else:
813
- logger.error(f"Failed to create valid file at {save_path}")
814
  return None
815
-
816
  except Exception as e:
817
  logger.error(f"Error during force download: {e}")
818
  if browser:
@@ -820,7 +769,7 @@ class DownloadManager:
820
  return None
821
 
822
  except Exception as e:
823
- logger.error(f"Force download failed: {e}")
824
  return None
825
 
826
  async def download_from_google_drive(self, url, save_path):
 
442
  return None
443
 
444
  async def force_download_viewonly(self, file_info, save_path):
445
+ """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
446
  try:
447
+ # Extract file ID
448
+ file_id = file_info.get('metadata', {}).get('file_id')
449
+ if not file_id:
450
+ url = file_info['url']
451
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
452
+ match = re.search(pattern, url)
453
+ if match:
454
+ file_id = match.group(1)
455
+ break
456
 
457
  if not file_id:
458
  logger.error("Could not extract file ID")
459
  return None
460
 
461
+ file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
 
 
462
  base, ext = os.path.splitext(save_path)
463
  if not ext:
 
 
464
  save_path = f"{base}.{file_type}"
465
 
466
+ logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
467
+
468
+ # Create a dedicated browser instance with better resolution
469
  browser = await self.playwright.chromium.launch(
470
  headless=True,
471
+ args=[
472
+ '--no-sandbox',
473
+ '--disable-setuid-sandbox',
474
+ '--disable-dev-shm-usage',
475
+ '--disable-web-security',
476
+ '--disable-features=IsolateOrigins,site-per-process',
477
+ '--disable-site-isolation-trials'
478
+ ]
479
  )
480
 
481
+ # Use higher resolution for better quality
482
  context = await browser.new_context(
483
+ viewport={'width': 1600, 'height': 1200},
484
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
485
+ device_scale_factor=2.0
486
  )
487
 
488
  page = await context.new_page()
489
 
 
490
  try:
491
+ # Go to the file view page
492
+ logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
493
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
494
+ await page.wait_for_load_state('networkidle')
495
+ await page.wait_for_timeout(5000) # Wait longer for everything to load
496
+
497
+ # Create temp directory
498
+ temp_dir = tempfile.mkdtemp()
 
 
499
 
500
+ # Special handling for PDFs
501
+ if file_type.lower() == 'pdf':
502
+ # Check if there's a pagination control
503
+ pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
504
+
505
+ # Try multiple methods to extract total pages
506
+ total_pages = await page.evaluate("""
507
+ () => {
508
+ // Method 1: Check page counter text
509
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
510
+ const text = el.textContent || '';
511
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
512
+ });
513
+
514
+ if (pageCounters.length > 0) {
515
+ const text = pageCounters[0].textContent || '';
516
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
517
+ if (match && match[2]) return parseInt(match[2]);
518
+ }
519
+
520
+ // Method 2: Check actual page elements
521
+ const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
522
+ if (pageElements.length > 0) return pageElements.length;
523
+
524
+ // Method 3: Look for page thumbnails
525
+ const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
526
+ if (thumbnails.length > 0) return thumbnails.length;
527
+
528
+ // Fallback: conservative guess based on UI
529
+ return 50; // Safe default when we can't determine
530
+ }
531
+ """)
532
 
533
+ logger.info(f"Detected {total_pages} pages in PDF")
 
534
 
535
+ if total_pages <= 1:
536
+ # Additional check - sometimes the page count detection fails
537
+ # Let's double-check by looking for next/previous buttons
538
+ next_button = await page.query_selector('button[aria-label="Next page"]')
539
+ if next_button:
540
+ disabled = await next_button.get_attribute('disabled')
541
+ if not disabled:
542
+ logger.info("Found next button that's not disabled, document has multiple pages")
543
+ total_pages = 100 # Set a high number, we'll stop when we can't go further
544
 
545
+ # If we still think it's a single page, use a more direct approach
546
+ if total_pages <= 1:
547
+ # Single page approach
548
+ logger.info("Using single-page capture approach")
549
+
550
+ # Take a screenshot of the current view (should be the full document or first page)
551
+ screenshot_path = os.path.join(temp_dir, "page.png")
552
+
553
+ # Try to screenshot just the document area if we can find it
554
+ document_area = await page.query_selector('.drive-viewer-paginated-page')
555
+ if document_area:
556
+ await document_area.screenshot(path=screenshot_path)
557
+ else:
558
+ # Otherwise take a full screenshot
559
+ await page.screenshot(path=screenshot_path)
560
 
561
  # Convert to PDF
562
  from PIL import Image
563
  from reportlab.pdfgen import canvas as pdf_canvas
564
+
565
  img = Image.open(screenshot_path)
566
  width, height = img.size
567
  c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
568
  c.drawImage(screenshot_path, 0, 0, width, height)
569
  c.save()
570
+
571
  os.remove(screenshot_path)
572
+ os.rmdir(temp_dir)
573
+
574
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
575
+ return save_path
576
+ return None
577
 
578
+ # Multi-page approach
579
+ logger.info(f"Using multi-page capture approach for {total_pages} pages")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
 
581
+ # CRITICAL: We need to go to the first page first
582
+ # Check if we need to reset to first page
583
+ current_page_text = await page.evaluate("""
584
+ () => {
585
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
586
+ const text = el.textContent || '';
587
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
588
+ });
589
+
590
+ if (pageCounters.length > 0) {
591
+ return pageCounters[0].textContent || '';
 
 
 
 
 
 
 
 
 
592
  }
593
+ return '';
594
+ }
595
  """)
596
 
597
+ current_page = 1
598
+ if current_page_text:
599
+ match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
600
+ if match:
601
+ current_page = int(match.group(1))
602
+
603
+ # If we're not on page 1, go back to first page
604
+ if current_page > 1:
605
+ logger.info(f"Currently on page {current_page}, navigating back to page 1")
 
 
 
 
606
 
607
+ # Look for an input field where we can directly set the page number
608
+ page_input = await page.query_selector('input[aria-label="Page"]')
609
+ if page_input:
610
+ await page_input.fill("1")
611
+ await page_input.press("Enter")
612
+ await page.wait_for_timeout(1000)
613
+ else:
614
+ # Use prev button to go back to first page
615
+ prev_button = await page.query_selector('button[aria-label="Previous page"]')
616
+ if prev_button:
617
+ # Keep clicking until we can't anymore
618
+ for _ in range(current_page - 1):
619
+ try:
620
+ await prev_button.click()
621
+ await page.wait_for_timeout(500)
622
+ except Exception as e:
623
+ logger.warning(f"Error clicking prev button: {e}")
624
+ break
625
 
626
+ # Capture each page
627
+ screenshots = []
628
+ page_num = 1
629
+ max_tries = min(total_pages + 10, 200) # Set a reasonable limit
630
+ next_button = await page.query_selector('button[aria-label="Next page"]')
631
 
632
+ # Maximize the PDF view if possible
633
+ await page.evaluate("""
634
+ () => {
635
+ // Try to find and click any "full page" or "maximize" buttons
636
+ const fullViewButtons = Array.from(document.querySelectorAll('button'))
637
+ .filter(b => b.textContent?.includes('Full') ||
638
+ b.getAttribute('aria-label')?.includes('Full') ||
639
+ b.getAttribute('aria-label')?.includes('fit page'));
640
+ if (fullViewButtons.length > 0) {
641
+ fullViewButtons[0].click();
642
+ }
643
+ }
644
+ """)
645
 
646
+ await page.wait_for_timeout(1000) # Wait for view to adjust
 
 
647
 
648
+ while page_num <= max_tries:
649
+ # Wait for the page to be fully loaded
650
+ await page.wait_for_timeout(800)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
 
652
+ # Take a screenshot of the current page
653
+ screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
654
 
655
+ # Try different methods to identify and capture just the page content
656
+ page_content = await page.query_selector('.drive-viewer-paginated-page')
657
+ if page_content:
658
+ # Found the specific page element
659
+ await page_content.screenshot(path=screenshot_path)
660
+ else:
661
+ # Fall back to screenshot of visible viewport
662
+ await page.screenshot(path=screenshot_path)
663
 
664
+ screenshots.append(screenshot_path)
665
+ logger.info(f"Captured page {page_num}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
 
667
+ # Check if we have a disabled next button (reached the end)
668
+ if next_button:
669
+ is_disabled = await next_button.get_attribute('disabled')
670
+ if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
671
+ logger.info(f"Reached end of document after {page_num} pages")
672
+ break
673
+
674
+ # Click the next button
675
+ try:
676
+ await next_button.click()
677
+ await page.wait_for_timeout(800) # Wait for page transition
678
+ page_num += 1
679
+ except Exception as e:
680
+ logger.error(f"Error clicking next button: {e}")
681
+ # Try to get a fresh reference to the button
682
+ next_button = await page.query_selector('button[aria-label="Next page"]')
683
+ if not next_button:
684
+ logger.warning("Next button disappeared, assuming end of document")
685
+ break
686
  else:
687
+ # Try to find the next button again
688
+ next_button = await page.query_selector('button[aria-label="Next page"]')
689
+ if not next_button:
690
+ logger.warning("Could not find next button, stopping navigation")
691
+ break
692
 
693
+ # Double-check if we've reached the expected total
694
+ if page_num >= total_pages:
695
+ logger.info(f"Reached expected total of {total_pages} pages")
696
+ break
697
 
698
+ # Combine screenshots into PDF
699
+ logger.info(f"Creating PDF from {len(screenshots)} captured pages")
 
 
 
 
 
 
700
 
 
701
  from PIL import Image
702
  from reportlab.lib.pagesizes import letter
703
  from reportlab.pdfgen import canvas as pdf_canvas
704
 
705
+ # Use the size of the first screenshot to set PDF dimensions
 
 
706
  if screenshots:
707
+ try:
708
+ img = Image.open(screenshots[0])
709
+ width, height = img.size
710
+
711
+ c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
712
+
713
+ for screenshot in screenshots:
714
+ try:
715
+ if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
716
+ img = Image.open(screenshot)
717
+ c.drawImage(screenshot, 0, 0, width, height)
718
+ c.showPage()
719
+ except Exception as e:
720
+ logger.error(f"Error adding page to PDF: {e}")
721
+
722
+ c.save()
723
+
724
+ # Clean up screenshots
725
+ for screenshot in screenshots:
726
+ if os.path.exists(screenshot):
727
+ os.remove(screenshot)
728
+
729
+ logger.info(f"Successfully created PDF with {len(screenshots)} pages")
730
+ except Exception as e:
731
+ logger.error(f"Error creating PDF: {e}")
732
  else:
733
+ logger.error("No screenshots captured to create PDF")
 
734
  else:
735
+ # Non-PDF file handling
736
+ screenshot_path = os.path.join(temp_dir, "file.png")
737
+ await page.screenshot(path=screenshot_path)
 
738
 
739
+ if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
740
+ # For document types, try to export directly
741
+ await self.export_google_doc(file_id, file_type, save_path)
 
 
 
 
 
 
 
742
  else:
743
+ # For other types, save the screenshot with appropriate extension
744
  shutil.copy(screenshot_path, save_path)
745
 
 
746
  os.remove(screenshot_path)
747
 
748
+ # Clean up temp directory
749
+ try:
750
+ os.rmdir(temp_dir)
751
+ except:
752
+ pass
753
+
754
  # Close browser
755
  await browser.close()
756
 
757
+ # Verify file exists and has content
758
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
759
  logger.info(f"Successfully downloaded file to {save_path}")
760
  return save_path
761
  else:
762
+ logger.error(f"Generated file is too small or missing: {save_path}")
763
  return None
764
+
765
  except Exception as e:
766
  logger.error(f"Error during force download: {e}")
767
  if browser:
 
769
  return None
770
 
771
  except Exception as e:
772
+ logger.error(f"Force download preparation failed: {e}")
773
  return None
774
 
775
  async def download_from_google_drive(self, url, save_path):