euler314 committed on
Commit
dca120b
·
verified ·
1 Parent(s): ed38edb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -86
app.py CHANGED
@@ -442,7 +442,7 @@ class DownloadManager:
442
  return None
443
 
444
  async def force_download_viewonly(self, file_info, save_path):
445
- """Last-resort method to download view-only Google Drive files"""
446
  try:
447
  # Extract file ID from URL
448
  file_id = None
@@ -469,11 +469,12 @@ class DownloadManager:
469
  # Launch a new browser context with higher resolution
470
  browser = await self.playwright.chromium.launch(
471
  headless=True,
472
- args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
473
  )
474
 
 
475
  context = await browser.new_context(
476
- viewport={'width': 1600, 'height': 1200},
477
  user_agent=get_random_user_agent(),
478
  device_scale_factor=2.0 # Higher resolution for better quality
479
  )
@@ -482,135 +483,310 @@ class DownloadManager:
482
 
483
  # Navigate to the file
484
  try:
 
485
  await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
486
  wait_until='networkidle',
487
- timeout=60000)
488
 
489
  # Wait for content to load fully
490
  await page.wait_for_timeout(5000)
491
 
492
- # Check if it's a PDF
493
  is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
494
 
495
  if is_pdf:
496
- # For PDFs: Screenshot each page approach
497
- logger.info("Detected PDF, using page-by-page screenshot approach")
498
 
499
- # Scroll through document to ensure all pages are loaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  await page.evaluate("""
501
- async function scrollDocument() {
502
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
503
  const container = document.querySelector('.drive-viewer-paginated-scrollable');
504
  if (!container) return;
505
 
506
- // First scroll to bottom to load all pages
 
507
  container.scrollTo(0, container.scrollHeight);
508
- await delay(2000);
509
 
510
- // Then back to top
511
  container.scrollTo(0, 0);
512
  await delay(1000);
513
  }
514
- return scrollDocument();
515
  """)
516
 
517
- # Count pages
518
  page_count = await page.evaluate("""
519
  () => {
 
520
  const pages = document.querySelectorAll('.drive-viewer-paginated-page');
521
- return pages.length;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  }
523
  """)
524
 
 
525
  if page_count == 0:
526
- logger.warning("No pages found, trying alternative method")
527
- # Take a screenshot of the entire page
528
- temp_dir = tempfile.mkdtemp()
529
- screenshot_path = os.path.join(temp_dir, "page.png")
530
- await page.screenshot(path=screenshot_path, full_page=True)
531
-
532
- # Convert screenshot to PDF
533
- from PIL import Image
534
- from reportlab.pdfgen import canvas as pdf_canvas
535
- img = Image.open(screenshot_path)
536
- width, height = img.size
537
- c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
538
- c.drawImage(screenshot_path, 0, 0, width, height)
539
- c.save()
540
-
541
- # Clean up
542
- os.remove(screenshot_path)
543
- os.rmdir(temp_dir)
544
- else:
545
- # Create temp directory for page screenshots
546
- temp_dir = tempfile.mkdtemp()
547
- screenshots = []
548
 
549
- # Take screenshot of each page
550
- for i in range(page_count):
551
- # Scroll to page
552
- await page.evaluate(f"""
553
- async () => {{
554
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
555
- if (pages.length <= {i}) return;
556
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
557
- pages[{i}].scrollIntoView();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  await delay(500);
 
559
  }}
560
- """)
561
-
562
- # Take screenshot
563
- screenshot_path = os.path.join(temp_dir, f"page_{i+1}.png")
564
-
565
- # Position page for best screenshot
566
- await page.evaluate(f"""
567
- () => {{
568
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
569
- const page = pages[{i}];
570
- const viewer = document.querySelector('.drive-viewer-paginated-scrollable');
571
- if (page && viewer) {{
572
- // Center the page in the viewport
573
- const rect = page.getBoundingClientRect();
574
- viewer.scrollBy(0, rect.top - 100);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  }}
576
  }}
577
- """)
 
 
 
 
 
 
 
 
578
 
579
- await page.screenshot(path=screenshot_path)
580
- screenshots.append(screenshot_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
 
582
- # Combine screenshots into PDF
583
- from reportlab.lib.pagesizes import letter
584
- from reportlab.pdfgen import canvas as pdf_canvas
585
- from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
- # Use the first image dimensions to determine page size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
  img = Image.open(screenshots[0])
589
  img_width, img_height = img.size
590
-
591
  c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
592
 
593
  for screenshot in screenshots:
594
- img = Image.open(screenshot)
595
- c.drawImage(screenshot, 0, 0, img_width, img_height)
596
- c.showPage()
 
 
597
 
598
  c.save()
599
-
600
- # Clean up
601
- for screenshot in screenshots:
 
602
  os.remove(screenshot)
603
- os.rmdir(temp_dir)
 
 
 
 
 
 
 
 
604
  else:
605
- # For other file types: Take a single screenshot
606
- temp_dir = tempfile.mkdtemp()
607
- screenshot_path = os.path.join(temp_dir, "screenshot.png")
608
  await page.screenshot(path=screenshot_path, full_page=True)
609
 
610
- # Determine final file type
611
- base, ext = os.path.splitext(save_path)
612
-
613
- if ext.lower() == '.pdf':
614
  # Convert to PDF
615
  from PIL import Image
616
  from reportlab.pdfgen import canvas as pdf_canvas
@@ -625,14 +801,13 @@ class DownloadManager:
625
 
626
  # Clean up
627
  os.remove(screenshot_path)
628
- os.rmdir(temp_dir)
629
 
630
  # Close browser
631
  await browser.close()
632
 
633
  # Verify file exists and is not empty
634
  if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
635
- logger.info(f"Successfully downloaded view-only file to {save_path}")
636
  return save_path
637
  else:
638
  logger.error(f"Failed to create valid file at {save_path}")
 
442
  return None
443
 
444
  async def force_download_viewonly(self, file_info, save_path):
445
+ """Last-resort method to download view-only Google Drive files - improved for multi-page PDFs"""
446
  try:
447
  # Extract file ID from URL
448
  file_id = None
 
469
  # Launch a new browser context with higher resolution
470
  browser = await self.playwright.chromium.launch(
471
  headless=True,
472
+ args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security']
473
  )
474
 
475
+ # Use a larger viewport for better quality
476
  context = await browser.new_context(
477
+ viewport={'width': 1920, 'height': 1080},
478
  user_agent=get_random_user_agent(),
479
  device_scale_factor=2.0 # Higher resolution for better quality
480
  )
 
483
 
484
  # Navigate to the file
485
  try:
486
+ logger.info(f"Opening view-only file: https://drive.google.com/file/d/{file_id}/view")
487
  await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
488
  wait_until='networkidle',
489
+ timeout=90000) # Longer timeout for large PDFs
490
 
491
  # Wait for content to load fully
492
  await page.wait_for_timeout(5000)
493
 
494
+ # Detect if it's a PDF
495
  is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
496
 
497
  if is_pdf:
498
+ # For PDFs: Multi-page capture approach
499
+ logger.info("Detected PDF, using multi-page capture approach")
500
 
501
+ # First, try to find the viewer container
502
+ viewer_container = await page.query_selector('.drive-viewer-paginated-scrollable')
503
+
504
+ if not viewer_container:
505
+ logger.warning("Could not find standard PDF viewer container, trying alternatives")
506
+ viewer_container = await page.query_selector('.drive-viewer-content') or \
507
+ await page.query_selector('#drive-pdf-viewer') or \
508
+ await page.query_selector('.drive-viewer')
509
+
510
+ if not viewer_container:
511
+ # Take a single screenshot as fallback
512
+ logger.warning("Could not find any PDF viewer container, using fallback")
513
+ screenshot_path = os.path.join(tempfile.gettempdir(), "gdrive_pdf_fallback.png")
514
+ await page.screenshot(path=screenshot_path, full_page=True)
515
+
516
+ # Convert to PDF
517
+ from PIL import Image
518
+ from reportlab.pdfgen import canvas as pdf_canvas
519
+ img = Image.open(screenshot_path)
520
+ width, height = img.size
521
+ c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
522
+ c.drawImage(screenshot_path, 0, 0, width, height)
523
+ c.save()
524
+ os.remove(screenshot_path)
525
+ return save_path
526
+
527
+ # Scroll through to load all pages first
528
+ logger.info("Pre-loading all PDF pages...")
529
  await page.evaluate("""
530
+ async function preloadAllPages() {
531
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
532
  const container = document.querySelector('.drive-viewer-paginated-scrollable');
533
  if (!container) return;
534
 
535
+ // Scroll to bottom to force all pages to load
536
+ const initialScroll = container.scrollTop;
537
  container.scrollTo(0, container.scrollHeight);
538
+ await delay(3000); // Wait for loading
539
 
540
+ // Scroll back to top
541
  container.scrollTo(0, 0);
542
  await delay(1000);
543
  }
544
+ return preloadAllPages();
545
  """)
546
 
547
+ # Count visible pages - critical step that needs to be fixed
548
  page_count = await page.evaluate("""
549
  () => {
550
+ // Try multiple selectors for pages
551
  const pages = document.querySelectorAll('.drive-viewer-paginated-page');
552
+ if (pages.length > 0) return pages.length;
553
+
554
+ // Alternative selectors if standard one fails
555
+ const altPages = document.querySelectorAll('.drive-viewer-page');
556
+ if (altPages.length > 0) return altPages.length;
557
+
558
+ // Try to find page numbers in navigation
559
+ const pageNav = document.querySelector('.drive-viewer-paginated-counter');
560
+ if (pageNav) {
561
+ const text = pageNav.textContent || '';
562
+ const match = text.match(/(\d+)\s*\/\s*(\d+)/);
563
+ if (match && match[2]) return parseInt(match[2]);
564
+ }
565
+
566
+ return 0; // Fallback
567
  }
568
  """)
569
 
570
+ # If no pages found but we know it's a PDF, manually check for page counter
571
  if page_count == 0:
572
+ # Try to find the page counter text and extract total pages
573
+ page_counter_text = await page.evaluate("""
574
+ () => {
575
+ const elements = Array.from(document.querySelectorAll('*'));
576
+ for (const el of elements) {
577
+ const text = el.textContent || '';
578
+ if (text.match(/\d+\s*\/\s*\d+/)) return text;
579
+ }
580
+ return '';
581
+ }
582
+ """)
 
 
 
 
 
 
 
 
 
 
 
583
 
584
+ if page_counter_text:
585
+ match = re.search(r'(\d+)\s*\/\s*(\d+)', page_counter_text)
586
+ if match and match.group(2):
587
+ page_count = int(match.group(2))
588
+ logger.info(f"Detected {page_count} pages from page counter")
589
+
590
+ # If we still have no page count, default to a reasonable number
591
+ if page_count == 0:
592
+ logger.warning("Could not detect page count, defaulting to 50 pages to be safe")
593
+ page_count = 50 # Try to capture up to 50 pages by default
594
+
595
+ logger.info(f"Found {page_count} pages in PDF")
596
+
597
+ # Create a temporary directory for screenshots
598
+ temp_dir = tempfile.mkdtemp()
599
+ screenshots = []
600
+
601
+ # Function to scroll to a specific page and take a screenshot
602
async def capture_page(page_num):
    """Bring one page of the Drive viewer into view and screenshot it.

    This is a closure: it relies on ``page`` (the Playwright page opened by
    the enclosing function), ``temp_dir`` (directory that receives the
    screenshots) and ``logger`` from the surrounding scope.

    Args:
        page_num: Zero-based index of the page to capture.

    Returns:
        Path of the PNG that was written, ``<temp_dir>/page_<page_num + 1>.png``.
    """
    # The script is a single function expression and the index is passed as
    # the evaluate() argument.  A declaration followed by a top-level
    # ``return`` is not a valid evaluated expression, and passing the index
    # as an argument avoids building JavaScript with string interpolation.
    success = await page.evaluate("""
        async (pageNum) => {
            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

            // Preferred: scroll the rendered page element into view
            const pages = document.querySelectorAll('.drive-viewer-paginated-page');
            if (pages.length > 0 && pageNum < pages.length) {
                pages[pageNum].scrollIntoView({behavior: 'instant', block: 'center'});
                await delay(500);
                return true;
            }

            // Alternative: type the 1-based page number into the page input
            const pageInput = document.querySelector('input[aria-label="Page"]');
            if (pageInput) {
                // Use the native setter so framework-managed inputs see the change
                const nativeInputValueSetter = Object.getOwnPropertyDescriptor(window.HTMLInputElement.prototype, "value").set;
                nativeInputValueSetter.call(pageInput, String(pageNum + 1));

                pageInput.dispatchEvent(new Event('input', { bubbles: true }));
                pageInput.dispatchEvent(new Event('change', { bubbles: true }));

                // Press Enter to navigate
                pageInput.dispatchEvent(new KeyboardEvent('keydown', {
                    key: 'Enter',
                    code: 'Enter',
                    keyCode: 13,
                    which: 13,
                    bubbles: true
                }));

                await delay(1000);  // Wait for navigation
                return true;
            }

            // Alternative: use the page selector dropdown if available
            const pageSelector = document.querySelector('.drive-viewer-paginated-page-selector');
            if (pageSelector) {
                pageSelector.click();
                await delay(300);

                const options = document.querySelectorAll('.drive-viewer-paginated-page-selector-option');
                if (options.length > pageNum) {
                    options[pageNum].click();
                    await delay(1000);
                    return true;
                }
            }

            return false;
        }
    """, page_num)

    if not success:
        # Last resort: step through the document with the prev/next buttons
        logger.info(f"Using alternative navigation for page {page_num + 1}")

        next_button = await page.query_selector('button[aria-label="Next page"]')
        prev_button = await page.query_selector('button[aria-label="Previous page"]')

        # Button navigation is relative, so always rewind to the first page
        # before stepping forward; otherwise the clicks are added on top of
        # wherever the previous capture left the viewer.
        if prev_button:
            for _ in range(max(50, page_num + 1)):  # Bounded to avoid an infinite loop
                disabled_attr = await prev_button.get_attribute('disabled')
                # A bare ``disabled`` attribute has the value "", which is
                # falsy, so presence must be tested with ``is not None``.
                if disabled_attr is not None:
                    break
                await prev_button.click()
                await page.wait_for_timeout(300)

        # Now step forward to the requested page
        if next_button:
            for _ in range(page_num):
                disabled_attr = await next_button.get_attribute('disabled')
                if disabled_attr is not None:
                    # Already on the last page; clicking a disabled button
                    # would only block until the click times out.
                    break
                await next_button.click()
                await page.wait_for_timeout(500)

        # Wait for the page content to load
        await page.wait_for_timeout(1000)

    # Wait for page to stabilize
    await page.wait_for_timeout(500)

    screenshot_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")

    # Work out which rectangle of the viewport to capture.
    clip_region = await page.evaluate("""
        (pageNum) => {
            // Intersection of an element rect with the viewport, or null
            // when nothing of it is visible (an empty clip is not usable).
            const visibleClip = rect => {
                const x = Math.max(0, rect.left);
                const y = Math.max(0, rect.top);
                const width = Math.min(window.innerWidth, rect.right) - x;
                const height = Math.min(window.innerHeight, rect.bottom) - y;
                return (width > 0 && height > 0) ? {x, y, width, height} : null;
            };

            const pages = document.querySelectorAll('.drive-viewer-paginated-page');

            // Prefer the page that was asked for: after centring it, part of
            // the previous page can still be visible above it and would
            // otherwise be picked first.
            if (pageNum < pages.length) {
                const clip = visibleClip(pages[pageNum].getBoundingClientRect());
                if (clip) return clip;
            }

            // Otherwise take the first page that is visible at all
            for (const candidate of pages) {
                const clip = visibleClip(candidate.getBoundingClientRect());
                if (clip) return clip;
            }

            // Fallback: the scroll container itself
            const container = document.querySelector('.drive-viewer-paginated-scrollable');
            if (container) {
                const clip = visibleClip(container.getBoundingClientRect());
                if (clip) return clip;
            }

            // Last resort: let the caller screenshot the visible area
            return null;
        }
    """, page_num)

    if clip_region:
        # Screenshot only the page region
        await page.screenshot(path=screenshot_path, clip=clip_region)
    else:
        # Screenshot the entire visible area
        await page.screenshot(path=screenshot_path)

    return screenshot_path
737
+
738
+ # Capture all pages
739
+ for i in range(page_count):
740
+ logger.info(f"Capturing page {i+1} of {page_count}")
741
+ screenshot_path = await capture_page(i)
742
+ screenshots.append(screenshot_path)
743
+ # Add progress indicator
744
+ if (i+1) % 5 == 0 or i+1 == page_count:
745
+ logger.info(f"Progress: {i+1}/{page_count} pages captured")
746
+
747
+ # Combine screenshots into a PDF
748
+ from PIL import Image
749
+ from reportlab.lib.pagesizes import letter
750
+ from reportlab.pdfgen import canvas as pdf_canvas
751
+
752
+ logger.info(f"Combining {len(screenshots)} screenshots into PDF")
753
+
754
+ # Use the first image dimensions to set PDF size if available
755
+ if screenshots:
756
  img = Image.open(screenshots[0])
757
  img_width, img_height = img.size
 
758
  c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
759
 
760
  for screenshot in screenshots:
761
+ # Check if file exists and has content
762
+ if os.path.exists(screenshot) and os.path.getsize(screenshot) > 0:
763
+ img = Image.open(screenshot)
764
+ c.drawImage(screenshot, 0, 0, img_width, img_height)
765
+ c.showPage()
766
 
767
  c.save()
768
+
769
+ # Clean up screenshots
770
+ for screenshot in screenshots:
771
+ if os.path.exists(screenshot):
772
  os.remove(screenshot)
773
+ os.rmdir(temp_dir)
774
+
775
+ # Verify the PDF was created successfully
776
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
777
+ logger.info(f"Successfully created PDF with {len(screenshots)} pages")
778
+ return save_path
779
+ else:
780
+ logger.error("Failed to create PDF from screenshots")
781
+ return None
782
  else:
783
+ # For non-PDF files: take a single screenshot
784
+ logger.info("Non-PDF file detected, taking single screenshot")
785
+ screenshot_path = os.path.join(tempfile.gettempdir(), "screenshot.png")
786
  await page.screenshot(path=screenshot_path, full_page=True)
787
 
788
+ # Convert to requested format if needed
789
+ if save_path.lower().endswith('.pdf'):
 
 
790
  # Convert to PDF
791
  from PIL import Image
792
  from reportlab.pdfgen import canvas as pdf_canvas
 
801
 
802
  # Clean up
803
  os.remove(screenshot_path)
 
804
 
805
  # Close browser
806
  await browser.close()
807
 
808
  # Verify file exists and is not empty
809
  if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
810
+ logger.info(f"Successfully downloaded file to {save_path}")
811
  return save_path
812
  else:
813
  logger.error(f"Failed to create valid file at {save_path}")