Update app.py
Browse files
app.py
CHANGED
@@ -442,377 +442,326 @@ class DownloadManager:
|
|
442 |
return None
|
443 |
|
444 |
async def force_download_viewonly(self, file_info, save_path):
|
445 |
-
"""
|
446 |
try:
|
447 |
-
# Extract file ID
|
448 |
-
file_id =
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
|
|
455 |
|
456 |
if not file_id:
|
457 |
logger.error("Could not extract file ID")
|
458 |
return None
|
459 |
|
460 |
-
|
461 |
-
|
462 |
-
# Make sure we have the proper file extension
|
463 |
base, ext = os.path.splitext(save_path)
|
464 |
if not ext:
|
465 |
-
# Determine file type from metadata or set default to PDF
|
466 |
-
file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
|
467 |
save_path = f"{base}.{file_type}"
|
468 |
|
469 |
-
|
|
|
|
|
470 |
browser = await self.playwright.chromium.launch(
|
471 |
headless=True,
|
472 |
-
args=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
)
|
474 |
|
475 |
-
# Use
|
476 |
context = await browser.new_context(
|
477 |
-
viewport={'width':
|
478 |
-
user_agent=
|
479 |
-
device_scale_factor=2.0
|
480 |
)
|
481 |
|
482 |
page = await context.new_page()
|
483 |
|
484 |
-
# Navigate to the file
|
485 |
try:
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
# Detect if it's a PDF
|
495 |
-
is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
|
496 |
|
497 |
-
|
498 |
-
|
499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
500 |
|
501 |
-
|
502 |
-
viewer_container = await page.query_selector('.drive-viewer-paginated-scrollable')
|
503 |
|
504 |
-
if
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
|
|
|
|
|
|
|
|
509 |
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
515 |
|
516 |
# Convert to PDF
|
517 |
from PIL import Image
|
518 |
from reportlab.pdfgen import canvas as pdf_canvas
|
|
|
519 |
img = Image.open(screenshot_path)
|
520 |
width, height = img.size
|
521 |
c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
|
522 |
c.drawImage(screenshot_path, 0, 0, width, height)
|
523 |
c.save()
|
|
|
524 |
os.remove(screenshot_path)
|
525 |
-
|
|
|
|
|
|
|
|
|
526 |
|
527 |
-
#
|
528 |
-
logger.info("
|
529 |
-
await page.evaluate("""
|
530 |
-
async function preloadAllPages() {
|
531 |
-
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
|
532 |
-
const container = document.querySelector('.drive-viewer-paginated-scrollable');
|
533 |
-
if (!container) return;
|
534 |
-
|
535 |
-
// Scroll to bottom to force all pages to load
|
536 |
-
const initialScroll = container.scrollTop;
|
537 |
-
container.scrollTo(0, container.scrollHeight);
|
538 |
-
await delay(3000); // Wait for loading
|
539 |
-
|
540 |
-
// Scroll back to top
|
541 |
-
container.scrollTo(0, 0);
|
542 |
-
await delay(1000);
|
543 |
-
}
|
544 |
-
return preloadAllPages();
|
545 |
-
""")
|
546 |
|
547 |
-
#
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
// Try to find page numbers in navigation
|
559 |
-
const pageNav = document.querySelector('.drive-viewer-paginated-counter');
|
560 |
-
if (pageNav) {
|
561 |
-
const text = pageNav.textContent || '';
|
562 |
-
const match = text.match(/(\d+)\s*\/\s*(\d+)/);
|
563 |
-
if (match && match[2]) return parseInt(match[2]);
|
564 |
-
}
|
565 |
-
|
566 |
-
return 0; // Fallback
|
567 |
}
|
|
|
|
|
568 |
""")
|
569 |
|
570 |
-
|
571 |
-
if
|
572 |
-
|
573 |
-
|
574 |
-
()
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
}
|
580 |
-
return '';
|
581 |
-
}
|
582 |
-
""")
|
583 |
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
589 |
|
590 |
-
#
|
591 |
-
|
592 |
-
|
593 |
-
|
|
|
594 |
|
595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
596 |
|
597 |
-
#
|
598 |
-
temp_dir = tempfile.mkdtemp()
|
599 |
-
screenshots = []
|
600 |
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
success = await page.evaluate(f"""
|
605 |
-
async function scrollToPage(pageNum) {{
|
606 |
-
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
|
607 |
-
|
608 |
-
// Try multiple selectors for pages
|
609 |
-
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
|
610 |
-
if (pages.length > 0 && pageNum < pages.length) {{
|
611 |
-
pages[pageNum].scrollIntoView({{behavior: 'instant', block: 'center'}});
|
612 |
-
await delay(500);
|
613 |
-
return true;
|
614 |
-
}}
|
615 |
-
|
616 |
-
// Alternative: try to use page navigation buttons
|
617 |
-
const pageInput = document.querySelector('input[aria-label="Page"]');
|
618 |
-
if (pageInput) {{
|
619 |
-
// Set page number in input
|
620 |
-
const nativeInputValueSetter = Object.getOwnPropertyDescriptor(window.HTMLInputElement.prototype, "value").set;
|
621 |
-
nativeInputValueSetter.call(pageInput, {page_num + 1});
|
622 |
-
|
623 |
-
// Dispatch events
|
624 |
-
const ev1 = new Event('input', {{ bubbles: true }});
|
625 |
-
const ev2 = new Event('change', {{ bubbles: true }});
|
626 |
-
pageInput.dispatchEvent(ev1);
|
627 |
-
pageInput.dispatchEvent(ev2);
|
628 |
-
|
629 |
-
// Press Enter to navigate
|
630 |
-
const keyEvent = new KeyboardEvent('keydown', {{
|
631 |
-
key: 'Enter',
|
632 |
-
code: 'Enter',
|
633 |
-
keyCode: 13,
|
634 |
-
which: 13,
|
635 |
-
bubbles: true
|
636 |
-
}});
|
637 |
-
pageInput.dispatchEvent(keyEvent);
|
638 |
-
|
639 |
-
await delay(1000); // Wait for navigation
|
640 |
-
return true;
|
641 |
-
}}
|
642 |
-
|
643 |
-
// Alternative: use page selector dropdown if available
|
644 |
-
const pageSelector = document.querySelector('.drive-viewer-paginated-page-selector');
|
645 |
-
if (pageSelector) {{
|
646 |
-
pageSelector.click();
|
647 |
-
await delay(300);
|
648 |
-
|
649 |
-
// Find and click the specific page option
|
650 |
-
const options = document.querySelectorAll('.drive-viewer-paginated-page-selector-option');
|
651 |
-
if (options.length > pageNum) {{
|
652 |
-
options[pageNum].click();
|
653 |
-
await delay(1000);
|
654 |
-
return true;
|
655 |
-
}}
|
656 |
-
}}
|
657 |
-
|
658 |
-
return false;
|
659 |
-
}}
|
660 |
-
return scrollToPage({page_num});
|
661 |
-
""")
|
662 |
-
|
663 |
-
if not success:
|
664 |
-
# Alternative: Try using the page navigation buttons
|
665 |
-
logger.info(f"Using alternative navigation for page {page_num + 1}")
|
666 |
-
|
667 |
-
# Find navigation buttons
|
668 |
-
next_button = await page.query_selector('button[aria-label="Next page"]')
|
669 |
-
prev_button = await page.query_selector('button[aria-label="Previous page"]')
|
670 |
-
|
671 |
-
# If we're not on the first page, go back to first page
|
672 |
-
if page_num == 0 and prev_button:
|
673 |
-
for _ in range(50): # Limit to avoid infinite loop
|
674 |
-
is_disabled = await prev_button.get_attribute('disabled')
|
675 |
-
if is_disabled:
|
676 |
-
break
|
677 |
-
await prev_button.click()
|
678 |
-
await page.wait_for_timeout(300)
|
679 |
-
|
680 |
-
# Now navigate forward to desired page
|
681 |
-
if page_num > 0 and next_button:
|
682 |
-
for _ in range(page_num):
|
683 |
-
await next_button.click()
|
684 |
-
await page.wait_for_timeout(500)
|
685 |
-
|
686 |
-
# Wait for the page content to load
|
687 |
-
await page.wait_for_timeout(1000)
|
688 |
|
689 |
-
#
|
690 |
-
|
691 |
|
692 |
-
#
|
693 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
694 |
|
695 |
-
|
696 |
-
|
697 |
-
() => {
|
698 |
-
// First try getting the current visible page
|
699 |
-
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
|
700 |
-
for (const page of pages) {
|
701 |
-
const rect = page.getBoundingClientRect();
|
702 |
-
if (rect.top < window.innerHeight && rect.bottom > 0) {
|
703 |
-
return {
|
704 |
-
x: Math.max(0, rect.left),
|
705 |
-
y: Math.max(0, rect.top),
|
706 |
-
width: Math.min(window.innerWidth, rect.width),
|
707 |
-
height: Math.min(window.innerHeight, rect.bottom - rect.top)
|
708 |
-
};
|
709 |
-
}
|
710 |
-
}
|
711 |
-
|
712 |
-
// Fallback: try to find the container
|
713 |
-
const container = document.querySelector('.drive-viewer-paginated-scrollable');
|
714 |
-
if (container) {
|
715 |
-
const rect = container.getBoundingClientRect();
|
716 |
-
return {
|
717 |
-
x: Math.max(0, rect.left),
|
718 |
-
y: Math.max(0, rect.top),
|
719 |
-
width: Math.min(window.innerWidth, rect.width),
|
720 |
-
height: Math.min(window.innerHeight, rect.bottom - rect.top)
|
721 |
-
};
|
722 |
-
}
|
723 |
-
|
724 |
-
// Last resort: screenshot the visible area
|
725 |
-
return null;
|
726 |
-
}
|
727 |
-
""")
|
728 |
|
729 |
-
if
|
730 |
-
|
731 |
-
await
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
732 |
else:
|
733 |
-
#
|
734 |
-
await page.
|
|
|
|
|
|
|
735 |
|
736 |
-
|
|
|
|
|
|
|
737 |
|
738 |
-
#
|
739 |
-
|
740 |
-
logger.info(f"Capturing page {i+1} of {page_count}")
|
741 |
-
screenshot_path = await capture_page(i)
|
742 |
-
screenshots.append(screenshot_path)
|
743 |
-
# Add progress indicator
|
744 |
-
if (i+1) % 5 == 0 or i+1 == page_count:
|
745 |
-
logger.info(f"Progress: {i+1}/{page_count} pages captured")
|
746 |
|
747 |
-
# Combine screenshots into a PDF
|
748 |
from PIL import Image
|
749 |
from reportlab.lib.pagesizes import letter
|
750 |
from reportlab.pdfgen import canvas as pdf_canvas
|
751 |
|
752 |
-
|
753 |
-
|
754 |
-
# Use the first image dimensions to set PDF size if available
|
755 |
if screenshots:
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
|
769 |
-
|
770 |
-
|
771 |
-
|
772 |
-
|
773 |
-
|
774 |
-
|
775 |
-
|
776 |
-
|
777 |
-
|
778 |
-
|
|
|
|
|
779 |
else:
|
780 |
-
logger.error("
|
781 |
-
return None
|
782 |
else:
|
783 |
-
#
|
784 |
-
|
785 |
-
|
786 |
-
await page.screenshot(path=screenshot_path, full_page=True)
|
787 |
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
from PIL import Image
|
792 |
-
from reportlab.pdfgen import canvas as pdf_canvas
|
793 |
-
img = Image.open(screenshot_path)
|
794 |
-
width, height = img.size
|
795 |
-
c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
|
796 |
-
c.drawImage(screenshot_path, 0, 0, width, height)
|
797 |
-
c.save()
|
798 |
else:
|
799 |
-
#
|
800 |
shutil.copy(screenshot_path, save_path)
|
801 |
|
802 |
-
# Clean up
|
803 |
os.remove(screenshot_path)
|
804 |
|
|
|
|
|
|
|
|
|
|
|
|
|
805 |
# Close browser
|
806 |
await browser.close()
|
807 |
|
808 |
-
# Verify file exists and
|
809 |
-
if os.path.exists(save_path) and os.path.getsize(save_path) >
|
810 |
logger.info(f"Successfully downloaded file to {save_path}")
|
811 |
return save_path
|
812 |
else:
|
813 |
-
logger.error(f"
|
814 |
return None
|
815 |
-
|
816 |
except Exception as e:
|
817 |
logger.error(f"Error during force download: {e}")
|
818 |
if browser:
|
@@ -820,7 +769,7 @@ class DownloadManager:
|
|
820 |
return None
|
821 |
|
822 |
except Exception as e:
|
823 |
-
logger.error(f"Force download failed: {e}")
|
824 |
return None
|
825 |
|
826 |
async def download_from_google_drive(self, url, save_path):
|
|
|
442 |
return None
|
443 |
|
444 |
async def force_download_viewonly(self, file_info, save_path):
    """Save a view-only Google Drive file by rendering it in a headless browser.

    PDFs are captured page by page as screenshots and stitched into a PDF at
    ``save_path``. Office-style documents are delegated to
    ``self.export_google_doc``. Any other type is saved as a single screenshot
    of the viewer.

    Args:
        file_info: Mapping describing the file. Read keys: ``metadata.file_id``,
            ``metadata.file_type`` (defaults to ``'pdf'``) and ``url`` (used to
            recover the file ID when the metadata lacks it).
        save_path: Destination path. When it has no extension, the file type is
            appended as the extension.

    Returns:
        The final save path on success, or ``None`` when the file ID cannot be
        determined, the capture fails, or the produced file is missing/too small.
    """
    next_selector = 'button[aria-label="Next page"]'
    prev_selector = 'button[aria-label="Previous page"]'
    page_selector = '.drive-viewer-paginated-page'
    # Hard ceiling on captured pages; guards against a bogus page-count guess.
    max_capture_pages = 200

    # Reads the "<current> / <total>" counter and the page/thumbnail element
    # counts in one round trip.
    # NOTE(review): the counter is matched against the whole document text
    # (same as the previous implementation, which always ended up testing the
    # root element), so an unrelated "N / M" string earlier in the page would
    # win. Confirm against the real viewer DOM.
    page_state_js = r"""
        () => {
            const text = document.documentElement.textContent || '';
            const match = text.match(/(\d+)\s*\/\s*(\d+)/);
            return {
                current: match ? parseInt(match[1], 10) : null,
                counterTotal: match && match[2] ? parseInt(match[2], 10) : null,
                pageElements: document.querySelectorAll('.drive-viewer-paginated-page').length,
                thumbnails: document.querySelectorAll('.drive-viewer-paginated-thumb').length
            };
        }
    """

    maximize_js = """
        () => {
            // Try to find and click any "full page" or "maximize" buttons
            const fullViewButtons = Array.from(document.querySelectorAll('button'))
                .filter(b => b.textContent?.includes('Full') ||
                             b.getAttribute('aria-label')?.includes('Full') ||
                             b.getAttribute('aria-label')?.includes('fit page'));
            if (fullViewButtons.length > 0) {
                fullViewButtons[0].click();
            }
        }
    """

    async def button_is_disabled(button):
        """Return True when the button carries ``disabled`` or ``aria-disabled="true"``."""
        # A boolean HTML attribute is reported as "" when present and None when
        # absent, so presence (not truthiness) is what must be tested.
        if (await button.get_attribute('disabled')) is not None:
            return True
        return (await button.get_attribute('aria-disabled')) == 'true'

    async def capture(page, target_path):
        """Screenshot the page element if present, otherwise the visible viewport."""
        # NOTE(review): query_selector returns the FIRST matching element; if the
        # viewer keeps every page in the DOM this may always be page 1 - verify.
        page_element = await page.query_selector(page_selector)
        if page_element:
            await page_element.screenshot(path=target_path)
        else:
            await page.screenshot(path=target_path)

    def images_to_pdf(image_paths, pdf_path):
        """Write one PDF page per usable image; return the number of pages written."""
        from PIL import Image
        from reportlab.pdfgen import canvas as pdf_canvas

        pdf = None
        pages_written = 0
        for image_path in image_paths:
            try:
                # Skip missing or obviously empty captures.
                if not os.path.exists(image_path) or os.path.getsize(image_path) <= 100:
                    continue
                with Image.open(image_path) as img:
                    width, height = img.size
                if pdf is None:
                    pdf = pdf_canvas.Canvas(pdf_path, pagesize=(width, height))
                else:
                    # Size each page to its own image instead of stretching it
                    # to the first page's dimensions.
                    pdf.setPageSize((width, height))
                pdf.drawImage(image_path, 0, 0, width, height)
                pdf.showPage()
                pages_written += 1
            except Exception as e:
                logger.error(f"Error adding page to PDF: {e}")
        if pdf is not None and pages_written:
            pdf.save()
        return pages_written

    # ---- Preparation: resolve the file ID / target path and start a browser ----
    try:
        metadata = file_info.get('metadata') or {}

        file_id = metadata.get('file_id')
        if not file_id:
            url = file_info.get('url') or ''
            for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
                match = re.search(pattern, url)
                if match:
                    file_id = match.group(1)
                    break

        if not file_id:
            logger.error("Could not extract file ID")
            return None

        file_type = metadata.get('file_type') or 'pdf'
        base, ext = os.path.splitext(save_path)
        if not ext:
            save_path = f"{base}.{file_type}"

        logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")

        # assumes self.playwright is a started async Playwright instance - TODO confirm
        browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-web-security',
                '--disable-features=IsolateOrigins,site-per-process',
                '--disable-site-isolation-trials',
            ],
        )
    except Exception as e:
        logger.error(f"Force download preparation failed: {e}")
        return None

    # ---- Capture: everything below owns the browser and a temp directory ----
    temp_dir = None
    try:
        # Large viewport and 2x scale factor for sharper screenshots.
        context = await browser.new_context(
            viewport={'width': 1600, 'height': 1200},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            device_scale_factor=2.0,
        )
        page = await context.new_page()

        view_url = f"https://drive.google.com/file/d/{file_id}/view"
        logger.info(f"Opening file view page: {view_url}")
        await page.goto(view_url, timeout=90000)
        await page.wait_for_load_state('networkidle')
        await page.wait_for_timeout(5000)  # Extra settle time for the viewer

        temp_dir = tempfile.mkdtemp()

        if file_type.lower() == 'pdf':
            state = await page.evaluate(page_state_js)

            # Page count: counter text first, then page elements, then thumbnails.
            if state.get('counterTotal') is not None:
                total_pages = state['counterTotal']
            elif state.get('pageElements', 0) > 0:
                total_pages = state['pageElements']
            elif state.get('thumbnails', 0) > 0:
                total_pages = state['thumbnails']
            else:
                total_pages = 50  # Unknown: rely on end-of-document detection
            logger.info(f"Detected {total_pages} pages in PDF")

            next_button = await page.query_selector(next_selector)
            if total_pages <= 1 and next_button is not None:
                # Count detection can fail; an enabled "next" button proves
                # there is more than one page.
                if not await button_is_disabled(next_button):
                    logger.info("Found next button that's not disabled, document has multiple pages")
                    total_pages = 100  # Upper bound; the loop stops at the real end

            if total_pages <= 1:
                logger.info("Using single-page capture approach")
                screenshot_path = os.path.join(temp_dir, "page.png")
                await capture(page, screenshot_path)
                images_to_pdf([screenshot_path], save_path)
                # Early return is safe: the finally block closes the browser and
                # removes the temp directory.
                if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
                    return save_path
                return None

            logger.info(f"Using multi-page capture approach for {total_pages} pages")

            # Capture must start from page 1.
            current_page = state.get('current') or 1
            if current_page > 1:
                logger.info(f"Currently on page {current_page}, navigating back to page 1")
                page_input = await page.query_selector('input[aria-label="Page"]')
                if page_input:
                    await page_input.fill("1")
                    await page_input.press("Enter")
                    await page.wait_for_timeout(1000)
                else:
                    prev_button = await page.query_selector(prev_selector)
                    if prev_button:
                        for _ in range(current_page - 1):
                            try:
                                await prev_button.click()
                                await page.wait_for_timeout(500)
                            except Exception as e:
                                logger.warning(f"Error clicking prev button: {e}")
                                break

            await page.evaluate(maximize_js)
            await page.wait_for_timeout(1000)  # Wait for view to adjust

            screenshots = []
            page_limit = min(total_pages, max_capture_pages)
            next_button = await page.query_selector(next_selector)

            # Bounded loop: one capture per iteration, so a button that keeps
            # failing can neither spin forever nor duplicate pages.
            for page_num in range(1, page_limit + 1):
                await page.wait_for_timeout(800)  # Let the page render

                screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
                await capture(page, screenshot_path)
                screenshots.append(screenshot_path)
                logger.info(f"Captured page {page_num}")

                # Check the total AFTER capturing so the last page is included.
                if page_num >= total_pages:
                    logger.info(f"Reached expected total of {total_pages} pages")
                    break

                if next_button is None:
                    next_button = await page.query_selector(next_selector)
                    if next_button is None:
                        logger.warning("Could not find next button, stopping navigation")
                        break

                if await button_is_disabled(next_button):
                    logger.info(f"Reached end of document after {page_num} pages")
                    break

                try:
                    await next_button.click()
                except Exception as e:
                    logger.error(f"Error clicking next button: {e}")
                    # The handle may be stale; fetch a fresh one and retry once.
                    next_button = await page.query_selector(next_selector)
                    if next_button is None:
                        logger.warning("Next button disappeared, assuming end of document")
                        break
                    try:
                        await next_button.click()
                    except Exception as retry_error:
                        logger.error(f"Error clicking next button: {retry_error}")
                        break
                await page.wait_for_timeout(800)  # Wait for page transition

            logger.info(f"Creating PDF from {len(screenshots)} captured pages")
            pages_written = images_to_pdf(screenshots, save_path)
            if pages_written:
                logger.info(f"Successfully created PDF with {pages_written} pages")
            else:
                logger.error("No screenshots captured to create PDF")
        elif file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
            # Document types are exported directly rather than screenshotted.
            # NOTE(review): export_google_doc is defined elsewhere; its result is
            # only judged by the file-size check below.
            await self.export_google_doc(file_id, file_type, save_path)
        else:
            # Other types: save a screenshot of the viewer under the target name.
            screenshot_path = os.path.join(temp_dir, "file.png")
            await page.screenshot(path=screenshot_path)
            shutil.copy(screenshot_path, save_path)

        # Verify file exists and has content
        if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
            logger.info(f"Successfully downloaded file to {save_path}")
            return save_path
        logger.error(f"Generated file is too small or missing: {save_path}")
        return None

    except Exception as e:
        logger.error(f"Error during force download: {e}")
        return None
    finally:
        # Always release the temp directory and the browser, whichever path
        # (success, early return, or exception) left the try block.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        try:
            await browser.close()
        except Exception as e:
            logger.warning(f"Error closing browser: {e}")
|
774 |
|
775 |
async def download_from_google_drive(self, url, save_path):
|