Update app.py
Browse files
app.py
CHANGED
@@ -442,7 +442,7 @@ class DownloadManager:
|
|
442 |
return None
|
443 |
|
444 |
async def force_download_viewonly(self, file_info, save_path):
|
445 |
-
"""Last-resort method to download view-only Google Drive files"""
|
446 |
try:
|
447 |
# Extract file ID from URL
|
448 |
file_id = None
|
@@ -469,11 +469,12 @@ class DownloadManager:
|
|
469 |
# Launch a new browser context with higher resolution
|
470 |
browser = await self.playwright.chromium.launch(
|
471 |
headless=True,
|
472 |
-
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
|
473 |
)
|
474 |
|
|
|
475 |
context = await browser.new_context(
|
476 |
-
viewport={'width':
|
477 |
user_agent=get_random_user_agent(),
|
478 |
device_scale_factor=2.0 # Higher resolution for better quality
|
479 |
)
|
@@ -482,135 +483,310 @@ class DownloadManager:
|
|
482 |
|
483 |
# Navigate to the file
|
484 |
try:
|
|
|
485 |
await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
|
486 |
wait_until='networkidle',
|
487 |
-
timeout=
|
488 |
|
489 |
# Wait for content to load fully
|
490 |
await page.wait_for_timeout(5000)
|
491 |
|
492 |
-
#
|
493 |
is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
|
494 |
|
495 |
if is_pdf:
|
496 |
-
# For PDFs:
|
497 |
-
logger.info("Detected PDF, using
|
498 |
|
499 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
500 |
await page.evaluate("""
|
501 |
-
async function
|
502 |
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
|
503 |
const container = document.querySelector('.drive-viewer-paginated-scrollable');
|
504 |
if (!container) return;
|
505 |
|
506 |
-
//
|
|
|
507 |
container.scrollTo(0, container.scrollHeight);
|
508 |
-
await delay(
|
509 |
|
510 |
-
//
|
511 |
container.scrollTo(0, 0);
|
512 |
await delay(1000);
|
513 |
}
|
514 |
-
return
|
515 |
""")
|
516 |
|
517 |
-
# Count pages
|
518 |
page_count = await page.evaluate("""
|
519 |
() => {
|
|
|
520 |
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
|
521 |
-
return pages.length;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
522 |
}
|
523 |
""")
|
524 |
|
|
|
525 |
if page_count == 0:
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
|
538 |
-
c.drawImage(screenshot_path, 0, 0, width, height)
|
539 |
-
c.save()
|
540 |
-
|
541 |
-
# Clean up
|
542 |
-
os.remove(screenshot_path)
|
543 |
-
os.rmdir(temp_dir)
|
544 |
-
else:
|
545 |
-
# Create temp directory for page screenshots
|
546 |
-
temp_dir = tempfile.mkdtemp()
|
547 |
-
screenshots = []
|
548 |
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
558 |
await delay(500);
|
|
|
559 |
}}
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
const
|
570 |
-
const
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
}}
|
576 |
}}
|
577 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
578 |
|
579 |
-
|
580 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
581 |
|
582 |
-
#
|
583 |
-
|
584 |
-
|
585 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
586 |
|
587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
588 |
img = Image.open(screenshots[0])
|
589 |
img_width, img_height = img.size
|
590 |
-
|
591 |
c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
|
592 |
|
593 |
for screenshot in screenshots:
|
594 |
-
|
595 |
-
|
596 |
-
|
|
|
|
|
597 |
|
598 |
c.save()
|
599 |
-
|
600 |
-
|
601 |
-
|
|
|
602 |
os.remove(screenshot)
|
603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
604 |
else:
|
605 |
-
# For
|
606 |
-
|
607 |
-
screenshot_path = os.path.join(
|
608 |
await page.screenshot(path=screenshot_path, full_page=True)
|
609 |
|
610 |
-
#
|
611 |
-
|
612 |
-
|
613 |
-
if ext.lower() == '.pdf':
|
614 |
# Convert to PDF
|
615 |
from PIL import Image
|
616 |
from reportlab.pdfgen import canvas as pdf_canvas
|
@@ -625,14 +801,13 @@ class DownloadManager:
|
|
625 |
|
626 |
# Clean up
|
627 |
os.remove(screenshot_path)
|
628 |
-
os.rmdir(temp_dir)
|
629 |
|
630 |
# Close browser
|
631 |
await browser.close()
|
632 |
|
633 |
# Verify file exists and is not empty
|
634 |
if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
|
635 |
-
logger.info(f"Successfully downloaded
|
636 |
return save_path
|
637 |
else:
|
638 |
logger.error(f"Failed to create valid file at {save_path}")
|
|
|
442 |
return None
|
443 |
|
444 |
async def force_download_viewonly(self, file_info, save_path):
|
445 |
+
"""Last-resort method to download view-only Google Drive files - improved for multi-page PDFs"""
|
446 |
try:
|
447 |
# Extract file ID from URL
|
448 |
file_id = None
|
|
|
469 |
# Launch a new browser context with higher resolution
|
470 |
browser = await self.playwright.chromium.launch(
|
471 |
headless=True,
|
472 |
+
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security']
|
473 |
)
|
474 |
|
475 |
+
# Use a larger viewport for better quality
|
476 |
context = await browser.new_context(
|
477 |
+
viewport={'width': 1920, 'height': 1080},
|
478 |
user_agent=get_random_user_agent(),
|
479 |
device_scale_factor=2.0 # Higher resolution for better quality
|
480 |
)
|
|
|
483 |
|
484 |
# Navigate to the file
|
485 |
try:
|
486 |
+
logger.info(f"Opening view-only file: https://drive.google.com/file/d/{file_id}/view")
|
487 |
await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
|
488 |
wait_until='networkidle',
|
489 |
+
timeout=90000) # Longer timeout for large PDFs
|
490 |
|
491 |
# Wait for content to load fully
|
492 |
await page.wait_for_timeout(5000)
|
493 |
|
494 |
+
# Detect if it's a PDF
|
495 |
is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
|
496 |
|
497 |
if is_pdf:
|
498 |
+
# For PDFs: Multi-page capture approach
|
499 |
+
logger.info("Detected PDF, using multi-page capture approach")
|
500 |
|
501 |
+
# First, try to find the viewer container
|
502 |
+
viewer_container = await page.query_selector('.drive-viewer-paginated-scrollable')
|
503 |
+
|
504 |
+
if not viewer_container:
|
505 |
+
logger.warning("Could not find standard PDF viewer container, trying alternatives")
|
506 |
+
viewer_container = await page.query_selector('.drive-viewer-content') or \
|
507 |
+
await page.query_selector('#drive-pdf-viewer') or \
|
508 |
+
await page.query_selector('.drive-viewer')
|
509 |
+
|
510 |
+
if not viewer_container:
|
511 |
+
# Take a single screenshot as fallback
|
512 |
+
logger.warning("Could not find any PDF viewer container, using fallback")
|
513 |
+
screenshot_path = os.path.join(tempfile.gettempdir(), "gdrive_pdf_fallback.png")
|
514 |
+
await page.screenshot(path=screenshot_path, full_page=True)
|
515 |
+
|
516 |
+
# Convert to PDF
|
517 |
+
from PIL import Image
|
518 |
+
from reportlab.pdfgen import canvas as pdf_canvas
|
519 |
+
img = Image.open(screenshot_path)
|
520 |
+
width, height = img.size
|
521 |
+
c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
|
522 |
+
c.drawImage(screenshot_path, 0, 0, width, height)
|
523 |
+
c.save()
|
524 |
+
os.remove(screenshot_path)
|
525 |
+
return save_path
|
526 |
+
|
527 |
+
# Scroll through to load all pages first
|
528 |
+
logger.info("Pre-loading all PDF pages...")
|
529 |
await page.evaluate("""
|
530 |
+
async function preloadAllPages() {
|
531 |
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
|
532 |
const container = document.querySelector('.drive-viewer-paginated-scrollable');
|
533 |
if (!container) return;
|
534 |
|
535 |
+
// Scroll to bottom to force all pages to load
|
536 |
+
const initialScroll = container.scrollTop;
|
537 |
container.scrollTo(0, container.scrollHeight);
|
538 |
+
await delay(3000); // Wait for loading
|
539 |
|
540 |
+
// Scroll back to top
|
541 |
container.scrollTo(0, 0);
|
542 |
await delay(1000);
|
543 |
}
|
544 |
+
return preloadAllPages();
|
545 |
""")
|
546 |
|
547 |
+
# Count visible pages - critical step that needs to be fixed
|
548 |
page_count = await page.evaluate("""
|
549 |
() => {
|
550 |
+
// Try multiple selectors for pages
|
551 |
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
|
552 |
+
if (pages.length > 0) return pages.length;
|
553 |
+
|
554 |
+
// Alternative selectors if standard one fails
|
555 |
+
const altPages = document.querySelectorAll('.drive-viewer-page');
|
556 |
+
if (altPages.length > 0) return altPages.length;
|
557 |
+
|
558 |
+
// Try to find page numbers in navigation
|
559 |
+
const pageNav = document.querySelector('.drive-viewer-paginated-counter');
|
560 |
+
if (pageNav) {
|
561 |
+
const text = pageNav.textContent || '';
|
562 |
+
const match = text.match(/(\d+)\s*\/\s*(\d+)/);
|
563 |
+
if (match && match[2]) return parseInt(match[2]);
|
564 |
+
}
|
565 |
+
|
566 |
+
return 0; // Fallback
|
567 |
}
|
568 |
""")
|
569 |
|
570 |
+
# If no pages found but we know it's a PDF, manually check for page counter
|
571 |
if page_count == 0:
|
572 |
+
# Try to find the page counter text and extract total pages
|
573 |
+
page_counter_text = await page.evaluate("""
|
574 |
+
() => {
|
575 |
+
const elements = Array.from(document.querySelectorAll('*'));
|
576 |
+
for (const el of elements) {
|
577 |
+
const text = el.textContent || '';
|
578 |
+
if (text.match(/\d+\s*\/\s*\d+/)) return text;
|
579 |
+
}
|
580 |
+
return '';
|
581 |
+
}
|
582 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
|
584 |
+
if page_counter_text:
|
585 |
+
match = re.search(r'(\d+)\s*\/\s*(\d+)', page_counter_text)
|
586 |
+
if match and match.group(2):
|
587 |
+
page_count = int(match.group(2))
|
588 |
+
logger.info(f"Detected {page_count} pages from page counter")
|
589 |
+
|
590 |
+
# If we still have no page count, default to a reasonable number
|
591 |
+
if page_count == 0:
|
592 |
+
logger.warning("Could not detect page count, defaulting to 50 pages to be safe")
|
593 |
+
page_count = 50 # Try to capture up to 50 pages by default
|
594 |
+
|
595 |
+
logger.info(f"Found {page_count} pages in PDF")
|
596 |
+
|
597 |
+
# Create a temporary directory for screenshots
|
598 |
+
temp_dir = tempfile.mkdtemp()
|
599 |
+
screenshots = []
|
600 |
+
|
601 |
+
# Function to scroll to a specific page and take a screenshot
|
602 |
+
async def capture_page(page_num):
    """Bring one PDF page into view in the Drive viewer and screenshot it.

    Uses ``page``, ``temp_dir`` and ``logger`` from the enclosing scope.
    Navigation is attempted in the browser first (scroll to the page
    element, then the page-number input, then the page selector dropdown);
    if none of those work, the Previous/Next buttons are used instead.

    Args:
        page_num: Zero-based index of the page to capture.

    Returns:
        Path of the PNG written into ``temp_dir``
        (``page_<page_num + 1>.png``).
    """
    # Passed to page.evaluate() as a function expression, so Playwright
    # invokes it with page_num as its argument. A function declaration
    # followed by a top-level `return` is not a valid expression and fails
    # with a SyntaxError when evaluated.
    scroll_to_page_js = """
    async (pageNum) => {
        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

        // Preferred: scroll the page element itself into view
        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
        if (pages.length > 0 && pageNum < pages.length) {
            pages[pageNum].scrollIntoView({behavior: 'instant', block: 'center'});
            await delay(500);
            return true;
        }

        // Alternative: type the 1-based page number into the page input
        const pageInput = document.querySelector('input[aria-label="Page"]');
        if (pageInput) {
            // Use the native setter so framework-managed inputs see the change
            const nativeInputValueSetter = Object.getOwnPropertyDescriptor(window.HTMLInputElement.prototype, "value").set;
            nativeInputValueSetter.call(pageInput, pageNum + 1);

            pageInput.dispatchEvent(new Event('input', { bubbles: true }));
            pageInput.dispatchEvent(new Event('change', { bubbles: true }));

            // Press Enter to navigate
            const keyEvent = new KeyboardEvent('keydown', {
                key: 'Enter',
                code: 'Enter',
                keyCode: 13,
                which: 13,
                bubbles: true
            });
            pageInput.dispatchEvent(keyEvent);

            await delay(1000);  // Wait for navigation
            return true;
        }

        // Alternative: use page selector dropdown if available
        const pageSelector = document.querySelector('.drive-viewer-paginated-page-selector');
        if (pageSelector) {
            pageSelector.click();
            await delay(300);

            const options = document.querySelectorAll('.drive-viewer-paginated-page-selector-option');
            if (options.length > pageNum) {
                options[pageNum].click();
                await delay(1000);
                return true;
            }
        }

        return false;
    }
    """
    success = await page.evaluate(scroll_to_page_js, page_num)

    if not success:
        # Alternative: Try using the page navigation buttons
        logger.info(f"Using alternative navigation for page {page_num + 1}")

        next_button = await page.query_selector('button[aria-label="Next page"]')
        prev_button = await page.query_selector('button[aria-label="Previous page"]')

        if page_num == 0 and prev_button:
            # Rewind to the first page before the first capture.
            for _ in range(50):  # Limit to avoid infinite loop
                # A bare boolean `disabled` attribute has the value '',
                # so test for presence rather than truthiness.
                if await prev_button.get_attribute('disabled') is not None:
                    break
                await prev_button.click()
                await page.wait_for_timeout(300)

        if page_num > 0 and next_button:
            # Pages are captured in ascending order, so the viewer is still
            # on the previous page: advance exactly one page. Clicking
            # page_num times on every call would overshoot cumulatively.
            if await next_button.get_attribute('disabled') is None:
                await next_button.click()
                await page.wait_for_timeout(500)

        # Wait for the page content to load
        await page.wait_for_timeout(1000)

    # Wait for page to stabilize
    await page.wait_for_timeout(500)

    screenshot_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")

    # Work out the clip rectangle: the first page element intersecting the
    # viewport, else the scroll container, else null (whole visible area).
    current_page_element = await page.evaluate("""
    () => {
        const toClip = rect => ({
            x: Math.max(0, rect.left),
            y: Math.max(0, rect.top),
            width: Math.min(window.innerWidth, rect.width),
            height: Math.min(window.innerHeight, rect.bottom - rect.top)
        });

        // First try getting the current visible page
        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
        for (const page of pages) {
            const rect = page.getBoundingClientRect();
            if (rect.top < window.innerHeight && rect.bottom > 0) {
                return toClip(rect);
            }
        }

        // Fallback: try to find the container
        const container = document.querySelector('.drive-viewer-paginated-scrollable');
        if (container) {
            return toClip(container.getBoundingClientRect());
        }

        // Last resort: screenshot the visible area
        return null;
    }
    """)

    if current_page_element:
        # Screenshot the specific page element
        await page.screenshot(path=screenshot_path, clip=current_page_element)
    else:
        # Screenshot the entire visible area
        await page.screenshot(path=screenshot_path)

    return screenshot_path
|
737 |
+
|
738 |
+
# Capture all pages
|
739 |
+
for i in range(page_count):
|
740 |
+
logger.info(f"Capturing page {i+1} of {page_count}")
|
741 |
+
screenshot_path = await capture_page(i)
|
742 |
+
screenshots.append(screenshot_path)
|
743 |
+
# Add progress indicator
|
744 |
+
if (i+1) % 5 == 0 or i+1 == page_count:
|
745 |
+
logger.info(f"Progress: {i+1}/{page_count} pages captured")
|
746 |
+
|
747 |
+
# Combine screenshots into a PDF
|
748 |
+
from PIL import Image
|
749 |
+
from reportlab.lib.pagesizes import letter
|
750 |
+
from reportlab.pdfgen import canvas as pdf_canvas
|
751 |
+
|
752 |
+
logger.info(f"Combining {len(screenshots)} screenshots into PDF")
|
753 |
+
|
754 |
+
# Use the first image dimensions to set PDF size if available
|
755 |
+
if screenshots:
|
756 |
img = Image.open(screenshots[0])
|
757 |
img_width, img_height = img.size
|
|
|
758 |
c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
|
759 |
|
760 |
for screenshot in screenshots:
|
761 |
+
# Check if file exists and has content
|
762 |
+
if os.path.exists(screenshot) and os.path.getsize(screenshot) > 0:
|
763 |
+
img = Image.open(screenshot)
|
764 |
+
c.drawImage(screenshot, 0, 0, img_width, img_height)
|
765 |
+
c.showPage()
|
766 |
|
767 |
c.save()
|
768 |
+
|
769 |
+
# Clean up screenshots
|
770 |
+
for screenshot in screenshots:
|
771 |
+
if os.path.exists(screenshot):
|
772 |
os.remove(screenshot)
|
773 |
+
os.rmdir(temp_dir)
|
774 |
+
|
775 |
+
# Verify the PDF was created successfully
|
776 |
+
if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
|
777 |
+
logger.info(f"Successfully created PDF with {len(screenshots)} pages")
|
778 |
+
return save_path
|
779 |
+
else:
|
780 |
+
logger.error("Failed to create PDF from screenshots")
|
781 |
+
return None
|
782 |
else:
|
783 |
+
# For non-PDF files: take a single screenshot
|
784 |
+
logger.info("Non-PDF file detected, taking single screenshot")
|
785 |
+
screenshot_path = os.path.join(tempfile.gettempdir(), "screenshot.png")
|
786 |
await page.screenshot(path=screenshot_path, full_page=True)
|
787 |
|
788 |
+
# Convert to requested format if needed
|
789 |
+
if save_path.lower().endswith('.pdf'):
|
|
|
|
|
790 |
# Convert to PDF
|
791 |
from PIL import Image
|
792 |
from reportlab.pdfgen import canvas as pdf_canvas
|
|
|
801 |
|
802 |
# Clean up
|
803 |
os.remove(screenshot_path)
|
|
|
804 |
|
805 |
# Close browser
|
806 |
await browser.close()
|
807 |
|
808 |
# Verify file exists and is not empty
|
809 |
if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
|
810 |
+
logger.info(f"Successfully downloaded file to {save_path}")
|
811 |
return save_path
|
812 |
else:
|
813 |
logger.error(f"Failed to create valid file at {save_path}")
|