Update app.py
Browse files
app.py
CHANGED
@@ -352,29 +352,26 @@ class DownloadManager:
|
|
352 |
file_id = match.group(1)
|
353 |
break
|
354 |
if file_id:
|
355 |
-
#
|
|
|
|
|
|
|
356 |
filename = f"gdrive_{file_id}"
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
'
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
found_files.append({
|
373 |
-
'url': href,
|
374 |
-
'filename': filename,
|
375 |
-
'size': "Unknown Size",
|
376 |
-
'metadata': {'file_id': file_id}
|
377 |
-
})
|
378 |
|
379 |
seen_urls = set()
|
380 |
unique_files = []
|
@@ -397,13 +394,33 @@ class DownloadManager:
|
|
397 |
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
398 |
counter += 1
|
399 |
os.makedirs(save_dir, exist_ok=True)
|
|
|
400 |
try:
|
401 |
# Special handling for Google Drive files
|
402 |
if "drive.google.com" in file_url or "docs.google.com" in file_url:
|
403 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
404 |
success = await self.download_from_google_drive(file_url, path)
|
405 |
-
|
|
|
406 |
|
|
|
|
|
|
|
|
|
|
|
407 |
# Original code for non-Google Drive downloads
|
408 |
async with self.context.new_page() as page:
|
409 |
headers = {
|
@@ -424,6 +441,213 @@ class DownloadManager:
|
|
424 |
logger.error(f"Error downloading {file_url}: {e}")
|
425 |
return None
|
426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
async def download_from_google_drive(self, url, save_path):
|
428 |
"""Enhanced method to download from Google Drive with multiple fallback approaches"""
|
429 |
# Extract the file ID from different URL formats
|
@@ -531,60 +755,7 @@ class DownloadManager:
|
|
531 |
except Exception as e:
|
532 |
logger.warning(f"Requests session download failed: {e}")
|
533 |
|
534 |
-
|
535 |
-
if is_view_only:
|
536 |
-
try:
|
537 |
-
# Try a direct headless browser download
|
538 |
-
async with self.context.new_page() as page:
|
539 |
-
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
|
540 |
-
|
541 |
-
# Try to capture the content directly from viewer
|
542 |
-
file_content = await page.evaluate("""
|
543 |
-
() => {
|
544 |
-
// Try to find the actual viewer content
|
545 |
-
const viewerContent = document.querySelector('.drive-viewer-paginated-content');
|
546 |
-
if (viewerContent) {
|
547 |
-
return viewerContent.innerHTML;
|
548 |
-
}
|
549 |
-
return document.documentElement.innerHTML;
|
550 |
-
}
|
551 |
-
""")
|
552 |
-
|
553 |
-
if file_content:
|
554 |
-
# Save as HTML and then we can convert it if needed
|
555 |
-
html_path = f"{base}.html"
|
556 |
-
with open(html_path, 'w', encoding='utf-8') as f:
|
557 |
-
f.write(f"""
|
558 |
-
<!DOCTYPE html>
|
559 |
-
<html>
|
560 |
-
<head><title>Google Drive Extracted Content</title></head>
|
561 |
-
<body>
|
562 |
-
{file_content}
|
563 |
-
</body>
|
564 |
-
</html>
|
565 |
-
""")
|
566 |
-
|
567 |
-
# If requested a PDF, convert HTML to PDF
|
568 |
-
if file_type == 'pdf' or ext.lower() == '.pdf':
|
569 |
-
try:
|
570 |
-
import pdfkit
|
571 |
-
pdfkit.from_file(html_path, save_path)
|
572 |
-
os.remove(html_path) # Clean up HTML file
|
573 |
-
return True
|
574 |
-
except Exception as pdf_err:
|
575 |
-
logger.warning(f"Error converting HTML to PDF: {pdf_err}")
|
576 |
-
# Keep the HTML file as fallback
|
577 |
-
shutil.copy(html_path, save_path)
|
578 |
-
return True
|
579 |
-
else:
|
580 |
-
# Just use the HTML file
|
581 |
-
shutil.copy(html_path, save_path)
|
582 |
-
return True
|
583 |
-
except Exception as e:
|
584 |
-
logger.warning(f"Final direct browser capture failed: {e}")
|
585 |
-
|
586 |
-
# All methods failed
|
587 |
-
logger.error(f"All download approaches failed for Google Drive file: {file_id}")
|
588 |
return False
|
589 |
|
590 |
async def get_google_drive_file_info(self, file_id):
|
|
|
352 |
file_id = match.group(1)
|
353 |
break
|
354 |
if file_id:
|
355 |
+
# Get file info to determine type and view-only status
|
356 |
+
file_type, is_view_only = await self.get_google_drive_file_info(file_id)
|
357 |
+
|
358 |
+
# Create a more informative filename based on info
|
359 |
filename = f"gdrive_{file_id}"
|
360 |
+
if file_type:
|
361 |
+
filename = f"{filename}.{file_type}"
|
362 |
+
|
363 |
+
size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
|
364 |
+
|
365 |
+
found_files.append({
|
366 |
+
'url': href, # Use original URL
|
367 |
+
'filename': filename,
|
368 |
+
'size': size_str,
|
369 |
+
'metadata': {
|
370 |
+
'view_only': is_view_only,
|
371 |
+
'file_type': file_type,
|
372 |
+
'file_id': file_id
|
373 |
+
}
|
374 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
seen_urls = set()
|
377 |
unique_files = []
|
|
|
394 |
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
395 |
counter += 1
|
396 |
os.makedirs(save_dir, exist_ok=True)
|
397 |
+
|
398 |
try:
|
399 |
# Special handling for Google Drive files
|
400 |
if "drive.google.com" in file_url or "docs.google.com" in file_url:
|
401 |
+
# Check if it's marked as view-only in metadata
|
402 |
+
is_view_only = file_info.get('metadata', {}).get('view_only', False)
|
403 |
+
|
404 |
+
# For view-only files, try our most robust approach first
|
405 |
+
if is_view_only:
|
406 |
+
logger.info(f"Attempting to download view-only file: {file_url}")
|
407 |
+
result_path = await self.force_download_viewonly(file_info, path)
|
408 |
+
if result_path:
|
409 |
+
return result_path
|
410 |
+
|
411 |
+
# If that failed, try the regular download approach
|
412 |
+
logger.info("Primary method failed, trying fallback methods")
|
413 |
+
|
414 |
+
# Try regular download methods
|
415 |
success = await self.download_from_google_drive(file_url, path)
|
416 |
+
if success:
|
417 |
+
return path
|
418 |
|
419 |
+
# If all methods failed for Google Drive, try one last approach
|
420 |
+
logger.warning("All standard methods failed, attempting force download")
|
421 |
+
result_path = await self.force_download_viewonly(file_info, path)
|
422 |
+
return result_path if result_path else None
|
423 |
+
|
424 |
# Original code for non-Google Drive downloads
|
425 |
async with self.context.new_page() as page:
|
426 |
headers = {
|
|
|
441 |
logger.error(f"Error downloading {file_url}: {e}")
|
442 |
return None
|
443 |
|
444 |
+
async def force_download_viewonly(self, file_info, save_path):
    """Last-resort capture of a view-only Google Drive file via screenshots.

    Opens the Drive viewer for the file in a dedicated headless Chromium
    instance and renders what is shown on screen into ``save_path``.
    The output is a rasterised copy (screenshots), not the original file.

    Args:
        file_info: Mapping with a ``'url'`` entry (the Drive link) and an
            optional ``'metadata'`` mapping whose ``'file_type'`` is used as
            the extension when ``save_path`` has none.
        save_path: Destination path. If it has no extension, one is appended
            (``file_type`` from metadata, defaulting to ``pdf``).

    Returns:
        The path of the written file, or ``None`` if the file ID could not
        be extracted, the capture failed, or the output is missing/empty.
        Errors are logged rather than raised.
    """

    def _images_to_pdf(image_paths, pdf_path):
        """Write one PDF page per image, all sized like the first image."""
        # Imported lazily so a missing optional dependency is reported
        # through this method's normal error handling.
        from PIL import Image
        from reportlab.pdfgen import canvas as pdf_canvas

        # Only the pixel dimensions are needed; close the handle right away.
        with Image.open(image_paths[0]) as first_image:
            width, height = first_image.size

        pdf = pdf_canvas.Canvas(pdf_path, pagesize=(width, height))
        for image_path in image_paths:
            pdf.drawImage(image_path, 0, 0, width, height)
            pdf.showPage()
        pdf.save()

    try:
        # Extract the file ID from the URL. The character classes stop at
        # '?', '&' and '#' so a trailing query string such as
        # "?usp=sharing" is not captured as part of the ID.
        url = file_info['url']
        file_id = None
        for pattern in (r'/file/d/([^/?#]+)', r'id=([^&#]+)'):
            match = re.search(pattern, url)
            if match:
                file_id = match.group(1)
                break

        if not file_id:
            logger.error("Could not extract file ID")
            return None

        logger.info(f"Force downloading view-only file with ID: {file_id}")

        # Make sure we have a file extension. `or 'pdf'` also covers a
        # metadata entry that is present but None/empty, which would
        # otherwise produce a "<name>.None" path.
        base, ext = os.path.splitext(save_path)
        if not ext:
            metadata = file_info.get('metadata') or {}
            file_type = metadata.get('file_type') or 'pdf'
            ext = f".{file_type}"
            save_path = f"{base}{ext}"

        # Launch a separate browser with a larger, high-DPI viewport so the
        # screenshots are sharper.
        browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
        )
        try:
            context = await browser.new_context(
                viewport={'width': 1600, 'height': 1200},
                user_agent=get_random_user_agent(),
                device_scale_factor=2.0  # Higher resolution for better quality
            )
            page = await context.new_page()

            try:
                await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
                                wait_until='networkidle',
                                timeout=60000)

                # Give the viewer extra time to finish rendering.
                await page.wait_for_timeout(5000)

                is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None

                # The context manager removes the screenshots even when a
                # later step raises.
                with tempfile.TemporaryDirectory() as temp_dir:
                    if is_pdf:
                        logger.info("Detected PDF, using page-by-page screenshot approach")

                        # Scroll to the bottom and back so lazily rendered
                        # pages get loaded. This must be a function
                        # expression: a bare top-level `return` is a syntax
                        # error when evaluated as an expression.
                        await page.evaluate("""
                            async () => {
                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                                const container = document.querySelector('.drive-viewer-paginated-scrollable');
                                if (!container) return;

                                // First scroll to bottom to load all pages
                                container.scrollTo(0, container.scrollHeight);
                                await delay(2000);

                                // Then back to top
                                container.scrollTo(0, 0);
                                await delay(1000);
                            }
                        """)

                        page_count = await page.evaluate("""
                            () => document.querySelectorAll('.drive-viewer-paginated-page').length
                        """)

                        if page_count == 0:
                            logger.warning("No pages found, trying alternative method")
                            # Fall back to one full-page screenshot.
                            screenshot_path = os.path.join(temp_dir, "page.png")
                            await page.screenshot(path=screenshot_path, full_page=True)
                            _images_to_pdf([screenshot_path], save_path)
                        else:
                            screenshots = []
                            for index in range(page_count):
                                # Bring the page into view and let it settle.
                                await page.evaluate("""
                                    async (index) => {
                                        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                                        if (pages.length <= index) return;
                                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                                        pages[index].scrollIntoView();
                                        await delay(500);
                                    }
                                """, index)

                                # Nudge the scroll position so the page top
                                # sits about 100px below the viewport top.
                                await page.evaluate("""
                                    (index) => {
                                        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                                        const target = pages[index];
                                        const viewer = document.querySelector('.drive-viewer-paginated-scrollable');
                                        if (target && viewer) {
                                            const rect = target.getBoundingClientRect();
                                            viewer.scrollBy(0, rect.top - 100);
                                        }
                                    }
                                """, index)

                                screenshot_path = os.path.join(temp_dir, f"page_{index + 1}.png")
                                await page.screenshot(path=screenshot_path)
                                screenshots.append(screenshot_path)

                            _images_to_pdf(screenshots, save_path)
                    else:
                        # For other file types: take a single screenshot.
                        screenshot_path = os.path.join(temp_dir, "screenshot.png")
                        await page.screenshot(path=screenshot_path, full_page=True)

                        if ext.lower() == '.pdf':
                            _images_to_pdf([screenshot_path], save_path)
                        else:
                            # NOTE(review): this stores PNG bytes under
                            # whatever extension save_path carries; confirm
                            # callers expect an image here.
                            shutil.copy(screenshot_path, save_path)
            except Exception as e:
                logger.error(f"Error during force download: {e}")
                return None
        finally:
            # Always release the browser, including when context/page
            # creation fails or the capture returns early.
            await browser.close()

        # Verify file exists and is not empty
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            logger.info(f"Successfully downloaded view-only file to {save_path}")
            return save_path

        logger.error(f"Failed to create valid file at {save_path}")
        return None

    except Exception as e:
        logger.error(f"Force download failed: {e}")
        return None
|
650 |
+
|
651 |
async def download_from_google_drive(self, url, save_path):
|
652 |
"""Enhanced method to download from Google Drive with multiple fallback approaches"""
|
653 |
# Extract the file ID from different URL formats
|
|
|
755 |
except Exception as e:
|
756 |
logger.warning(f"Requests session download failed: {e}")
|
757 |
|
758 |
+
logger.warning("Standard download methods failed")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
759 |
return False
|
760 |
|
761 |
async def get_google_drive_file_info(self, file_id):
|