Update app.py
app.py
CHANGED
@@ -37,7 +37,7 @@ from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 from sklearn.cluster import KMeans
 import numpy as np
-
+import base64
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',

@@ -388,12 +388,12 @@ class DownloadManager:
                 counter += 1
             os.makedirs(save_dir, exist_ok=True)
             try:
-                if "drive.google.com" in file_url:
-
-
-                    if
-
-
+                if "drive.google.com" in file_url or "docs.google.com" in file_url:
+                    # Use enhanced Google Drive downloader
+                    success = await self.download_from_google_drive(file_url, path)
+                    return path if success else None
+
+                # Original code for non-Google Drive downloads
                 async with self.context.new_page() as page:
                     headers = {
                         'Accept': '*/*',
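The new branch routes any Google-hosted link to the dedicated downloader before the generic Playwright path runs. A standalone look at which URLs the predicate catches (the sample URLs are invented for this sketch):

    # Sketch of the dispatch predicate from the hunk above; sample URLs are hypothetical.
    sample_urls = [
        "https://drive.google.com/file/d/FILE_ID/view",
        "https://docs.google.com/document/d/FILE_ID/edit",
        "https://example.com/report.pdf",
    ]
    for file_url in sample_urls:
        is_drive = "drive.google.com" in file_url or "docs.google.com" in file_url
        print(file_url, "->", "download_from_google_drive" if is_drive else "generic download path")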
@@ -413,6 +413,275 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
 
+    async def download_from_google_drive(self, url, save_path):
+        """Enhanced method to download from Google Drive with multiple fallback approaches"""
+        # Extract the file ID from different URL formats
+        file_id = None
+        url_patterns = [
+            r'drive\.google\.com/file/d/([^/]+)',
+            r'drive\.google\.com/open\?id=([^&]+)',
+            r'docs\.google\.com/\w+/d/([^/]+)',
+            r'id=([^&]+)',
+            r'drive\.google\.com/uc\?id=([^&]+)',
+        ]
+
+        for pattern in url_patterns:
+            match = re.search(pattern, url)
+            if match:
+                file_id = match.group(1)
+                break
+
+        if not file_id:
+            logger.error(f"Could not extract file ID from URL: {url}")
+            return False
+
+        # Approach 1: Try with gdown first (when it works)
+        try:
+            import gdown
+            output = gdown.download(url, save_path, quiet=False, fuzzy=True)
+            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                logger.info(f"Successfully downloaded with gdown: {url}")
+                return True
+        except Exception as e:
+            logger.warning(f"gdown download failed: {e}")
+
+        # Approach 2: Use Playwright session with cookies
+        try:
+            async with self.context.new_page() as page:
+                # Visit the file viewing page to get cookies
+                view_url = f"https://drive.google.com/file/d/{file_id}/view"
+                await page.goto(view_url, wait_until='networkidle', timeout=60000)
+
+                # Check for view-only permissions
+                if await page.query_selector('text="the owner has not granted you permission to download this file"'):
+                    logger.warning("File has view-only permissions, attempting workaround")
+
+                    # Check if it's a PDF (we can use the JS method)
+                    is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
+                    if is_pdf:
+                        # Try JavaScript PDF capture approach for PDFs
+                        success = await self.download_viewonly_pdf_with_js(page, save_path)
+                        if success:
+                            return True
+
+                    # Try direct download attempt for view-only files
+                    cookies = await page.context.cookies()
+                    cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+
+                    # Try download URL with custom headers and cookies
+                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
+                    await page.goto(download_url, wait_until='networkidle', timeout=60000)
+
+                    headers = {
+                        'User-Agent': get_random_user_agent(),
+                        'Cookie': cookie_str,
+                        'Accept': '*/*',
+                    }
+
+                    response = await page.request.get(download_url, headers=headers)
+                    if response.status == 200:
+                        content = await response.body()
+                        with open(save_path, 'wb') as f:
+                            f.write(content)
+                        return True
+
+                # Standard download flow for files with download permission
+                download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                await page.goto(download_url, wait_until='networkidle', timeout=60000)
+
+                # Handle large files with confirmation
+                confirm_form = await page.query_selector('form#download-form')
+                if confirm_form:
+                    await confirm_form.evaluate('form => form.submit()')
+                    await page.wait_for_load_state('networkidle')
+
+                # Get cookies after confirmation
+                cookies = await page.context.cookies()
+                cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+
+                # Get final download URL with confirmation token
+                download_url = f"https://drive.google.com/uc?export=download&id={file_id}&confirm=t"
+
+                response = await page.request.get(download_url, headers={'Cookie': cookie_str})
+                if response.status == 200:
+                    content = await response.body()
+                    with open(save_path, 'wb') as f:
+                        f.write(content)
+                    return True
+        except Exception as e:
+            logger.warning(f"Playwright download approach failed: {e}")
+
+        # Approach 3: Try with requests and session cookies
+        try:
+            import requests
+
+            session = requests.Session()
+            session.headers.update({'User-Agent': get_random_user_agent()})
+
+            # Get the initial page to obtain cookies
+            url = f"https://drive.google.com/uc?id={file_id}&export=download"
+            response = session.get(url, stream=True, timeout=30)
+
+            # Check for the download confirmation
+            confirmation_token = None
+            for k, v in response.cookies.items():
+                if k.startswith('download_warning'):
+                    confirmation_token = v
+                    break
+
+            # Use the confirmation token if found
+            if confirmation_token:
+                url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm={confirmation_token}"
+
+            # Download the file
+            response = session.get(url, stream=True, timeout=60)
+            with open(save_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=1024*1024):
+                    if chunk:
+                        f.write(chunk)
+
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                return True
+        except Exception as e:
+            logger.warning(f"Requests session download failed: {e}")
+
+        # All approaches failed
+        logger.error(f"All download attempts failed for: {url}")
+        return False
+
+    async def download_viewonly_pdf_with_js(self, page, save_path):
+        """Use JavaScript approach to download view-only PDFs from Google Drive"""
+        try:
+            logger.info("Attempting to download view-only PDF using JavaScript method")
+
+            # Scroll to ensure all pages are loaded
+            await page.evaluate("""
+                async function scrollToBottom() {
+                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                    const container = document.querySelector('.drive-viewer-paginated-scrollable');
+                    if (!container) return;
+
+                    const scrollHeight = container.scrollHeight;
+                    const viewportHeight = container.clientHeight;
+                    const scrollStep = viewportHeight / 2;
+
+                    for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
+                        container.scrollTo(0, scrollPos);
+                        await delay(500);
+                    }
+
+                    // Final scroll to ensure we reached the bottom
+                    container.scrollTo(0, scrollHeight);
+                    await delay(1000);
+                }
+
+                return scrollToBottom();
+            """)
+
+            # Wait for a moment to ensure all images are loaded
+            await page.wait_for_timeout(3000)
+
+            # Inject the jsPDF library
+            await page.evaluate("""
+                return new Promise((resolve, reject) => {
+                    const script = document.createElement('script');
+                    script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
+                    script.onload = () => resolve(true);
+                    script.onerror = () => reject(new Error('Failed to load jsPDF'));
+                    document.head.appendChild(script);
+                });
+            """)
+
+            # Wait for the library to load
+            await page.wait_for_timeout(1000)
+
+            # Execute the PDF creation script
+            pdf_data = await page.evaluate("""
+                return new Promise(async (resolve) => {
+                    // Make sure jsPDF is loaded
+                    if (typeof window.jspdf === 'undefined') {
+                        window.jspdf = window.jspdf || {};
+                    }
+
+                    // Use the jsPDF library
+                    const { jsPDF } = window.jspdf;
+                    const pdf = new jsPDF();
+
+                    const images = Array.from(document.querySelectorAll('img')).filter(img =>
+                        img.src.startsWith('blob:') && img.width > 100 && img.height > 100
+                    );
+
+                    if (images.length === 0) {
+                        resolve(null);
+                        return;
+                    }
+
+                    for (let i = 0; i < images.length; i++) {
+                        const img = images[i];
+
+                        // Create canvas and draw image
+                        const canvas = document.createElement('canvas');
+                        canvas.width = img.width;
+                        canvas.height = img.height;
+                        const ctx = canvas.getContext('2d');
+                        ctx.drawImage(img, 0, 0, img.width, img.height);
+
+                        // Add image to PDF
+                        const imgData = canvas.toDataURL('image/jpeg', 1.0);
+
+                        // Add a new page for each image except the first one
+                        if (i > 0) {
+                            pdf.addPage();
+                        }
+
+                        // Calculate dimensions to fit page
+                        const pageWidth = pdf.internal.pageSize.getWidth();
+                        const pageHeight = pdf.internal.pageSize.getHeight();
+                        const imgRatio = img.height / img.width;
+
+                        let imgWidth = pageWidth;
+                        let imgHeight = imgWidth * imgRatio;
+
+                        // If height exceeds page, scale down
+                        if (imgHeight > pageHeight) {
+                            imgHeight = pageHeight;
+                            imgWidth = imgHeight / imgRatio;
+                        }
+
+                        // Center image on page
+                        const x = (pageWidth - imgWidth) / 2;
+                        const y = (pageHeight - imgHeight) / 2;
+
+                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                    }
+
+                    // Get the PDF as base64
+                    const pdfBase64 = pdf.output('datauristring');
+                    resolve(pdfBase64);
+                });
+            """)
+
+            if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
+                logger.warning("Failed to generate PDF with JavaScript method")
+                return False
+
+            # Extract the base64 data and save to file
+            base64_data = pdf_data.replace('data:application/pdf;base64,', '')
+            pdf_bytes = base64.b64decode(base64_data)
+
+            with open(save_path, 'wb') as f:
+                f.write(pdf_bytes)
+
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                logger.info("Successfully downloaded view-only PDF using JavaScript method")
+                return True
+            else:
+                return False
+
+        except Exception as e:
+            logger.error(f"Error in JavaScript PDF download method: {e}")
+            return False
+
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list:
             custom_ext_list = []
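download_from_google_drive stops at the first pattern in url_patterns that matches, so the shape-specific Drive and Docs patterns are consulted before the generic id= fallback. A standalone sketch of that extraction loop, using the patterns verbatim from the commit (the sample links and IDs below are invented):

    import re

    # Patterns copied from download_from_google_drive; first match wins.
    url_patterns = [
        r'drive\.google\.com/file/d/([^/]+)',
        r'drive\.google\.com/open\?id=([^&]+)',
        r'docs\.google\.com/\w+/d/([^/]+)',
        r'id=([^&]+)',
        r'drive\.google\.com/uc\?id=([^&]+)',
    ]

    def extract_file_id(url):
        for pattern in url_patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    # Hypothetical links; the IDs are placeholders.
    print(extract_file_id("https://drive.google.com/file/d/FILE_ID/view?usp=sharing"))  # FILE_ID
    print(extract_file_id("https://docs.google.com/spreadsheets/d/FILE_ID/edit"))       # FILE_ID
    print(extract_file_id("https://example.com/no-drive-link"))                         # None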
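Each fallback in download_from_google_drive treats any non-empty file as success. That test can pass even when Drive serves an HTML interstitial (quota exceeded, permission denied) instead of the requested file. A supplementary sanity check on the leading bytes would catch that case for PDFs; the helpers below are a sketch, not part of the commit:

    # Sketch only: distinguish a real PDF from an HTML error page by magic bytes.
    def looks_like_pdf(path):
        with open(path, 'rb') as f:
            return f.read(5) == b'%PDF-'

    def looks_like_html_page(path):
        with open(path, 'rb') as f:
            head = f.read(512).lstrip().lower()
        return head.startswith(b'<!doctype html') or head.startswith(b'<html')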
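The JavaScript capture path returns the generated PDF as a base64 data URI, which is what the import base64 added in the first hunk is for. One caveat: some jsPDF builds emit extra parameters (for example a filename) between the media type and ;base64, so the strict data:application/pdf;base64, prefix check may be brittle across versions; that is an observation, not something the commit addresses. A minimal round trip of the strip-and-decode step, with a fake payload:

    import base64

    # Fake payload standing in for jsPDF output; real PDFs begin with %PDF-.
    pdf_bytes_in = b'%PDF-1.4 fake payload'
    data_uri = 'data:application/pdf;base64,' + base64.b64encode(pdf_bytes_in).decode('ascii')

    # Same strip-and-decode sequence as download_viewonly_pdf_with_js:
    base64_data = data_uri.replace('data:application/pdf;base64,', '')
    pdf_bytes_out = base64.b64decode(base64_data)
    assert pdf_bytes_out == pdf_bytes_in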