Update app.py
app.py
CHANGED
@@ -38,6 +38,10 @@ from reportlab.pdfgen import canvas
 from sklearn.cluster import KMeans
 import numpy as np
 import base64
+import shutil
+from PIL import Image  # Make sure to pip install Pillow
+from reportlab.pdfgen import canvas
+
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',
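The three imports added above exist to stitch captured page images into PDF files later in this commit. A minimal standalone sketch of that Pillow-plus-reportlab round trip (hypothetical file names; any RGB image works):

```python
from PIL import Image
from reportlab.pdfgen import canvas

# Stand-in for a captured screenshot.
Image.new("RGB", (800, 600), "white").save("page.png")

img = Image.open("page.png")
width, height = img.size

c = canvas.Canvas("page.pdf")
c.setPageSize((width, height))                # one PDF page sized to the image
c.drawImage("page.png", 0, 0, width, height)  # reportlab accepts a file path here
c.showPage()
c.save()
```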
@@ -348,23 +352,29 @@ class DownloadManager:
                             file_id = match.group(1)
                             break
                     if file_id:
-                        filename = file_id
+                        # We'll detect file type during download, so just use the ID for filename initially
+                        filename = f"gdrive_{file_id}"
                         try:
-                            filename = mt.group(1).strip('"').strip()
+                            # Get file info to determine type and size
+                            file_type, is_view_only = await self.get_google_drive_file_info(file_id)
+                            if file_type:
+                                filename = f"{filename}.{file_type}"
+
                             found_files.append({
+                                'url': href,  # Use original URL, as we'll process it specially
                                 'filename': filename,
-                                'metadata': {}
+                                'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"),
+                                'metadata': {'view_only': is_view_only, 'file_type': file_type, 'file_id': file_id}
                             })
                         except Exception as e:
                             logger.error(f"Error processing Google Drive link: {e}")
+                            # Fallback if we can't get info
+                            found_files.append({
+                                'url': href,
+                                'filename': filename,
+                                'size': "Unknown Size",
+                                'metadata': {'file_id': file_id}
+                            })
 
             seen_urls = set()
             unique_files = []
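For reference, a Drive link now yields an entry shaped like this (illustrative values only; the real fields come from get_google_drive_file_info):

```python
entry = {
    'url': 'https://drive.google.com/file/d/FILE_ID/view',
    'filename': 'gdrive_FILE_ID.pdf',
    'size': 'View-only',  # or a real size string for downloadable files
    'metadata': {'view_only': True, 'file_type': 'pdf', 'file_id': 'FILE_ID'},
}
```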
@@ -388,6 +398,7 @@ class DownloadManager:
                             counter += 1
                     os.makedirs(save_dir, exist_ok=True)
                     try:
+                        # Special handling for Google Drive files
                         if "drive.google.com" in file_url or "docs.google.com" in file_url:
                             # Use enhanced Google Drive downloader
                             success = await self.download_from_google_drive(file_url, path)
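The substring test above routes both drive.google.com and docs.google.com links through the new downloader; a quick illustration with sample URLs (not from the codebase):

```python
samples = [
    "https://drive.google.com/file/d/abc123/view",
    "https://docs.google.com/document/d/abc123/edit",
    "https://example.com/report.pdf",
]
for file_url in samples:
    is_drive = "drive.google.com" in file_url or "docs.google.com" in file_url
    print(file_url, "->", "Google Drive handler" if is_drive else "standard download")
```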
@@ -435,251 +446,672 @@ class DownloadManager:
             logger.error(f"Could not extract file ID from URL: {url}")
             return False
 
+        # Determine file type first (important for handling different file types)
+        file_type, is_view_only = await self.get_google_drive_file_info(file_id)
+        logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
 
+        base, ext = os.path.splitext(save_path)
+        if not ext and file_type:
+            # Add the correct extension if missing
+            save_path = f"{base}.{file_type}"
+
-        is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
-        if is_pdf:
-            # Try JavaScript PDF capture approach for PDFs
-            success = await self.download_viewonly_pdf_with_js(page, save_path)
-            if success:
-                return True
-
-        # Try direct download attempt for view-only files
-        cookies = await page.context.cookies()
-        cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
-
-        # Try download URL with custom headers and cookies
-        download_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
-        await page.goto(download_url, wait_until='networkidle', timeout=60000)
-
-        headers = {
-            'User-Agent': get_random_user_agent(),
-            'Cookie': cookie_str,
-            'Accept': '*/*',
-        }
-
-        response = await page.request.get(download_url, headers=headers)
-        if response.status == 200:
-            content = await response.body()
-            with open(save_path, 'wb') as f:
-                f.write(content)
-            return True
-
-        # Standard download flow for files with download permission
-        download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-        await page.goto(download_url, wait_until='networkidle', timeout=60000)
-
-        # Handle large files with confirmation
-        confirm_form = await page.query_selector('form#download-form')
-        if confirm_form:
-            await confirm_form.evaluate('form => form.submit()')
-            await page.wait_for_load_state('networkidle')
-
-        # Get cookies after confirmation
-        cookies = await page.context.cookies()
-        cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+        # For view-only files, use specialized approaches
+        if is_view_only:
+            # Approach 1: For PDFs, use the JS method
+            if file_type == 'pdf':
+                success = await self.download_viewonly_pdf_with_js(file_id, save_path)
+                if success:
+                    return True
 
+            # Approach 2: For Google Docs, Sheets, etc., use export API
+            if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
+                success = await self.export_google_doc(file_id, file_type, save_path)
+                if success:
+                    return True
 
+            # Approach 3: Try the direct screenshot method for any view-only file
+            success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
+            if success:
+                return True
+
+        # Try standard approaches for non-view-only files
+        try:
+            # Try with gdown first
+            import gdown
-            output = gdown.download(url, save_path, quiet=False, fuzzy=True)
-            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                logger.info(f"Successfully downloaded with gdown: {url}")
+            output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
+            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                with open(save_path, 'rb') as f:
+                    content = f.read(100)  # Read first 100 bytes
+                if b'<!DOCTYPE html>' not in content:  # Check not HTML error page
+                    logger.info(f"Successfully downloaded with gdown: {url}")
                     return True
         except Exception as e:
+            logger.warning(f"gdown download failed: {e}")
 
+        # Try with requests and session cookies
         try:
-            import requests
-
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
 
+            # Visit the page first to get cookies
+            session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
+
+            # Try download
             url = f"https://drive.google.com/uc?id={file_id}&export=download"
             response = session.get(url, stream=True, timeout=30)
 
+            # Check for confirmation token
             confirmation_token = None
             for k, v in response.cookies.items():
                 if k.startswith('download_warning'):
                     confirmation_token = v
                     break
 
+            # Use confirmation token if found
             if confirmation_token:
+                url = f"{url}&confirm={confirmation_token}"
+                response = session.get(url, stream=True, timeout=60)
-            # Download the file
-            response = session.get(url, stream=True, timeout=60)
-            with open(save_path, 'wb') as f:
-                for chunk in response.iter_content(chunk_size=1024*1024):
-                    if chunk:
-                        f.write(chunk)
 
+            # Check if we're getting HTML instead of the file
+            content_type = response.headers.get('Content-Type', '')
+            if 'text/html' in content_type:
+                logger.warning("Received HTML instead of file - likely download restriction")
+            else:
+                with open(save_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=1024*1024):
+                        if chunk:
+                            f.write(chunk)
+
+                if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                    with open(save_path, 'rb') as f:
+                        content = f.read(100)
+                    if b'<!DOCTYPE html>' not in content:
+                        logger.info("Successfully downloaded with requests session")
+                        return True
         except Exception as e:
             logger.warning(f"Requests session download failed: {e}")
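The confirmation-token exchange above is self-contained enough to test outside the class; a synchronous sketch of the same logic (FILE_ID and the destination path are placeholders):

```python
import requests

def fetch_drive_file(file_id: str, dest: str) -> bool:
    session = requests.Session()
    url = f"https://drive.google.com/uc?id={file_id}&export=download"
    response = session.get(url, stream=True, timeout=30)

    # Large public files answer with a warning page plus a cookie token.
    token = next((v for k, v in response.cookies.items()
                  if k.startswith('download_warning')), None)
    if token:
        response = session.get(f"{url}&confirm={token}", stream=True, timeout=60)

    if 'text/html' in response.headers.get('Content-Type', ''):
        return False  # interstitial page, not the file
    with open(dest, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            if chunk:
                f.write(chunk)
    return True
```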
 
+        # If all methods failed for view-only file, try one last approach
+        if is_view_only:
+            try:
+                # Try a direct headless browser download
+                async with self.context.new_page() as page:
+                    await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
+
+                    # Try to capture the content directly from viewer
+                    file_content = await page.evaluate("""
+                        () => {
+                            // Try to find the actual viewer content
+                            const viewerContent = document.querySelector('.drive-viewer-paginated-content');
+                            if (viewerContent) {
+                                return viewerContent.innerHTML;
+                            }
+                            return document.documentElement.innerHTML;
+                        }
+                    """)
+
+                    if file_content:
+                        # Save as HTML and then we can convert it if needed
+                        html_path = f"{base}.html"
+                        with open(html_path, 'w', encoding='utf-8') as f:
+                            f.write(f"""
+                            <!DOCTYPE html>
+                            <html>
+                            <head><title>Google Drive Extracted Content</title></head>
+                            <body>
+                            {file_content}
+                            </body>
+                            </html>
+                            """)
+
+                        # If requested a PDF, convert HTML to PDF
+                        if file_type == 'pdf' or ext.lower() == '.pdf':
+                            try:
+                                import pdfkit
+                                pdfkit.from_file(html_path, save_path)
+                                os.remove(html_path)  # Clean up HTML file
+                                return True
+                            except Exception as pdf_err:
+                                logger.warning(f"Error converting HTML to PDF: {pdf_err}")
+                                # Keep the HTML file as fallback
+                                shutil.copy(html_path, save_path)
+                                return True
+                        else:
+                            # Just use the HTML file
+                            shutil.copy(html_path, save_path)
+                            return True
+            except Exception as e:
+                logger.warning(f"Final direct browser capture failed: {e}")
+
+        # All methods failed
+        logger.error(f"All download approaches failed for Google Drive file: {file_id}")
         return False
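The `<!DOCTYPE html>` sniff used in both the gdown and requests paths above would look roughly like this as a reusable helper (a sketch; it inspects only the leading bytes, which is all the method relies on):

```python
import os

def looks_like_real_file(path: str) -> bool:
    """True when path exists, is non-empty, and is not an HTML error page."""
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return False
    with open(path, 'rb') as f:
        head = f.read(100)
    return b'<!DOCTYPE html>' not in head
```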
 
+    async def get_google_drive_file_info(self, file_id):
+        """Get file type and view-only status from Google Drive"""
+        file_type = None
+        is_view_only = False
+
         try:
+            async with self.context.new_page() as page:
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
+
+                # Check if view-only
+                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
+                is_view_only = view_only_text is not None
+
+                # Check for Google Docs viewer
+                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
+                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
+                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
+
+                if gdocs_viewer:
+                    file_type = 'docx'
+                elif gsheets_viewer:
+                    file_type = 'xlsx'
+                elif gslides_viewer:
+                    file_type = 'pptx'
+                else:
+                    # Check for PDF viewer
+                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
+                    if pdf_viewer:
+                        file_type = 'pdf'
+                    else:
+                        # Check for image viewer
+                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
+                        if img_viewer:
+                            # Get image type from src
+                            img_src = await img_viewer.get_attribute('src')
+                            if 'jpg' in img_src or 'jpeg' in img_src:
+                                file_type = 'jpg'
+                            elif 'png' in img_src:
+                                file_type = 'png'
+                            else:
+                                file_type = 'jpg'  # Default to jpg
+                        else:
+                            # Generic file type fallback
+                            file_type = 'pdf'  # Default to PDF
+
+                # If still no type, check filename
+                if not file_type:
+                    title_element = await page.query_selector('div[role="heading"]')
+                    if title_element:
+                        title = await title_element.text_content()
+                        if title:
+                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
+                            if ext_match:
+                                file_type = ext_match.group(1).lower()
+
+        except Exception as e:
+            logger.error(f"Error getting Google Drive file info: {e}")
+            file_type = 'pdf'  # Default to PDF if we can't determine
+
+        return file_type, is_view_only
+
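An illustrative call site for the new helper (handle_drive_link is hypothetical; the three download methods it dispatches to are the ones defined in this commit):

```python
async def handle_drive_link(self, file_id: str, save_path: str) -> bool:
    # Hypothetical wrapper showing how the tuple result drives the strategy choice.
    file_type, is_view_only = await self.get_google_drive_file_info(file_id)
    if is_view_only and file_type == 'pdf':
        return await self.download_viewonly_pdf_with_js(file_id, save_path)
    if is_view_only:
        return await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
    return await self.download_from_google_drive(
        f"https://drive.google.com/file/d/{file_id}/view", save_path)
```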
+    async def download_viewonly_pdf_with_js(self, file_id, save_path):
+        """Download view-only PDF using JavaScript approach - improved version"""
+        try:
+            async with self.context.new_page() as page:
+                # Set viewport size to ensure we capture full pages
+                await page.set_viewport_size({"width": 1200, "height": 1600})
+
+                # Visit the file
+                view_url = f"https://drive.google.com/file/d/{file_id}/view"
+                await page.goto(view_url, wait_until='networkidle', timeout=60000)
+
+                # Wait for rendering
+                await page.wait_for_timeout(2000)
+
+                # Inject required libraries - use CDN for jsPDF
+                await page.evaluate("""
+                    async function injectLibraries() {
+                        // Add jsPDF
+                        return new Promise((resolve) => {
+                            const jspdfScript = document.createElement('script');
+                            jspdfScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
+                            jspdfScript.onload = () => resolve(true);
+                            document.head.appendChild(jspdfScript);
+                        });
+                    }
+                    return injectLibraries();
+                """)
+
+                # Wait for libraries to load
+                await page.wait_for_timeout(2000)
+
+                # Scroll through document to load all pages
+                await page.evaluate("""
+                    async function scrollThroughDocument() {
+                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                        const container = document.querySelector('.drive-viewer-paginated-scrollable');
+                        if (!container) return false;
+
+                        const scrollHeight = container.scrollHeight;
+                        const viewportHeight = container.clientHeight;
+                        const scrollStep = viewportHeight / 2;
+
+                        for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
+                            container.scrollTo(0, scrollPos);
+                            await delay(500);
+                        }
+
+                        // One final scroll to bottom to ensure everything is loaded
+                        container.scrollTo(0, scrollHeight);
+                        await delay(1000);
+
+                        // Scroll back to top for PDF creation
+                        container.scrollTo(0, 0);
                        await delay(500);
+
+                        return true;
                    }
+                    return scrollThroughDocument();
+                """)
 
+                # Wait after scrolling
+                await page.wait_for_timeout(2000)
+
+                # Use the improved PDF creation script that captures all pages
+                pdf_base64 = await page.evaluate("""
+                    async function createPDF() {
+                        try {
+                            // Make sure jsPDF is loaded
+                            if (typeof window.jspdf === 'undefined') {
+                                console.error('jsPDF not loaded');
+                                return null;
+                            }
+
+                            const { jsPDF } = window.jspdf;
+                            const pdf = new jsPDF();
+
+                            // Get all page elements
+                            const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                            console.log('Found pages:', pages.length);
+
+                            if (pages.length === 0) {
+                                // Alternative: try to find images directly
+                                const images = Array.from(document.querySelectorAll('img')).filter(img =>
+                                    img.src.startsWith('blob:') && img.width > 100 && img.height > 100
+                                );
+
+                                console.log('Found images:', images.length);
+
+                                if (images.length === 0) {
+                                    return null;
+                                }
+
+                                // Process each image
+                                for (let i = 0; i < images.length; i++) {
+                                    const img = images[i];
+
+                                    if (i > 0) {
+                                        pdf.addPage();
+                                    }
+
+                                    // Create canvas and draw image
+                                    const canvas = document.createElement('canvas');
+                                    canvas.width = img.width;
+                                    canvas.height = img.height;
+                                    const ctx = canvas.getContext('2d');
+                                    ctx.drawImage(img, 0, 0, img.width, img.height);
+
+                                    // Add to PDF
+                                    const imgData = canvas.toDataURL('image/jpeg', 0.95);
+
+                                    // Calculate dimensions
+                                    const pageWidth = pdf.internal.pageSize.getWidth();
+                                    const pageHeight = pdf.internal.pageSize.getHeight();
+                                    const imgRatio = img.height / img.width;
+
+                                    let imgWidth = pageWidth - 10;
+                                    let imgHeight = imgWidth * imgRatio;
+
+                                    if (imgHeight > pageHeight - 10) {
+                                        imgHeight = pageHeight - 10;
+                                        imgWidth = imgHeight / imgRatio;
+                                    }
+
+                                    // Center on page
+                                    const x = (pageWidth - imgWidth) / 2;
+                                    const y = (pageHeight - imgHeight) / 2;
+
+                                    pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                                }
+                            } else {
+                                // Process each page
+                                const container = document.querySelector('.drive-viewer-paginated-scrollable');
+                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+
+                                for (let i = 0; i < pages.length; i++) {
+                                    // Add a new page for each page after the first
+                                    if (i > 0) {
+                                        pdf.addPage();
+                                    }
+
+                                    // Scroll to the page and wait for it to render
+                                    pages[i].scrollIntoView();
+                                    await delay(300);
+
+                                    // Find the image element inside the page
+                                    const pageImages = pages[i].querySelectorAll('img');
+                                    let targetImage = null;
+
+                                    for (const img of pageImages) {
+                                        if (img.src.startsWith('blob:') && img.width > 50 && img.height > 50) {
+                                            targetImage = img;
+                                            break;
+                                        }
+                                    }
+
+                                    if (!targetImage) {
+                                        // If no image found, try taking a screenshot of the page instead
+                                        const pageCanvas = document.createElement('canvas');
+                                        pageCanvas.width = pages[i].clientWidth;
+                                        pageCanvas.height = pages[i].clientHeight;
+                                        const ctx = pageCanvas.getContext('2d');
+
+                                        // Draw the page background
+                                        ctx.fillStyle = 'white';
+                                        ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
+
+                                        // Use html2canvas approach
+                                        try {
+                                            await delay(100);
+                                            // Just draw what we can see
+                                            const allElements = pages[i].querySelectorAll('*');
+                                            for (const el of allElements) {
+                                                if (el.tagName === 'IMG' && el.complete && el.src) {
+                                                    const rect = el.getBoundingClientRect();
+                                                    try {
+                                                        ctx.drawImage(el, rect.left, rect.top, rect.width, rect.height);
+                                                    } catch (e) {
+                                                        console.error('Draw error:', e);
+                                                    }
+                                                }
+                                            }
+                                        } catch (e) {
+                                            console.error('Canvas error:', e);
+                                        }
+
+                                        // Add the canvas to the PDF
+                                        const imgData = pageCanvas.toDataURL('image/jpeg', 0.95);
+
+                                        // Calculate dimensions
+                                        const pageWidth = pdf.internal.pageSize.getWidth();
+                                        const pageHeight = pdf.internal.pageSize.getHeight();
+                                        const imgRatio = pageCanvas.height / pageCanvas.width;
+
+                                        let imgWidth = pageWidth - 10;
+                                        let imgHeight = imgWidth * imgRatio;
+
+                                        if (imgHeight > pageHeight - 10) {
+                                            imgHeight = pageHeight - 10;
+                                            imgWidth = imgHeight / imgRatio;
+                                        }
+
+                                        // Center on page
+                                        const x = (pageWidth - imgWidth) / 2;
+                                        const y = (pageHeight - imgHeight) / 2;
+
+                                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                                    } else {
+                                        // Use the found image
+                                        const canvas = document.createElement('canvas');
+                                        canvas.width = targetImage.naturalWidth || targetImage.width;
+                                        canvas.height = targetImage.naturalHeight || targetImage.height;
+                                        const ctx = canvas.getContext('2d');
+
+                                        // Draw image to canvas
+                                        try {
+                                            ctx.drawImage(targetImage, 0, 0, canvas.width, canvas.height);
+                                        } catch (e) {
+                                            console.error('Error drawing image:', e);
+                                            continue;
+                                        }
+
+                                        // Add to PDF
+                                        const imgData = canvas.toDataURL('image/jpeg', 0.95);
+
+                                        // Calculate dimensions
+                                        const pageWidth = pdf.internal.pageSize.getWidth();
+                                        const pageHeight = pdf.internal.pageSize.getHeight();
+                                        const imgRatio = canvas.height / canvas.width;
+
+                                        let imgWidth = pageWidth - 10;
+                                        let imgHeight = imgWidth * imgRatio;
+
+                                        if (imgHeight > pageHeight - 10) {
+                                            imgHeight = pageHeight - 10;
+                                            imgWidth = imgHeight / imgRatio;
+                                        }
+
+                                        // Center on page
+                                        const x = (pageWidth - imgWidth) / 2;
+                                        const y = (pageHeight - imgHeight) / 2;
+
+                                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                                    }
+                                }
+                            }
+
+                            // Return as base64
+                            return pdf.output('datauristring');
+                        } catch (e) {
+                            console.error('PDF creation error:', e);
+                            return null;
+                        }
                    }
+                    return createPDF();
+                """)
+
+                if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
+                    # If script method failed, try screenshot approach
+                    logger.warning("PDF creation script failed, trying fallback method")
+                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
+
+                # Save the PDF from base64
+                try:
+                    base64_data = pdf_base64.replace('data:application/pdf;base64,', '')
+                    pdf_bytes = base64.b64decode(base64_data)
 
+                    with open(save_path, 'wb') as f:
+                        f.write(pdf_bytes)
 
+                    # Verify file is not empty
+                    if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
+                        logger.info(f"Successfully saved PDF to {save_path}")
+                        return True
+                    else:
+                        logger.warning(f"Generated PDF is too small, using fallback method")
+                        return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
+                except Exception as e:
+                    logger.error(f"Error saving PDF: {e}")
+                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
+
+        except Exception as e:
+            logger.error(f"Error in view-only PDF download: {e}")
+            # Try fallback method
+            return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
+
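The Python half of the jsPDF hand-off, in isolation: page.evaluate() resolves to a data:application/pdf;base64,... string, which is checked and decoded before writing. A sketch (pdf_base64 stands in for the evaluate result):

```python
import base64

def save_datauri_pdf(pdf_base64: str, save_path: str) -> bool:
    # Same prefix check and decode as the method above, as a plain function.
    prefix = 'data:application/pdf;base64,'
    if not pdf_base64 or not pdf_base64.startswith(prefix):
        return False
    with open(save_path, 'wb') as f:
        f.write(base64.b64decode(pdf_base64[len(prefix):]))
    return True
```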
+    async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
+        """Download any view-only file by taking screenshots"""
+        try:
+            async with self.context.new_page() as page:
+                # Set high-resolution viewport
+                await page.set_viewport_size({"width": 1600, "height": 1200})
+
+                # Navigate to the file
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
+
+                # Make sure the file is loaded
+                await page.wait_for_load_state('networkidle')
+                await page.wait_for_timeout(3000)  # Extra time for rendering
+
+                # Create directory for screenshots if multiple pages
+                base_dir = os.path.dirname(save_path)
+                base_name = os.path.splitext(os.path.basename(save_path))[0]
+                screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
+                os.makedirs(screenshots_dir, exist_ok=True)
+
+                # Check if it's a multi-page document
+                is_multi_page = await page.evaluate("""
+                    () => {
+                        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                        return pages.length > 1;
                    }
+                """)
+
+                if is_multi_page and file_type == 'pdf':
+                    # For multi-page PDFs, take screenshots of each page
+                    page_count = await page.evaluate("""
+                        async () => {
+                            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                            const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                            const container = document.querySelector('.drive-viewer-paginated-scrollable');
+
+                            if (!container || pages.length === 0) return 0;
+
+                            // Scroll through to make sure all pages are loaded
+                            const scrollHeight = container.scrollHeight;
+                            const viewportHeight = container.clientHeight;
+                            const scrollStep = viewportHeight;
+
+                            for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
+                                container.scrollTo(0, scrollPos);
+                                await delay(300);
+                            }
+
+                            // Scroll back to top
+                            container.scrollTo(0, 0);
+                            await delay(300);
+
+                            return pages.length;
                        }
+                    """)
+
+                    logger.info(f"Found {page_count} pages in document")
+
+                    # Take screenshots of each page
+                    screenshots = []
+                    for i in range(page_count):
+                        # Scroll to page
+                        await page.evaluate(f"""
+                            async () => {{
+                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                                if (pages.length <= {i}) return false;
+
+                                pages[{i}].scrollIntoView();
+                                await delay(500);
+                                return true;
+                            }}
+                        """)
 
+                        # Take screenshot
+                        screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
+                        await page.screenshot(path=screenshot_path, clip={
+                            'x': 0,
+                            'y': 0,
+                            'width': 1600,
+                            'height': 1200
+                        })
+                        screenshots.append(screenshot_path)
+
+                    # Combine screenshots into PDF
+                    from PIL import Image
+                    from reportlab.pdfgen import canvas
+
+                    c = canvas.Canvas(save_path)
+                    for screenshot in screenshots:
+                        img = Image.open(screenshot)
+                        width, height = img.size
 
+                        # Add page to PDF
+                        c.setPageSize((width, height))
+                        c.drawImage(screenshot, 0, 0, width, height)
+                        c.showPage()
+
+                    c.save()
+
+                    # Clean up screenshots
+                    for screenshot in screenshots:
+                        os.remove(screenshot)
+                    os.rmdir(screenshots_dir)
+
+                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+                else:
+                    # For single-page or non-PDF files, just take one screenshot
+                    screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
+                    await page.screenshot(path=screenshot_path, fullPage=True)
+
+                    # Convert to requested format if needed
+                    if file_type == 'pdf':
+                        from PIL import Image
+                        from reportlab.pdfgen import canvas
 
+                        # Create PDF from screenshot
+                        img = Image.open(screenshot_path)
+                        width, height = img.size
 
+                        c = canvas.Canvas(save_path, pagesize=(width, height))
+                        c.drawImage(screenshot_path, 0, 0, width, height)
+                        c.save()
+                    else:
+                        # Just copy the screenshot to the destination with proper extension
+                        shutil.copy(screenshot_path, save_path)
 
+                    # Clean up
+                    os.remove(screenshot_path)
+                    os.rmdir(screenshots_dir)
+
+                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+
+        except Exception as e:
+            logger.error(f"Error taking screenshots: {e}")
+            return False
+
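One design note on the cleanup above: os.rmdir() only succeeds because every screenshot is os.remove()d first. Since this commit already imports shutil, the two steps could collapse into a single call that also tolerates leftovers (an alternative, not what the commit does):

```python
import shutil

shutil.rmtree(screenshots_dir, ignore_errors=True)  # replaces the remove/rmdir pair
```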
+    async def export_google_doc(self, file_id, file_type, save_path):
+        """Export Google Docs/Sheets/Slides to downloadable formats"""
+        try:
+            # Map file types to export formats
+            export_formats = {
+                'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # docx
+                'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',  # xlsx
+                'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',  # pptx
+                'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                'pdf': 'application/pdf',
+            }
 
+            export_format = export_formats.get(file_type, 'application/pdf')
+            export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
 
+            if 'sheet' in file_type or 'xlsx' in file_type:
+                export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
+            elif 'ppt' in file_type or 'presentation' in file_type:
+                export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
+            elif file_type == 'pdf':
+                export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
 
+            async with self.context.new_page() as page:
+                # Get cookies from the main view page first
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
+
+                # Now try the export
+                response = await page.goto(export_url, wait_until='networkidle')
+
+                if response.status == 200:
+                    content = await response.body()
+                    with open(save_path, 'wb') as f:
+                        f.write(content)
+                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+                else:
+                    logger.warning(f"Export failed with status {response.status}")
+                    return False
+
         except Exception as e:
+            logger.error(f"Error exporting Google Doc: {e}")
             return False
 
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
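For quick reference, the export endpoints built in export_google_doc, gathered in one place (FILE_ID is a placeholder; note that Docs and Sheets take a ?format= query while Slides uses a path segment):

```python
EXPORT_URLS = {
    'docx': "https://docs.google.com/document/d/FILE_ID/export?format=docx",
    'xlsx': "https://docs.google.com/spreadsheets/d/FILE_ID/export?format=xlsx",
    'pptx': "https://docs.google.com/presentation/d/FILE_ID/export/pptx",
    'pdf':  "https://docs.google.com/document/d/FILE_ID/export?format=pdf",
}
```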