euler314 committed
Commit b9e60db · verified · 1 Parent(s): 0f88c1d

Update app.py

Files changed (1)
  1. app.py +644 -212
app.py CHANGED
@@ -38,6 +38,10 @@ from reportlab.pdfgen import canvas
38
  from sklearn.cluster import KMeans
39
  import numpy as np
40
  import base64
41
  # -------------------- Logging Setup --------------------
42
  logging.basicConfig(
43
  filename='advanced_download_log.txt',
@@ -348,23 +352,29 @@ class DownloadManager:
348
  file_id = match.group(1)
349
  break
350
  if file_id:
351
- direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
352
- filename = file_id
353
  try:
354
- response = await self.page.request.head(direct_url, timeout=15000)
355
- cd = response.headers.get("Content-Disposition", "")
356
- if cd:
357
- mt = re.search(r'filename\*?="?([^";]+)', cd)
358
- if mt:
359
- filename = mt.group(1).strip('"').strip()
360
  found_files.append({
361
- 'url': direct_url,
362
  'filename': filename,
363
- 'size': await self.get_file_size(direct_url),
364
- 'metadata': {}
365
  })
366
  except Exception as e:
367
  logger.error(f"Error processing Google Drive link: {e}")
368
 
369
  seen_urls = set()
370
  unique_files = []
@@ -388,6 +398,7 @@ class DownloadManager:
388
  counter += 1
389
  os.makedirs(save_dir, exist_ok=True)
390
  try:
 
391
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
392
  # Use enhanced Google Drive downloader
393
  success = await self.download_from_google_drive(file_url, path)
@@ -435,251 +446,672 @@ class DownloadManager:
435
  logger.error(f"Could not extract file ID from URL: {url}")
436
  return False
437
 
438
- # Approach 1: Try with gdown first (when it works)
439
- try:
440
- import gdown
441
- output = gdown.download(url, save_path, quiet=False, fuzzy=True)
442
- if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
443
- logger.info(f"Successfully downloaded with gdown: {url}")
444
- return True
445
- except Exception as e:
446
- logger.warning(f"gdown download failed: {e}")
447
 
448
- # Approach 2: Use Playwright session with cookies
449
- try:
450
- async with self.context.new_page() as page:
451
- # Visit the file viewing page to get cookies
452
- view_url = f"https://drive.google.com/file/d/{file_id}/view"
453
- await page.goto(view_url, wait_until='networkidle', timeout=60000)
454
-
455
- # Check for view-only permissions
456
- if await page.query_selector('text="the owner has not granted you permission to download this file"'):
457
- logger.warning("File has view-only permissions, attempting workaround")
458
-
459
- # Check if it's a PDF (we can use the JS method)
460
- is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
461
- if is_pdf:
462
- # Try JavaScript PDF capture approach for PDFs
463
- success = await self.download_viewonly_pdf_with_js(page, save_path)
464
- if success:
465
- return True
466
-
467
- # Try direct download attempt for view-only files
468
- cookies = await page.context.cookies()
469
- cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
470
-
471
- # Try download URL with custom headers and cookies
472
- download_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
473
- await page.goto(download_url, wait_until='networkidle', timeout=60000)
474
-
475
- headers = {
476
- 'User-Agent': get_random_user_agent(),
477
- 'Cookie': cookie_str,
478
- 'Accept': '*/*',
479
- }
480
-
481
- response = await page.request.get(download_url, headers=headers)
482
- if response.status == 200:
483
- content = await response.body()
484
- with open(save_path, 'wb') as f:
485
- f.write(content)
486
- return True
487
-
488
- # Standard download flow for files with download permission
489
- download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
490
- await page.goto(download_url, wait_until='networkidle', timeout=60000)
491
-
492
- # Handle large files with confirmation
493
- confirm_form = await page.query_selector('form#download-form')
494
- if confirm_form:
495
- await confirm_form.evaluate('form => form.submit()')
496
- await page.wait_for_load_state('networkidle')
497
-
498
- # Get cookies after confirmation
499
- cookies = await page.context.cookies()
500
- cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
501
 
502
- # Get final download URL with confirmation token
503
- download_url = f"https://drive.google.com/uc?export=download&id={file_id}&confirm=t"
504
 
505
- response = await page.request.get(download_url, headers={'Cookie': cookie_str})
506
- if response.status == 200:
507
- content = await response.body()
508
- with open(save_path, 'wb') as f:
509
- f.write(content)
510
  return True
511
  except Exception as e:
512
- logger.warning(f"Playwright download approach failed: {e}")
513
 
514
- # Approach 3: Try with requests and session cookies
515
  try:
516
- import requests
517
-
518
  session = requests.Session()
519
  session.headers.update({'User-Agent': get_random_user_agent()})
520
 
521
- # Get the initial page to obtain cookies
522
  url = f"https://drive.google.com/uc?id={file_id}&export=download"
523
  response = session.get(url, stream=True, timeout=30)
524
 
525
- # Check for the download confirmation
526
  confirmation_token = None
527
  for k, v in response.cookies.items():
528
  if k.startswith('download_warning'):
529
  confirmation_token = v
530
  break
531
 
532
- # Use the confirmation token if found
533
  if confirmation_token:
534
- url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm={confirmation_token}"
535
-
536
- # Download the file
537
- response = session.get(url, stream=True, timeout=60)
538
- with open(save_path, 'wb') as f:
539
- for chunk in response.iter_content(chunk_size=1024*1024):
540
- if chunk:
541
- f.write(chunk)
542
 
543
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
544
- return True
545
  except Exception as e:
546
  logger.warning(f"Requests session download failed: {e}")
547
 
548
- # All approaches failed
549
- logger.error(f"All download attempts failed for: {url}")
550
  return False
551
 
552
- async def download_viewonly_pdf_with_js(self, page, save_path):
553
- """Use JavaScript approach to download view-only PDFs from Google Drive"""
554
  try:
555
- logger.info("Attempting to download view-only PDF using JavaScript method")
556
-
557
- # Scroll to ensure all pages are loaded
558
- await page.evaluate("""
559
- async function scrollToBottom() {
560
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
561
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
562
- if (!container) return;
563
-
564
- const scrollHeight = container.scrollHeight;
565
- const viewportHeight = container.clientHeight;
566
- const scrollStep = viewportHeight / 2;
567
-
568
- for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
569
- container.scrollTo(0, scrollPos);
570
  await delay(500);
571
  }
572
-
573
- // Final scroll to ensure we reached the bottom
574
- container.scrollTo(0, scrollHeight);
575
- await delay(1000);
576
- }
577
 
578
- return scrollToBottom();
579
- """)
580
-
581
- # Wait for a moment to ensure all images are loaded
582
- await page.wait_for_timeout(3000)
583
-
584
- # Inject the jsPDF library
585
- await page.evaluate("""
586
- return new Promise((resolve, reject) => {
587
- const script = document.createElement('script');
588
- script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
589
- script.onload = () => resolve(true);
590
- script.onerror = () => reject(new Error('Failed to load jsPDF'));
591
- document.head.appendChild(script);
592
- });
593
- """)
594
-
595
- # Wait for the library to load
596
- await page.wait_for_timeout(1000)
597
-
598
- # Execute the PDF creation script
599
- pdf_data = await page.evaluate("""
600
- return new Promise(async (resolve) => {
601
- // Make sure jsPDF is loaded
602
- if (typeof window.jspdf === 'undefined') {
603
- window.jspdf = window.jspdf || {};
604
  }
605
 
606
- // Use the jsPDF library
607
- const { jsPDF } = window.jspdf;
608
- const pdf = new jsPDF();
609
-
610
- const images = Array.from(document.querySelectorAll('img')).filter(img =>
611
- img.src.startsWith('blob:') && img.width > 100 && img.height > 100
612
- );
613
 
614
- if (images.length === 0) {
615
- resolve(null);
616
- return;
617
  }
618
-
619
- for (let i = 0; i < images.length; i++) {
620
- const img = images[i];
621
-
622
- // Create canvas and draw image
623
- const canvas = document.createElement('canvas');
624
- canvas.width = img.width;
625
- canvas.height = img.height;
626
- const ctx = canvas.getContext('2d');
627
- ctx.drawImage(img, 0, 0, img.width, img.height);
628
-
629
- // Add image to PDF
630
- const imgData = canvas.toDataURL('image/jpeg', 1.0);
631
-
632
- // Add a new page for each image except the first one
633
- if (i > 0) {
634
- pdf.addPage();
635
  }
636
 
637
- // Calculate dimensions to fit page
638
- const pageWidth = pdf.internal.pageSize.getWidth();
639
- const pageHeight = pdf.internal.pageSize.getHeight();
640
- const imgRatio = img.height / img.width;
641
-
642
- let imgWidth = pageWidth;
643
- let imgHeight = imgWidth * imgRatio;
644
 
645
- // If height exceeds page, scale down
646
- if (imgHeight > pageHeight) {
647
- imgHeight = pageHeight;
648
- imgWidth = imgHeight / imgRatio;
649
- }
650
 
651
- // Center image on page
652
- const x = (pageWidth - imgWidth) / 2;
653
- const y = (pageHeight - imgHeight) / 2;
654
 
655
- pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
656
- }
657
 
658
- // Get the PDF as base64
659
- const pdfBase64 = pdf.output('datauristring');
660
- resolve(pdfBase64);
661
- });
662
- """)
663
-
664
- if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
665
- logger.warning("Failed to generate PDF with JavaScript method")
666
- return False
667
 
668
- # Extract the base64 data and save to file
669
- base64_data = pdf_data.replace('data:application/pdf;base64,', '')
670
- pdf_bytes = base64.b64decode(base64_data)
671
 
672
- with open(save_path, 'wb') as f:
673
- f.write(pdf_bytes)
674
 
675
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
676
- logger.info("Successfully downloaded view-only PDF using JavaScript method")
677
- return True
678
- else:
679
- return False
680
-
681
  except Exception as e:
682
- logger.error(f"Error in JavaScript PDF download method: {e}")
683
  return False
684
 
685
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
 
38
  from sklearn.cluster import KMeans
39
  import numpy as np
40
  import base64
41
+ import shutil
42
+ from PIL import Image # Make sure to pip install Pillow
43
+ from reportlab.pdfgen import canvas
44
+
45
  # -------------------- Logging Setup --------------------
46
  logging.basicConfig(
47
  filename='advanced_download_log.txt',
 
352
  file_id = match.group(1)
353
  break
354
  if file_id:
355
+ # We'll detect file type during download, so just use the ID for filename initially
356
+ filename = f"gdrive_{file_id}"
357
  try:
358
+ # Get file info to determine type and size
359
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
360
+ if file_type:
361
+ filename = f"{filename}.{file_type}"
362
+
 
363
  found_files.append({
364
+ 'url': href, # Use original URL, as we'll process it specially
365
  'filename': filename,
366
+ 'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"),
367
+ 'metadata': {'view_only': is_view_only, 'file_type': file_type, 'file_id': file_id}
368
  })
369
  except Exception as e:
370
  logger.error(f"Error processing Google Drive link: {e}")
371
+ # Fallback if we can't get info
372
+ found_files.append({
373
+ 'url': href,
374
+ 'filename': filename,
375
+ 'size': "Unknown Size",
376
+ 'metadata': {'file_id': file_id}
377
+ })
378
 
379
  seen_urls = set()
380
  unique_files = []
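For reference, everything in the block above keys off the Drive file ID pulled out of the scraped link. A minimal standalone sketch of that extraction step; the patterns below are common Drive URL shapes and are not necessarily the exact regexes app.py uses:

    import re

    def extract_drive_file_id(url):
        # Illustrative only: match the ID segment in the usual Drive URL shapes.
        patterns = [
            r"/file/d/([a-zA-Z0-9_-]+)",   # https://drive.google.com/file/d/<id>/view
            r"[?&]id=([a-zA-Z0-9_-]+)",    # https://drive.google.com/uc?id=<id>&export=download
            r"/folders/([a-zA-Z0-9_-]+)",  # shared folder links
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    # extract_drive_file_id("https://drive.google.com/file/d/abc123/view") -> "abc123"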
 
398
  counter += 1
399
  os.makedirs(save_dir, exist_ok=True)
400
  try:
401
+ # Special handling for Google Drive files
402
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
403
  # Use enhanced Google Drive downloader
404
  success = await self.download_from_google_drive(file_url, path)
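As a side note, the routing above boils down to a host substring check; a trivial standalone version (the function name is illustrative, not from app.py):

    def is_google_drive_url(url):
        # Same substring test as the branch above; no URL parsing needed here.
        return "drive.google.com" in url or "docs.google.com" in url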
 
446
  logger.error(f"Could not extract file ID from URL: {url}")
447
  return False
448
 
449
+ # Determine file type first (important for handling different file types)
450
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
451
+ logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
452
 
453
+ base, ext = os.path.splitext(save_path)
454
+ if not ext and file_type:
455
+ # Add the correct extension if missing
456
+ save_path = f"{base}.{file_type}"
457
+
458
+ # For view-only files, use specialized approaches
459
+ if is_view_only:
460
+ # Approach 1: For PDFs, use the JS method
461
+ if file_type == 'pdf':
462
+ success = await self.download_viewonly_pdf_with_js(file_id, save_path)
463
+ if success:
464
+ return True
465
 
466
+ # Approach 2: For Google Docs, Sheets, etc., use export API
467
+ if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
468
+ success = await self.export_google_doc(file_id, file_type, save_path)
469
+ if success:
470
+ return True
471
 
472
+ # Approach 3: Try the direct screenshot method for any view-only file
473
+ success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
474
+ if success:
475
+ return True
476
+
477
+ # Try standard approaches for non-view-only files
478
+ try:
479
+ # Try with gdown first
480
+ import gdown
481
+ output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
482
+ if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
483
+ with open(save_path, 'rb') as f:
484
+ content = f.read(100) # Read first 100 bytes
485
+ if b'<!DOCTYPE html>' not in content: # Check not HTML error page
486
+ logger.info(f"Successfully downloaded with gdown: {url}")
487
  return True
488
  except Exception as e:
489
+ logger.warning(f"gdown download failed: {e}")
490
 
491
+ # Try with requests and session cookies
492
  try:
493
  session = requests.Session()
494
  session.headers.update({'User-Agent': get_random_user_agent()})
495
 
496
+ # Visit the page first to get cookies
497
+ session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
498
+
499
+ # Try download
500
  url = f"https://drive.google.com/uc?id={file_id}&export=download"
501
  response = session.get(url, stream=True, timeout=30)
502
 
503
+ # Check for confirmation token
504
  confirmation_token = None
505
  for k, v in response.cookies.items():
506
  if k.startswith('download_warning'):
507
  confirmation_token = v
508
  break
509
 
510
+ # Use confirmation token if found
511
  if confirmation_token:
512
+ url = f"{url}&confirm={confirmation_token}"
513
+ response = session.get(url, stream=True, timeout=60)
514
 
515
+ # Check if we're getting HTML instead of the file
516
+ content_type = response.headers.get('Content-Type', '')
517
+ if 'text/html' in content_type:
518
+ logger.warning("Received HTML instead of file - likely download restriction")
519
+ else:
520
+ with open(save_path, 'wb') as f:
521
+ for chunk in response.iter_content(chunk_size=1024*1024):
522
+ if chunk:
523
+ f.write(chunk)
524
+
525
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
526
+ with open(save_path, 'rb') as f:
527
+ content = f.read(100)
528
+ if b'<!DOCTYPE html>' not in content:
529
+ logger.info("Successfully downloaded with requests session")
530
+ return True
531
  except Exception as e:
532
  logger.warning(f"Requests session download failed: {e}")
533
 
534
+ # If all methods failed for view-only file, try one last approach
535
+ if is_view_only:
536
+ try:
537
+ # Try a direct headless browser download
538
+ async with self.context.new_page() as page:
539
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
540
+
541
+ # Try to capture the content directly from viewer
542
+ file_content = await page.evaluate("""
543
+ () => {
544
+ // Try to find the actual viewer content
545
+ const viewerContent = document.querySelector('.drive-viewer-paginated-content');
546
+ if (viewerContent) {
547
+ return viewerContent.innerHTML;
548
+ }
549
+ return document.documentElement.innerHTML;
550
+ }
551
+ """)
552
+
553
+ if file_content:
554
+ # Save as HTML and then we can convert it if needed
555
+ html_path = f"{base}.html"
556
+ with open(html_path, 'w', encoding='utf-8') as f:
557
+ f.write(f"""
558
+ <!DOCTYPE html>
559
+ <html>
560
+ <head><title>Google Drive Extracted Content</title></head>
561
+ <body>
562
+ {file_content}
563
+ </body>
564
+ </html>
565
+ """)
566
+
567
+ # If requested a PDF, convert HTML to PDF
568
+ if file_type == 'pdf' or ext.lower() == '.pdf':
569
+ try:
570
+ import pdfkit
571
+ pdfkit.from_file(html_path, save_path)
572
+ os.remove(html_path) # Clean up HTML file
573
+ return True
574
+ except Exception as pdf_err:
575
+ logger.warning(f"Error converting HTML to PDF: {pdf_err}")
576
+ # Keep the HTML file as fallback
577
+ shutil.copy(html_path, save_path)
578
+ return True
579
+ else:
580
+ # Just use the HTML file
581
+ shutil.copy(html_path, save_path)
582
+ return True
583
+ except Exception as e:
584
+ logger.warning(f"Final direct browser capture failed: {e}")
585
+
586
+ # All methods failed
587
+ logger.error(f"All download approaches failed for Google Drive file: {file_id}")
588
  return False
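The method above is essentially a fallback chain: gdown, then a requests session, then a headless-browser capture. A generic sketch of that pattern, with hypothetical strategy names in the usage comment:

    import logging

    async def try_strategies_in_order(strategies, *args):
        # Run each downloader until one reports success; log and move on when
        # an individual strategy raises.
        for strategy in strategies:
            try:
                if await strategy(*args):
                    return True
            except Exception as exc:
                logging.getLogger(__name__).warning(
                    "%s failed: %s", getattr(strategy, "__name__", strategy), exc)
        return False

    # Hypothetical wiring, mirroring the order used above:
    # ok = await try_strategies_in_order(
    #     [download_with_gdown, download_with_requests, download_with_browser],
    #     file_id, save_path,
    # )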
589
 
590
+ async def get_google_drive_file_info(self, file_id):
591
+ """Get file type and view-only status from Google Drive"""
592
+ file_type = None
593
+ is_view_only = False
594
+
595
  try:
596
+ async with self.context.new_page() as page:
597
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
598
+
599
+ # Check if view-only
600
+ view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
601
+ is_view_only = view_only_text is not None
602
+
603
+ # Check for Google Docs viewer
604
+ gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
605
+ gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
606
+ gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
607
+
608
+ if gdocs_viewer:
609
+ file_type = 'docx'
610
+ elif gsheets_viewer:
611
+ file_type = 'xlsx'
612
+ elif gslides_viewer:
613
+ file_type = 'pptx'
614
+ else:
615
+ # Check for PDF viewer
616
+ pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
617
+ if pdf_viewer:
618
+ file_type = 'pdf'
619
+ else:
620
+ # Check for image viewer
621
+ img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
622
+ if img_viewer:
623
+ # Get image type from src
624
+ img_src = await img_viewer.get_attribute('src')
625
+ if 'jpg' in img_src or 'jpeg' in img_src:
626
+ file_type = 'jpg'
627
+ elif 'png' in img_src:
628
+ file_type = 'png'
629
+ else:
630
+ file_type = 'jpg' # Default to jpg
631
+ else:
632
+ # Generic file type fallback
633
+ file_type = 'pdf' # Default to PDF
634
+
635
+ # If still no type, check filename
636
+ if not file_type:
637
+ title_element = await page.query_selector('div[role="heading"]')
638
+ if title_element:
639
+ title = await title_element.text_content()
640
+ if title:
641
+ ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
642
+ if ext_match:
643
+ file_type = ext_match.group(1).lower()
644
+
645
+ except Exception as e:
646
+ logger.error(f"Error getting Google Drive file info: {e}")
647
+ file_type = 'pdf' # Default to PDF if we can't determine
648
+
649
+ return file_type, is_view_only
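One way to read the tuple returned here is as a dispatch key for the downloaders added in this commit. The sketch below shows that idea only; it is not how app.py itself wires the calls:

    async def pick_download_strategy(manager, file_id, file_type, is_view_only, save_path):
        # Illustrative routing on (file_type, is_view_only); "manager" stands in
        # for the DownloadManager instance.
        if is_view_only and file_type == 'pdf':
            return await manager.download_viewonly_pdf_with_js(file_id, save_path)
        if is_view_only and file_type in ('docx', 'xlsx', 'pptx'):
            return await manager.export_google_doc(file_id, file_type, save_path)
        if is_view_only:
            return await manager.download_viewonly_with_screenshots(file_id, save_path, file_type)
        return await manager.download_from_google_drive(
            f"https://drive.google.com/uc?id={file_id}&export=download", save_path)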
650
+
651
+ async def download_viewonly_pdf_with_js(self, file_id, save_path):
652
+ """Download view-only PDF using JavaScript approach - improved version"""
653
+ try:
654
+ async with self.context.new_page() as page:
655
+ # Set viewport size to ensure we capture full pages
656
+ await page.set_viewport_size({"width": 1200, "height": 1600})
657
+
658
+ # Visit the file
659
+ view_url = f"https://drive.google.com/file/d/{file_id}/view"
660
+ await page.goto(view_url, wait_until='networkidle', timeout=60000)
661
+
662
+ # Wait for rendering
663
+ await page.wait_for_timeout(2000)
664
+
665
+ # Inject required libraries - use CDN for jsPDF
666
+ await page.evaluate("""
667
+ async function injectLibraries() {
668
+ // Add jsPDF
669
+ return new Promise((resolve) => {
670
+ const jspdfScript = document.createElement('script');
671
+ jspdfScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
672
+ jspdfScript.onload = () => resolve(true);
673
+ document.head.appendChild(jspdfScript);
674
+ });
675
+ }
676
+ return injectLibraries();
677
+ """)
678
+
679
+ # Wait for libraries to load
680
+ await page.wait_for_timeout(2000)
681
+
682
+ # Scroll through document to load all pages
683
+ await page.evaluate("""
684
+ async function scrollThroughDocument() {
685
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
686
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
687
+ if (!container) return false;
688
+
689
+ const scrollHeight = container.scrollHeight;
690
+ const viewportHeight = container.clientHeight;
691
+ const scrollStep = viewportHeight / 2;
692
+
693
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
694
+ container.scrollTo(0, scrollPos);
695
+ await delay(500);
696
+ }
697
+
698
+ // One final scroll to bottom to ensure everything is loaded
699
+ container.scrollTo(0, scrollHeight);
700
+ await delay(1000);
701
+
702
+ // Scroll back to top for PDF creation
703
+ container.scrollTo(0, 0);
704
  await delay(500);
705
+
706
+ return true;
707
  }
708
+ return scrollThroughDocument();
709
+ """)
710
 
711
+ # Wait after scrolling
712
+ await page.wait_for_timeout(2000)
713
+
714
+ # Use the improved PDF creation script that captures all pages
715
+ pdf_base64 = await page.evaluate("""
716
+ async function createPDF() {
717
+ try {
718
+ // Make sure jsPDF is loaded
719
+ if (typeof window.jspdf === 'undefined') {
720
+ console.error('jsPDF not loaded');
721
+ return null;
722
+ }
723
+
724
+ const { jsPDF } = window.jspdf;
725
+ const pdf = new jsPDF();
726
+
727
+ // Get all page elements
728
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
729
+ console.log('Found pages:', pages.length);
730
+
731
+ if (pages.length === 0) {
732
+ // Alternative: try to find images directly
733
+ const images = Array.from(document.querySelectorAll('img')).filter(img =>
734
+ img.src.startsWith('blob:') && img.width > 100 && img.height > 100
735
+ );
736
+
737
+ console.log('Found images:', images.length);
738
+
739
+ if (images.length === 0) {
740
+ return null;
741
+ }
742
+
743
+ // Process each image
744
+ for (let i = 0; i < images.length; i++) {
745
+ const img = images[i];
746
+
747
+ if (i > 0) {
748
+ pdf.addPage();
749
+ }
750
+
751
+ // Create canvas and draw image
752
+ const canvas = document.createElement('canvas');
753
+ canvas.width = img.width;
754
+ canvas.height = img.height;
755
+ const ctx = canvas.getContext('2d');
756
+ ctx.drawImage(img, 0, 0, img.width, img.height);
757
+
758
+ // Add to PDF
759
+ const imgData = canvas.toDataURL('image/jpeg', 0.95);
760
+
761
+ // Calculate dimensions
762
+ const pageWidth = pdf.internal.pageSize.getWidth();
763
+ const pageHeight = pdf.internal.pageSize.getHeight();
764
+ const imgRatio = img.height / img.width;
765
+
766
+ let imgWidth = pageWidth - 10;
767
+ let imgHeight = imgWidth * imgRatio;
768
+
769
+ if (imgHeight > pageHeight - 10) {
770
+ imgHeight = pageHeight - 10;
771
+ imgWidth = imgHeight / imgRatio;
772
+ }
773
+
774
+ // Center on page
775
+ const x = (pageWidth - imgWidth) / 2;
776
+ const y = (pageHeight - imgHeight) / 2;
777
+
778
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
779
+ }
780
+ } else {
781
+ // Process each page
782
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
783
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
784
+
785
+ for (let i = 0; i < pages.length; i++) {
786
+ // Add a new page for each page after the first
787
+ if (i > 0) {
788
+ pdf.addPage();
789
+ }
790
+
791
+ // Scroll to the page and wait for it to render
792
+ pages[i].scrollIntoView();
793
+ await delay(300);
794
+
795
+ // Find the image element inside the page
796
+ const pageImages = pages[i].querySelectorAll('img');
797
+ let targetImage = null;
798
+
799
+ for (const img of pageImages) {
800
+ if (img.src.startsWith('blob:') && img.width > 50 && img.height > 50) {
801
+ targetImage = img;
802
+ break;
803
+ }
804
+ }
805
+
806
+ if (!targetImage) {
807
+ // If no image found, try taking a screenshot of the page instead
808
+ const pageCanvas = document.createElement('canvas');
809
+ pageCanvas.width = pages[i].clientWidth;
810
+ pageCanvas.height = pages[i].clientHeight;
811
+ const ctx = pageCanvas.getContext('2d');
812
+
813
+ // Draw the page background
814
+ ctx.fillStyle = 'white';
815
+ ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
816
+
817
+ // Use html2canvas approach
818
+ try {
819
+ await delay(100);
820
+ // Just draw what we can see
821
+ const allElements = pages[i].querySelectorAll('*');
822
+ for (const el of allElements) {
823
+ if (el.tagName === 'IMG' && el.complete && el.src) {
824
+ const rect = el.getBoundingClientRect();
825
+ try {
826
+ ctx.drawImage(el, rect.left, rect.top, rect.width, rect.height);
827
+ } catch (e) {
828
+ console.error('Draw error:', e);
829
+ }
830
+ }
831
+ }
832
+ } catch (e) {
833
+ console.error('Canvas error:', e);
834
+ }
835
+
836
+ // Add the canvas to the PDF
837
+ const imgData = pageCanvas.toDataURL('image/jpeg', 0.95);
838
+
839
+ // Calculate dimensions
840
+ const pageWidth = pdf.internal.pageSize.getWidth();
841
+ const pageHeight = pdf.internal.pageSize.getHeight();
842
+ const imgRatio = pageCanvas.height / pageCanvas.width;
843
+
844
+ let imgWidth = pageWidth - 10;
845
+ let imgHeight = imgWidth * imgRatio;
846
+
847
+ if (imgHeight > pageHeight - 10) {
848
+ imgHeight = pageHeight - 10;
849
+ imgWidth = imgHeight / imgRatio;
850
+ }
851
+
852
+ // Center on page
853
+ const x = (pageWidth - imgWidth) / 2;
854
+ const y = (pageHeight - imgHeight) / 2;
855
+
856
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
857
+ } else {
858
+ // Use the found image
859
+ const canvas = document.createElement('canvas');
860
+ canvas.width = targetImage.naturalWidth || targetImage.width;
861
+ canvas.height = targetImage.naturalHeight || targetImage.height;
862
+ const ctx = canvas.getContext('2d');
863
+
864
+ // Draw image to canvas
865
+ try {
866
+ ctx.drawImage(targetImage, 0, 0, canvas.width, canvas.height);
867
+ } catch (e) {
868
+ console.error('Error drawing image:', e);
869
+ continue;
870
+ }
871
+
872
+ // Add to PDF
873
+ const imgData = canvas.toDataURL('image/jpeg', 0.95);
874
+
875
+ // Calculate dimensions
876
+ const pageWidth = pdf.internal.pageSize.getWidth();
877
+ const pageHeight = pdf.internal.pageSize.getHeight();
878
+ const imgRatio = canvas.height / canvas.width;
879
+
880
+ let imgWidth = pageWidth - 10;
881
+ let imgHeight = imgWidth * imgRatio;
882
+
883
+ if (imgHeight > pageHeight - 10) {
884
+ imgHeight = pageHeight - 10;
885
+ imgWidth = imgHeight / imgRatio;
886
+ }
887
+
888
+ // Center on page
889
+ const x = (pageWidth - imgWidth) / 2;
890
+ const y = (pageHeight - imgHeight) / 2;
891
+
892
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
893
+ }
894
+ }
895
+ }
896
+
897
+ // Return as base64
898
+ return pdf.output('datauristring');
899
+ } catch (e) {
900
+ console.error('PDF creation error:', e);
901
+ return null;
902
+ }
903
  }
904
+ return createPDF();
905
+ """)
906
+
907
+ if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
908
+ # If script method failed, try screenshot approach
909
+ logger.warning("PDF creation script failed, trying fallback method")
910
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
911
+
912
+ # Save the PDF from base64
913
+ try:
914
+ base64_data = pdf_base64.replace('data:application/pdf;base64,', '')
915
+ pdf_bytes = base64.b64decode(base64_data)
916
 
917
+ with open(save_path, 'wb') as f:
918
+ f.write(pdf_bytes)
919
 
920
+ # Verify file is not empty
921
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
922
+ logger.info(f"Successfully saved PDF to {save_path}")
923
+ return True
924
+ else:
925
+ logger.warning(f"Generated PDF is too small, using fallback method")
926
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
927
+ except Exception as e:
928
+ logger.error(f"Error saving PDF: {e}")
929
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
930
+
931
+ except Exception as e:
932
+ logger.error(f"Error in view-only PDF download: {e}")
933
+ # Try fallback method
934
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
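The page.evaluate() call above hands back jsPDF's output('datauristring'), i.e. a data:application/pdf;base64,... string. The decode-and-save step, isolated into a small helper with an illustrative name:

    import base64

    def save_datauri_pdf(data_uri, save_path):
        prefix = 'data:application/pdf;base64,'
        if not data_uri or not data_uri.startswith(prefix):
            return False
        pdf_bytes = base64.b64decode(data_uri[len(prefix):])
        with open(save_path, 'wb') as f:
            f.write(pdf_bytes)
        # Same sanity threshold as above: a handful of bytes is not a real PDF.
        return len(pdf_bytes) > 1000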
935
+
936
+ async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
937
+ """Download any view-only file by taking screenshots"""
938
+ try:
939
+ async with self.context.new_page() as page:
940
+ # Set high-resolution viewport
941
+ await page.set_viewport_size({"width": 1600, "height": 1200})
942
+
943
+ # Navigate to the file
944
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
945
+
946
+ # Make sure the file is loaded
947
+ await page.wait_for_load_state('networkidle')
948
+ await page.wait_for_timeout(3000) # Extra time for rendering
949
+
950
+ # Create directory for screenshots if multiple pages
951
+ base_dir = os.path.dirname(save_path)
952
+ base_name = os.path.splitext(os.path.basename(save_path))[0]
953
+ screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
954
+ os.makedirs(screenshots_dir, exist_ok=True)
955
+
956
+ # Check if it's a multi-page document
957
+ is_multi_page = await page.evaluate("""
958
+ () => {
959
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
960
+ return pages.length > 1;
961
  }
962
+ """)
963
+
964
+ if is_multi_page and file_type == 'pdf':
965
+ # For multi-page PDFs, take screenshots of each page
966
+ page_count = await page.evaluate("""
967
+ async () => {
968
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
969
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
970
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
971
+
972
+ if (!container || pages.length === 0) return 0;
973
+
974
+ // Scroll through to make sure all pages are loaded
975
+ const scrollHeight = container.scrollHeight;
976
+ const viewportHeight = container.clientHeight;
977
+ const scrollStep = viewportHeight;
978
+
979
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
980
+ container.scrollTo(0, scrollPos);
981
+ await delay(300);
982
+ }
983
+
984
+ // Scroll back to top
985
+ container.scrollTo(0, 0);
986
+ await delay(300);
987
+
988
+ return pages.length;
989
  }
990
+ """)
991
+
992
+ logger.info(f"Found {page_count} pages in document")
993
+
994
+ # Take screenshots of each page
995
+ screenshots = []
996
+ for i in range(page_count):
997
+ # Scroll to page
998
+ await page.evaluate(f"""
999
+ async () => {{
1000
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
1001
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1002
+ if (pages.length <= {i}) return false;
1003
+
1004
+ pages[{i}].scrollIntoView();
1005
+ await delay(500);
1006
+ return true;
1007
+ }}
1008
+ """)
1009
 
1010
+ # Take screenshot
1011
+ screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
1012
+ await page.screenshot(path=screenshot_path, clip={
1013
+ 'x': 0,
1014
+ 'y': 0,
1015
+ 'width': 1600,
1016
+ 'height': 1200
1017
+ })
1018
+ screenshots.append(screenshot_path)
1019
+
1020
+ # Combine screenshots into PDF
1021
+ from PIL import Image
1022
+ from reportlab.pdfgen import canvas
1023
+
1024
+ c = canvas.Canvas(save_path)
1025
+ for screenshot in screenshots:
1026
+ img = Image.open(screenshot)
1027
+ width, height = img.size
1028
 
1029
+ # Add page to PDF
1030
+ c.setPageSize((width, height))
1031
+ c.drawImage(screenshot, 0, 0, width, height)
1032
+ c.showPage()
1033
+
1034
+ c.save()
1035
+
1036
+ # Clean up screenshots
1037
+ for screenshot in screenshots:
1038
+ os.remove(screenshot)
1039
+ os.rmdir(screenshots_dir)
1040
+
1041
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1042
+ else:
1043
+ # For single-page or non-PDF files, just take one screenshot
1044
+ screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
1045
+ await page.screenshot(path=screenshot_path, full_page=True)  # Playwright's Python API spells this full_page
1046
+
1047
+ # Convert to requested format if needed
1048
+ if file_type == 'pdf':
1049
+ from PIL import Image
1050
+ from reportlab.pdfgen import canvas
1051
 
1052
+ # Create PDF from screenshot
1053
+ img = Image.open(screenshot_path)
1054
+ width, height = img.size
1055
 
1056
+ c = canvas.Canvas(save_path, pagesize=(width, height))
1057
+ c.drawImage(screenshot_path, 0, 0, width, height)
1058
+ c.save()
1059
+ else:
1060
+ # Just copy the screenshot to the destination with proper extension
1061
+ shutil.copy(screenshot_path, save_path)
1062
 
1063
+ # Clean up
1064
+ os.remove(screenshot_path)
1065
+ os.rmdir(screenshots_dir)
1066
+
1067
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1068
+
1069
+ except Exception as e:
1070
+ logger.error(f"Error taking screenshots: {e}")
1071
+ return False
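The screenshot branch above stitches the per-page PNGs together with a reportlab canvas. If Pillow alone is acceptable, a shorter alternative is its native multi-page PDF writer; a sketch, assuming the screenshots already exist on disk:

    from PIL import Image

    def screenshots_to_pdf(png_paths, pdf_path):
        if not png_paths:
            raise ValueError("no screenshots to combine")
        # Pillow writes a multi-page PDF once every page is converted to RGB.
        pages = [Image.open(p).convert('RGB') for p in png_paths]
        pages[0].save(pdf_path, save_all=True, append_images=pages[1:])

    # screenshots_to_pdf(["page_1.png", "page_2.png"], "document.pdf")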
1072
+
1073
+ async def export_google_doc(self, file_id, file_type, save_path):
1074
+ """Export Google Docs/Sheets/Slides to downloadable formats"""
1075
+ try:
1076
+ # Map file types to export formats
1077
+ export_formats = {
1078
+ 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
1079
+ 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
1080
+ 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
1081
+ 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
1082
+ 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
1083
+ 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
1084
+ 'pdf': 'application/pdf',
1085
+ }
1086
 
1087
+ export_format = export_formats.get(file_type, 'application/pdf')
1088
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
 
1089
 
1090
+ if 'sheet' in file_type or 'xlsx' in file_type:
1091
+ export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
1092
+ elif 'ppt' in file_type or 'presentation' in file_type:
1093
+ export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
1094
+ elif file_type == 'pdf':
1095
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
1096
 
1097
+ async with self.context.new_page() as page:
1098
+ # Get cookies from the main view page first
1099
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
1100
+
1101
+ # Now try the export
1102
+ response = await page.goto(export_url, wait_until='networkidle')
1103
+
1104
+ if response.status == 200:
1105
+ content = await response.body()
1106
+ with open(save_path, 'wb') as f:
1107
+ f.write(content)
1108
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1109
+ else:
1110
+ logger.warning(f"Export failed with status {response.status}")
1111
+ return False
1112
+
1113
  except Exception as e:
1114
+ logger.error(f"Error exporting Google Doc: {e}")
1115
  return False
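The export URLs chosen above follow the per-editor /export routes. The same selection, collapsed into one lookup helper (the name is illustrative):

    def drive_export_url(file_id, file_type):
        if file_type in ('sheet', 'xlsx'):
            return f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
        if file_type in ('ppt', 'pptx', 'presentation'):
            return f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
        if file_type == 'pdf':
            return f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
        # Everything else falls through to the document export endpoint with the requested format.
        return f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"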
1116
 
1117
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):