euler314 committed (verified)
Commit 0f88c1d · 1 Parent(s): 573acd3

Update app.py

Files changed (1)
  1. app.py (+276, −7)
app.py CHANGED
@@ -37,7 +37,7 @@ from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 from sklearn.cluster import KMeans
 import numpy as np
-
+import base64
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',
@@ -388,12 +388,12 @@ class DownloadManager:
             counter += 1
         os.makedirs(save_dir, exist_ok=True)
         try:
-            if "drive.google.com" in file_url:
-                import gdown
-                output = gdown.download(file_url, path, quiet=False)
-                if output:
-                    return path
-                return None
+            if "drive.google.com" in file_url or "docs.google.com" in file_url:
+                # Use enhanced Google Drive downloader
+                success = await self.download_from_google_drive(file_url, path)
+                return path if success else None
+
+            # Original code for non-Google Drive downloads
             async with self.context.new_page() as page:
                 headers = {
                     'Accept': '*/*',
@@ -413,6 +413,275 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
 
+    async def download_from_google_drive(self, url, save_path):
+        """Enhanced method to download from Google Drive with multiple fallback approaches"""
+        # Extract the file ID from different URL formats
+        file_id = None
+        url_patterns = [
+            r'drive\.google\.com/file/d/([^/]+)',
+            r'drive\.google\.com/open\?id=([^&]+)',
+            r'docs\.google\.com/\w+/d/([^/]+)',
+            r'id=([^&]+)',
+            r'drive\.google\.com/uc\?id=([^&]+)',
+        ]
+
+        for pattern in url_patterns:
+            match = re.search(pattern, url)
+            if match:
+                file_id = match.group(1)
+                break
+
+        if not file_id:
+            logger.error(f"Could not extract file ID from URL: {url}")
+            return False
+
+        # Approach 1: Try with gdown first (when it works)
+        try:
+            import gdown
+            output = gdown.download(url, save_path, quiet=False, fuzzy=True)
+            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                logger.info(f"Successfully downloaded with gdown: {url}")
+                return True
+        except Exception as e:
+            logger.warning(f"gdown download failed: {e}")
+
+        # Approach 2: Use Playwright session with cookies
+        try:
+            async with self.context.new_page() as page:
+                # Visit the file viewing page to get cookies
+                view_url = f"https://drive.google.com/file/d/{file_id}/view"
+                await page.goto(view_url, wait_until='networkidle', timeout=60000)
+
+                # Check for view-only permissions
+                if await page.query_selector('text="the owner has not granted you permission to download this file"'):
+                    logger.warning("File has view-only permissions, attempting workaround")
+
+                    # Check if it's a PDF (we can use the JS method)
+                    is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
+                    if is_pdf:
+                        # Try JavaScript PDF capture approach for PDFs
+                        success = await self.download_viewonly_pdf_with_js(page, save_path)
+                        if success:
+                            return True
+
+                    # Try direct download attempt for view-only files
+                    cookies = await page.context.cookies()
+                    cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+
+                    # Try download URL with custom headers and cookies
+                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
+                    await page.goto(download_url, wait_until='networkidle', timeout=60000)
+
+                    headers = {
+                        'User-Agent': get_random_user_agent(),
+                        'Cookie': cookie_str,
+                        'Accept': '*/*',
+                    }
+
+                    response = await page.request.get(download_url, headers=headers)
+                    if response.status == 200:
+                        content = await response.body()
+                        with open(save_path, 'wb') as f:
+                            f.write(content)
+                        return True
+
+                # Standard download flow for files with download permission
+                download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                await page.goto(download_url, wait_until='networkidle', timeout=60000)
+
+                # Handle large files with confirmation
+                confirm_form = await page.query_selector('form#download-form')
+                if confirm_form:
+                    await confirm_form.evaluate('form => form.submit()')
+                    await page.wait_for_load_state('networkidle')
+
+                # Get cookies after confirmation
+                cookies = await page.context.cookies()
+                cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+
+                # Get final download URL with confirmation token
+                download_url = f"https://drive.google.com/uc?export=download&id={file_id}&confirm=t"
+
+                response = await page.request.get(download_url, headers={'Cookie': cookie_str})
+                if response.status == 200:
+                    content = await response.body()
+                    with open(save_path, 'wb') as f:
+                        f.write(content)
+                    return True
+        except Exception as e:
+            logger.warning(f"Playwright download approach failed: {e}")
+
+        # Approach 3: Try with requests and session cookies
+        try:
+            import requests
+
+            session = requests.Session()
+            session.headers.update({'User-Agent': get_random_user_agent()})
+
+            # Get the initial page to obtain cookies
+            url = f"https://drive.google.com/uc?id={file_id}&export=download"
+            response = session.get(url, stream=True, timeout=30)
+
+            # Check for the download confirmation
+            confirmation_token = None
+            for k, v in response.cookies.items():
+                if k.startswith('download_warning'):
+                    confirmation_token = v
+                    break
+
+            # Use the confirmation token if found
+            if confirmation_token:
+                url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm={confirmation_token}"
+
+            # Download the file
+            response = session.get(url, stream=True, timeout=60)
+            with open(save_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=1024*1024):
+                    if chunk:
+                        f.write(chunk)
+
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                return True
+        except Exception as e:
+            logger.warning(f"Requests session download failed: {e}")
+
+        # All approaches failed
+        logger.error(f"All download attempts failed for: {url}")
+        return False
+
+    async def download_viewonly_pdf_with_js(self, page, save_path):
+        """Use JavaScript approach to download view-only PDFs from Google Drive"""
+        try:
+            logger.info("Attempting to download view-only PDF using JavaScript method")
+
+            # Scroll to ensure all pages are loaded
+            await page.evaluate("""
+                async function scrollToBottom() {
+                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                    const container = document.querySelector('.drive-viewer-paginated-scrollable');
+                    if (!container) return;
+
+                    const scrollHeight = container.scrollHeight;
+                    const viewportHeight = container.clientHeight;
+                    const scrollStep = viewportHeight / 2;
+
+                    for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
+                        container.scrollTo(0, scrollPos);
+                        await delay(500);
+                    }
+
+                    // Final scroll to ensure we reached the bottom
+                    container.scrollTo(0, scrollHeight);
+                    await delay(1000);
+                }
+
+                return scrollToBottom();
+            """)
+
+            # Wait for a moment to ensure all images are loaded
+            await page.wait_for_timeout(3000)
+
+            # Inject the jsPDF library
+            await page.evaluate("""
+                return new Promise((resolve, reject) => {
+                    const script = document.createElement('script');
+                    script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
+                    script.onload = () => resolve(true);
+                    script.onerror = () => reject(new Error('Failed to load jsPDF'));
+                    document.head.appendChild(script);
+                });
+            """)
+
+            # Wait for the library to load
+            await page.wait_for_timeout(1000)
+
+            # Execute the PDF creation script
+            pdf_data = await page.evaluate("""
+                return new Promise(async (resolve) => {
+                    // Make sure jsPDF is loaded
+                    if (typeof window.jspdf === 'undefined') {
+                        window.jspdf = window.jspdf || {};
+                    }
+
+                    // Use the jsPDF library
+                    const { jsPDF } = window.jspdf;
+                    const pdf = new jsPDF();
+
+                    const images = Array.from(document.querySelectorAll('img')).filter(img =>
+                        img.src.startsWith('blob:') && img.width > 100 && img.height > 100
+                    );
+
+                    if (images.length === 0) {
+                        resolve(null);
+                        return;
+                    }
+
+                    for (let i = 0; i < images.length; i++) {
+                        const img = images[i];
+
+                        // Create canvas and draw image
+                        const canvas = document.createElement('canvas');
+                        canvas.width = img.width;
+                        canvas.height = img.height;
+                        const ctx = canvas.getContext('2d');
+                        ctx.drawImage(img, 0, 0, img.width, img.height);
+
+                        // Add image to PDF
+                        const imgData = canvas.toDataURL('image/jpeg', 1.0);
+
+                        // Add a new page for each image except the first one
+                        if (i > 0) {
+                            pdf.addPage();
+                        }
+
+                        // Calculate dimensions to fit page
+                        const pageWidth = pdf.internal.pageSize.getWidth();
+                        const pageHeight = pdf.internal.pageSize.getHeight();
+                        const imgRatio = img.height / img.width;
+
+                        let imgWidth = pageWidth;
+                        let imgHeight = imgWidth * imgRatio;
+
+                        // If height exceeds page, scale down
+                        if (imgHeight > pageHeight) {
+                            imgHeight = pageHeight;
+                            imgWidth = imgHeight / imgRatio;
+                        }
+
+                        // Center image on page
+                        const x = (pageWidth - imgWidth) / 2;
+                        const y = (pageHeight - imgHeight) / 2;
+
+                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                    }
+
+                    // Get the PDF as base64
+                    const pdfBase64 = pdf.output('datauristring');
+                    resolve(pdfBase64);
+                });
+            """)
+
+            if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
+                logger.warning("Failed to generate PDF with JavaScript method")
+                return False
+
+            # Extract the base64 data and save to file
+            base64_data = pdf_data.replace('data:application/pdf;base64,', '')
+            pdf_bytes = base64.b64decode(base64_data)
+
+            with open(save_path, 'wb') as f:
+                f.write(pdf_bytes)
+
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                logger.info("Successfully downloaded view-only PDF using JavaScript method")
+                return True
+            else:
+                return False
+
+        except Exception as e:
+            logger.error(f"Error in JavaScript PDF download method: {e}")
+            return False
+
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list:
             custom_ext_list = []
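
For reference, a minimal standalone sketch (not part of the commit) of how the `url_patterns` list in `download_from_google_drive` resolves a file ID for common Drive URL shapes. The `extract_drive_file_id` helper name and the sample URLs (with the placeholder ID `abc123`) are illustrative assumptions, not code from app.py:

```python
import re

# Same patterns as in download_from_google_drive; order matters (most specific first)
URL_PATTERNS = [
    r'drive\.google\.com/file/d/([^/]+)',
    r'drive\.google\.com/open\?id=([^&]+)',
    r'docs\.google\.com/\w+/d/([^/]+)',
    r'id=([^&]+)',
    r'drive\.google\.com/uc\?id=([^&]+)',
]

def extract_drive_file_id(url):
    """Return the first captured file ID, or None if no pattern matches."""
    for pattern in URL_PATTERNS:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

if __name__ == "__main__":
    # Hypothetical example URLs; "abc123" is a placeholder file ID
    for sample in [
        "https://drive.google.com/file/d/abc123/view?usp=sharing",
        "https://drive.google.com/open?id=abc123",
        "https://docs.google.com/document/d/abc123/edit",
        "https://drive.google.com/uc?id=abc123&export=download",
    ]:
        print(sample, "->", extract_drive_file_id(sample))
```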
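
And a brief usage sketch, also not from the commit, of how a caller might route URLs through the new coroutine. Here `manager` stands for an already-initialized DownloadManager (its construction is outside this hunk), and `fetch_drive_file` plus the placeholder filename are hypothetical:

```python
import os

async def fetch_drive_file(manager, file_url, save_dir="downloads"):
    """Route a Drive/Docs URL through download_from_google_drive; return a path or None."""
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, "drive_download.bin")  # placeholder filename
    if "drive.google.com" in file_url or "docs.google.com" in file_url:
        ok = await manager.download_from_google_drive(file_url, path)
        return path if ok else None
    return None  # non-Drive URLs would follow the original Playwright path

# e.g. inside an existing event loop:
#     result = await fetch_drive_file(manager, "https://drive.google.com/file/d/<id>/view")
```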