euler314 committed on
Commit
ed38edb
·
verified ·
1 Parent(s): b9e60db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -78
app.py CHANGED
@@ -352,29 +352,26 @@ class DownloadManager:
352
  file_id = match.group(1)
353
  break
354
  if file_id:
355
- # We'll detect file type during download, so just use the ID for filename initially
 
 
 
356
  filename = f"gdrive_{file_id}"
357
- try:
358
- # Get file info to determine type and size
359
- file_type, is_view_only = await self.get_google_drive_file_info(file_id)
360
- if file_type:
361
- filename = f"{filename}.{file_type}"
362
-
363
- found_files.append({
364
- 'url': href, # Use original URL, as we'll process it specially
365
- 'filename': filename,
366
- 'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"),
367
- 'metadata': {'view_only': is_view_only, 'file_type': file_type, 'file_id': file_id}
368
- })
369
- except Exception as e:
370
- logger.error(f"Error processing Google Drive link: {e}")
371
- # Fallback if we can't get info
372
- found_files.append({
373
- 'url': href,
374
- 'filename': filename,
375
- 'size': "Unknown Size",
376
- 'metadata': {'file_id': file_id}
377
- })
378
 
379
  seen_urls = set()
380
  unique_files = []
@@ -397,13 +394,33 @@ class DownloadManager:
397
  path = os.path.join(save_dir, f"{base}_{counter}{ext}")
398
  counter += 1
399
  os.makedirs(save_dir, exist_ok=True)
 
400
  try:
401
  # Special handling for Google Drive files
402
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
403
- # Use enhanced Google Drive downloader
 
 
 
 
 
 
 
 
 
 
 
 
 
404
  success = await self.download_from_google_drive(file_url, path)
405
- return path if success else None
 
406
 
 
 
 
 
 
407
  # Original code for non-Google Drive downloads
408
  async with self.context.new_page() as page:
409
  headers = {
@@ -424,6 +441,213 @@ class DownloadManager:
424
  logger.error(f"Error downloading {file_url}: {e}")
425
  return None
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  async def download_from_google_drive(self, url, save_path):
428
  """Enhanced method to download from Google Drive with multiple fallback approaches"""
429
  # Extract the file ID from different URL formats
@@ -531,60 +755,7 @@ class DownloadManager:
531
  except Exception as e:
532
  logger.warning(f"Requests session download failed: {e}")
533
 
534
- # If all methods failed for view-only file, try one last approach
535
- if is_view_only:
536
- try:
537
- # Try a direct headless browser download
538
- async with self.context.new_page() as page:
539
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
540
-
541
- # Try to capture the content directly from viewer
542
- file_content = await page.evaluate("""
543
- () => {
544
- // Try to find the actual viewer content
545
- const viewerContent = document.querySelector('.drive-viewer-paginated-content');
546
- if (viewerContent) {
547
- return viewerContent.innerHTML;
548
- }
549
- return document.documentElement.innerHTML;
550
- }
551
- """)
552
-
553
- if file_content:
554
- # Save as HTML and then we can convert it if needed
555
- html_path = f"{base}.html"
556
- with open(html_path, 'w', encoding='utf-8') as f:
557
- f.write(f"""
558
- <!DOCTYPE html>
559
- <html>
560
- <head><title>Google Drive Extracted Content</title></head>
561
- <body>
562
- {file_content}
563
- </body>
564
- </html>
565
- """)
566
-
567
- # If requested a PDF, convert HTML to PDF
568
- if file_type == 'pdf' or ext.lower() == '.pdf':
569
- try:
570
- import pdfkit
571
- pdfkit.from_file(html_path, save_path)
572
- os.remove(html_path) # Clean up HTML file
573
- return True
574
- except Exception as pdf_err:
575
- logger.warning(f"Error converting HTML to PDF: {pdf_err}")
576
- # Keep the HTML file as fallback
577
- shutil.copy(html_path, save_path)
578
- return True
579
- else:
580
- # Just use the HTML file
581
- shutil.copy(html_path, save_path)
582
- return True
583
- except Exception as e:
584
- logger.warning(f"Final direct browser capture failed: {e}")
585
-
586
- # All methods failed
587
- logger.error(f"All download approaches failed for Google Drive file: {file_id}")
588
  return False
589
 
590
  async def get_google_drive_file_info(self, file_id):
 
352
  file_id = match.group(1)
353
  break
354
  if file_id:
355
+ # Get file info to determine type and view-only status
356
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
357
+
358
+ # Create a more informative filename based on info
359
  filename = f"gdrive_{file_id}"
360
+ if file_type:
361
+ filename = f"{filename}.{file_type}"
362
+
363
+ size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
364
+
365
+ found_files.append({
366
+ 'url': href, # Use original URL
367
+ 'filename': filename,
368
+ 'size': size_str,
369
+ 'metadata': {
370
+ 'view_only': is_view_only,
371
+ 'file_type': file_type,
372
+ 'file_id': file_id
373
+ }
374
+ })
 
 
 
 
 
 
375
 
376
  seen_urls = set()
377
  unique_files = []
 
394
  path = os.path.join(save_dir, f"{base}_{counter}{ext}")
395
  counter += 1
396
  os.makedirs(save_dir, exist_ok=True)
397
+
398
  try:
399
  # Special handling for Google Drive files
400
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
401
+ # Check if it's marked as view-only in metadata
402
+ is_view_only = file_info.get('metadata', {}).get('view_only', False)
403
+
404
+ # For view-only files, try our most robust approach first
405
+ if is_view_only:
406
+ logger.info(f"Attempting to download view-only file: {file_url}")
407
+ result_path = await self.force_download_viewonly(file_info, path)
408
+ if result_path:
409
+ return result_path
410
+
411
+ # If that failed, try the regular download approach
412
+ logger.info("Primary method failed, trying fallback methods")
413
+
414
+ # Try regular download methods
415
  success = await self.download_from_google_drive(file_url, path)
416
+ if success:
417
+ return path
418
 
419
+ # If all methods failed for Google Drive, try one last approach
420
+ logger.warning("All standard methods failed, attempting force download")
421
+ result_path = await self.force_download_viewonly(file_info, path)
422
+ return result_path if result_path else None
423
+
424
  # Original code for non-Google Drive downloads
425
  async with self.context.new_page() as page:
426
  headers = {
 
441
  logger.error(f"Error downloading {file_url}: {e}")
442
  return None
443
 
444
async def force_download_viewonly(self, file_info, save_path):
    """Last-resort capture of a view-only Google Drive file via screenshots.

    Opens the Drive viewer for the file in a dedicated headless Chromium
    instance and screenshots what is rendered. When the viewer exposes an
    embedded PDF, every paginated page is captured and the images are
    combined into one PDF; otherwise a single full-page screenshot is saved
    (converted to PDF when the target extension is ``.pdf``).

    Args:
        file_info: Mapping with a ``'url'`` entry holding the Drive link and
            an optional ``'metadata'`` mapping whose ``'file_type'`` entry is
            used as the extension when ``save_path`` has none.
        save_path: Destination path. If it has no extension, one is appended
            (``file_type`` from the metadata, falling back to ``pdf``).

    Returns:
        The path that was written (possibly ``save_path`` with an extension
        appended), or ``None`` if the file ID could not be extracted, the
        capture failed, or the resulting file is missing or empty. Errors
        are logged and never propagated to the caller.
    """

    def images_to_pdf(image_paths, pdf_path):
        # One PDF page per image; the page size follows the first image, as
        # every viewport screenshot shares the same dimensions.
        from PIL import Image
        from reportlab.pdfgen import canvas as pdf_canvas

        with Image.open(image_paths[0]) as first_image:
            width, height = first_image.size
        pdf = pdf_canvas.Canvas(pdf_path, pagesize=(width, height))
        for image_path in image_paths:
            pdf.drawImage(image_path, 0, 0, width, height)
            pdf.showPage()
        pdf.save()

    browser = None
    try:
        # Extract file ID from URL
        url = file_info['url']
        file_id = None
        for pattern in (r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)'):
            match = re.search(pattern, url)
            if match:
                file_id = match.group(1)
                break

        if not file_id:
            logger.error("Could not extract file ID")
            return None

        logger.info(f"Force downloading view-only file with ID: {file_id}")

        # Make sure we have the proper file extension. The metadata may carry
        # an explicit ``file_type`` of None, so fall back with ``or`` rather
        # than relying on the ``dict.get`` default (which would give ".None").
        base, ext = os.path.splitext(save_path)
        if not ext:
            metadata = file_info.get('metadata') or {}
            file_type = metadata.get('file_type') or 'pdf'
            ext = f".{file_type}"
            save_path = f"{base}{ext}"

        # Launch a separate browser with a large, high-DPI viewport so the
        # screenshots are sharp enough to read.
        browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
        )
        context = await browser.new_context(
            viewport={'width': 1600, 'height': 1200},
            user_agent=get_random_user_agent(),
            device_scale_factor=2.0  # Higher resolution for better quality
        )
        page = await context.new_page()

        try:
            # Navigate to the file
            await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
                            wait_until='networkidle',
                            timeout=60000)

            # Wait for content to load fully
            await page.wait_for_timeout(5000)

            # Check if it's a PDF
            is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None

            # The temporary directory (and every screenshot in it) is removed
            # even if the capture fails part-way through.
            with tempfile.TemporaryDirectory() as temp_dir:
                if is_pdf:
                    logger.info("Detected PDF, using page-by-page screenshot approach")

                    # Scroll through the document so all pages get loaded.
                    # The script must be a function expression: a bare
                    # declaration followed by ``return`` is not valid for
                    # page.evaluate and would raise a syntax error.
                    await page.evaluate("""
                        async () => {
                            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                            const container = document.querySelector('.drive-viewer-paginated-scrollable');
                            if (!container) return;

                            // First scroll to bottom to load all pages
                            container.scrollTo(0, container.scrollHeight);
                            await delay(2000);

                            // Then back to top
                            container.scrollTo(0, 0);
                            await delay(1000);
                        }
                    """)

                    # Count pages
                    page_count = await page.evaluate("""
                        () => document.querySelectorAll('.drive-viewer-paginated-page').length
                    """)

                    if page_count == 0:
                        logger.warning("No pages found, trying alternative method")
                        # Take a screenshot of the entire page instead
                        screenshot_path = os.path.join(temp_dir, "page.png")
                        await page.screenshot(path=screenshot_path, full_page=True)
                        images_to_pdf([screenshot_path], save_path)
                    else:
                        screenshots = []
                        for index in range(page_count):
                            # Scroll the page into view and let it render
                            await page.evaluate("""
                                async (index) => {
                                    const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                                    if (pages.length <= index) return;
                                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                                    pages[index].scrollIntoView();
                                    await delay(500);
                                }
                            """, index)

                            # Nudge the viewer so the page sits near the top
                            # of the viewport before capturing it
                            await page.evaluate("""
                                (index) => {
                                    const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                                    const page = pages[index];
                                    const viewer = document.querySelector('.drive-viewer-paginated-scrollable');
                                    if (page && viewer) {
                                        const rect = page.getBoundingClientRect();
                                        viewer.scrollBy(0, rect.top - 100);
                                    }
                                }
                            """, index)

                            screenshot_path = os.path.join(temp_dir, f"page_{index + 1}.png")
                            await page.screenshot(path=screenshot_path)
                            screenshots.append(screenshot_path)

                        # Combine screenshots into PDF
                        images_to_pdf(screenshots, save_path)
                else:
                    # For other file types: take a single screenshot
                    screenshot_path = os.path.join(temp_dir, "screenshot.png")
                    await page.screenshot(path=screenshot_path, full_page=True)

                    if ext.lower() == '.pdf':
                        images_to_pdf([screenshot_path], save_path)
                    else:
                        # Just copy the screenshot with the appropriate extension
                        shutil.copy(screenshot_path, save_path)
        except Exception as e:
            logger.error(f"Error during force download: {e}")
            return None

        # Verify file exists and is not empty
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            logger.info(f"Successfully downloaded view-only file to {save_path}")
            return save_path

        logger.error(f"Failed to create valid file at {save_path}")
        return None

    except Exception as e:
        logger.error(f"Force download failed: {e}")
        return None
    finally:
        # Always release the browser, including when context/page creation
        # failed before the capture started.
        if browser is not None:
            try:
                await browser.close()
            except Exception as close_err:
                logger.warning(f"Error closing browser: {close_err}")
650
+
651
  async def download_from_google_drive(self, url, save_path):
652
  """Enhanced method to download from Google Drive with multiple fallback approaches"""
653
  # Extract the file ID from different URL formats
 
755
  except Exception as e:
756
  logger.warning(f"Requests session download failed: {e}")
757
 
758
+ logger.warning("Standard download methods failed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
759
  return False
760
 
761
  async def get_google_drive_file_info(self, file_id):