euler314 committed on
Commit
82c1030
·
verified ·
1 Parent(s): 907ffd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -131
app.py CHANGED
@@ -920,177 +920,248 @@ class DownloadManager:
920
  return False
921
 
922
  async def download_viewonly_pdf_with_js(self, file_id, save_path):
923
- """Download view-only PDF using JavaScript approach - improved version"""
924
  try:
925
- async with self.context.new_page() as page:
926
- # Set viewport size to ensure we capture full pages
927
- await page.set_viewport_size({"width": 1200, "height": 1600})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928
 
929
- # Visit the file
930
- view_url = f"https://drive.google.com/file/d/{file_id}/view"
931
- await page.goto(view_url, wait_until='networkidle', timeout=60000)
 
 
932
 
933
- # Wait for initial rendering
934
- await page.wait_for_timeout(2000)
935
 
936
- # CRITICAL: Scroll through entire document to ensure all content is cached
937
- await page.evaluate("""
938
- async function scrollThroughDocument() {
939
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
940
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
941
- if (!container) return false;
942
 
943
- // Get total scroll height
944
- const scrollHeight = container.scrollHeight;
 
 
 
 
 
 
 
 
 
945
  const viewportHeight = container.clientHeight;
 
 
 
 
 
946
 
947
- // Scroll down in increments to load all pages
948
- for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += viewportHeight) {
949
- container.scrollTo(0, scrollPos);
950
- await delay(800); // Wait for content to load
 
 
 
 
 
 
951
  }
952
 
953
- // One final scroll to bottom to ensure everything is loaded
954
- container.scrollTo(0, scrollHeight);
955
  await delay(1500);
956
 
957
  // Scroll back to top for PDF creation
958
  container.scrollTo(0, 0);
959
- await delay(800);
960
 
961
  return true;
962
  }
963
- return scrollThroughDocument();
964
  """)
965
 
966
- # Use simplified script similar to the one provided
967
- pdf_base64 = await page.evaluate("""
968
- async function createPDF() {
 
 
 
 
 
 
 
 
969
  try {
970
- // Create jsPDF script element
971
- const loadJsPDF = () => new Promise((resolve, reject) => {
972
  let jspdf = document.createElement("script");
973
- jspdf.onload = () => resolve();
974
- jspdf.onerror = () => reject(new Error("Failed to load jsPDF"));
975
- jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
976
- document.body.appendChild(jspdf);
977
- });
978
-
979
- await loadJsPDF();
980
-
981
- // Create PDF
982
- const { jsPDF } = window.jspdf;
983
- let pdf = new jsPDF();
984
- let elements = document.getElementsByTagName("img");
985
- let pageCount = 0;
986
-
987
- // First pass to find and sort all valid page images
988
- let pageImages = [];
989
- for (let i = 0; i < elements.length; i++) {
990
- let img = elements[i];
991
- // Only process blob images (these are the PDF pages)
992
- if (!/^blob:/.test(img.src)) continue;
993
-
994
- // Skip tiny images (usually icons, not content)
995
- if (img.width < 100 || img.height < 100) continue;
996
-
997
- pageImages.push(img);
998
- }
999
-
1000
- // Sort images by their position if possible
1001
- try {
1002
- pageImages.sort((a, b) => {
1003
- const rectA = a.getBoundingClientRect();
1004
- const rectB = b.getBoundingClientRect();
1005
- return rectA.top - rectB.top;
1006
- });
1007
- } catch (e) {
1008
- console.error("Error sorting images:", e);
1009
- }
1010
-
1011
- // Process each image as a page
1012
- for (let i = 0; i < pageImages.length; i++) {
1013
- let img = pageImages[i];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1014
 
1015
- // Create canvas to draw the image
1016
- let canvasElement = document.createElement('canvas');
1017
- let con = canvasElement.getContext("2d");
1018
- canvasElement.width = img.width;
1019
- canvasElement.height = img.height;
1020
 
1021
- // Draw image to canvas
1022
- con.drawImage(img, 0, 0, img.width, img.height);
1023
-
1024
- // Add image to PDF
1025
- let imgData = canvasElement.toDataURL("image/jpeg", 0.95);
1026
-
1027
- // Add a new page for each page after the first
1028
- if (pageCount > 0) {
1029
- pdf.addPage();
1030
- }
1031
-
1032
- // Calculate dimensions to fit the page
1033
- const pageWidth = pdf.internal.pageSize.getWidth();
1034
- const pageHeight = pdf.internal.pageSize.getHeight();
1035
- const imgRatio = img.height / img.width;
1036
-
1037
- let imgWidth = pageWidth;
1038
- let imgHeight = imgWidth * imgRatio;
1039
-
1040
- if (imgHeight > pageHeight) {
1041
- imgHeight = pageHeight;
1042
- imgWidth = imgHeight / imgRatio;
1043
- }
1044
-
1045
- // Center on page
1046
- const x = (pageWidth - imgWidth) / 2;
1047
- const y = (pageHeight - imgHeight) / 2;
1048
-
1049
- pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
1050
- pageCount++;
1051
- }
1052
-
1053
- if (pageCount === 0) {
1054
- return null; // No pages found
1055
- }
1056
-
1057
- // Return as base64
1058
- return pdf.output('datauristring');
1059
  } catch (e) {
1060
- console.error("PDF creation error:", e);
1061
- return null;
1062
  }
1063
  }
1064
- return createPDF();
 
1065
  """)
1066
 
1067
- if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
1068
- logger.warning("PDF creation script failed, trying fallback method")
1069
- return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
 
 
 
 
 
 
 
 
1070
 
1071
- # Save the PDF from base64
1072
  try:
1073
- base64_data = pdf_base64.replace('data:application/pdf;base64,', '')
1074
  pdf_bytes = base64.b64decode(base64_data)
1075
 
1076
  with open(save_path, 'wb') as f:
1077
  f.write(pdf_bytes)
1078
 
1079
- # Verify file is not empty
 
 
 
1080
  if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
1081
- logger.info(f"Successfully saved PDF to {save_path}")
1082
  return True
1083
  else:
1084
- logger.warning(f"Generated PDF is too small, using fallback method")
1085
- return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1086
  except Exception as e:
1087
- logger.error(f"Error saving PDF: {e}")
1088
- return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1089
-
 
 
 
1090
  except Exception as e:
1091
- logger.error(f"Error in view-only PDF download: {e}")
1092
- # Try fallback method
1093
- return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1094
 
1095
  async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
1096
  """Download any view-only file by taking screenshots"""
 
920
  return False
921
 
922
  async def download_viewonly_pdf_with_js(self, file_id, save_path):
923
+ """Improved method that replicates the manual process for downloading view-only PDFs"""
924
  try:
925
+ # Create a fresh browser context with extended timeout
926
+ browser = await self.playwright.chromium.launch(
927
+ headless=True,
928
+ args=[
929
+ '--no-sandbox',
930
+ '--disable-setuid-sandbox',
931
+ '--disable-dev-shm-usage',
932
+ '--disable-web-security'
933
+ ]
934
+ )
935
+
936
+ # Use high DPI for better quality
937
+ context = await browser.new_context(
938
+ viewport={'width': 1600, 'height': 1200},
939
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
940
+ device_scale_factor=2.0,
941
+ timeout=120000 # Longer timeout
942
+ )
943
+
944
+ page = await context.new_page()
945
+
946
+ try:
947
+ logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
948
+
949
+ # Step 1: Navigate to the PDF and wait for it to load fully
950
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
951
+ await page.wait_for_load_state('networkidle')
952
+ await page.wait_for_timeout(3000) # Additional wait for JavaScript to initialize
953
 
954
+ # Check if we have a PDF viewer
955
+ viewer_loaded = await page.query_selector('.drive-viewer-paginated-scrollable, .drive-viewer-paginated-page')
956
+ if not viewer_loaded:
957
+ logger.warning("PDF viewer not detected. This might not be a PDF or might be using a different viewer.")
958
+ # Continue anyway, as it might just be a different CSS class
959
 
960
+ # Step 2: Scroll through the entire document to ensure all pages are loaded
961
+ logger.info("Scrolling through document to load all pages into cache...")
962
 
963
+ # This is CRITICAL - scroll all the way down to ensure all pages are loaded and cached
964
+ scroll_success = await page.evaluate("""
965
+ async function scrollThroughEntireDocument() {
966
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
 
 
967
 
968
+ // Try multiple container selectors that might exist in Google Drive
969
+ const container = document.querySelector('.drive-viewer-paginated-scrollable') ||
970
+ document.querySelector('.drive-viewer-container');
971
+
972
+ if (!container) {
973
+ console.log('No scroll container found');
974
+ return false;
975
+ }
976
+
977
+ // Get total height to scroll
978
+ const totalHeight = container.scrollHeight;
979
  const viewportHeight = container.clientHeight;
980
+ console.log(`Document height: ${totalHeight}px, Viewport: ${viewportHeight}px`);
981
+
982
+ // First scroll quickly to the bottom to trigger loading all content
983
+ container.scrollTo(0, totalHeight);
984
+ await delay(2000);
985
 
986
+ // Then scroll gradually to ensure everything is properly loaded
987
+ const scrollSteps = 20; // Number of steps to divide the scroll
988
+ const stepSize = totalHeight / scrollSteps;
989
+
990
+ // Scroll down in steps
991
+ for (let i = 0; i < scrollSteps; i++) {
992
+ const targetPos = i * stepSize;
993
+ container.scrollTo(0, targetPos);
994
+ console.log(`Scrolled to ${targetPos}px`);
995
+ await delay(300); // Wait between scrolls
996
  }
997
 
998
+ // Final scroll to the very bottom
999
+ container.scrollTo(0, totalHeight);
1000
  await delay(1500);
1001
 
1002
  // Scroll back to top for PDF creation
1003
  container.scrollTo(0, 0);
1004
+ await delay(1000);
1005
 
1006
  return true;
1007
  }
1008
+ return scrollThroughEntireDocument();
1009
  """)
1010
 
1011
+ if not scroll_success:
1012
+ logger.warning("Scrolling may not have completed successfully. Will try to download anyway.")
1013
+
1014
+ # Step 3: Wait to ensure all content is properly loaded after scrolling
1015
+ await page.wait_for_timeout(2000)
1016
+
1017
+ # Step 4: Execute the jsPDF script, similar to the manual process
1018
+ logger.info("Executing jsPDF script to create and download PDF...")
1019
+
1020
+ pdf_result = await page.evaluate("""
1021
+ async function downloadPDFWithJsPDF() {
1022
  try {
1023
+ // Create and load jsPDF script
1024
+ return new Promise((resolve, reject) => {
1025
  let jspdf = document.createElement("script");
1026
+ jspdf.onload = function() {
1027
+ try {
1028
+ // This is the core PDF creation logic
1029
+ let pdf = new jsPDF();
1030
+ let elements = document.getElementsByTagName("img");
1031
+ let pageCount = 0;
1032
+
1033
+ // First collect and sort the images
1034
+ let validImages = [];
1035
+ for (let i = 0; i < elements.length; i++) {
1036
+ let img = elements[i];
1037
+ // Only include blob images (PDF page images)
1038
+ if (!/^blob:/.test(img.src)) {
1039
+ continue;
1040
+ }
1041
+ // Exclude small images (usually icons)
1042
+ if (img.width < 100 || img.height < 100) {
1043
+ continue;
1044
+ }
1045
+ validImages.push(img);
1046
+ }
1047
+
1048
+ // Sort by position from top to bottom
1049
+ validImages.sort((a, b) => {
1050
+ let rectA = a.getBoundingClientRect();
1051
+ let rectB = b.getBoundingClientRect();
1052
+ return rectA.top - rectB.top;
1053
+ });
1054
+
1055
+ console.log(`Found ${validImages.length} valid page images`);
1056
+ if (validImages.length === 0) {
1057
+ reject("No valid PDF page images found");
1058
+ return;
1059
+ }
1060
+
1061
+ // Process each image
1062
+ for (let i = 0; i < validImages.length; i++) {
1063
+ let img = validImages[i];
1064
+
1065
+ // Create canvas and draw image
1066
+ let canvasElement = document.createElement('canvas');
1067
+ let con = canvasElement.getContext('2d');
1068
+ canvasElement.width = img.width;
1069
+ canvasElement.height = img.height;
1070
+
1071
+ try {
1072
+ // Draw the image to canvas
1073
+ con.drawImage(img, 0, 0, img.width, img.height);
1074
+
1075
+ // Convert to JPEG
1076
+ let imgData = canvasElement.toDataURL("image/jpeg", 1.0);
1077
+
1078
+ // Add a new page for each page after the first
1079
+ if (pageCount > 0) {
1080
+ pdf.addPage();
1081
+ }
1082
+
1083
+ // Add image to PDF
1084
+ pdf.addImage(imgData, 'JPEG', 0, 0, pdf.internal.pageSize.getWidth(), pdf.internal.pageSize.getHeight());
1085
+ pageCount++;
1086
+ } catch (e) {
1087
+ console.error("Error processing image:", e);
1088
+ }
1089
+ }
1090
+
1091
+ if (pageCount === 0) {
1092
+ reject("Failed to add any pages to PDF");
1093
+ return;
1094
+ }
1095
+
1096
+ // Return PDF as data URL
1097
+ let pdfOutput = pdf.output('datauristring');
1098
+ resolve({
1099
+ success: true,
1100
+ data: pdfOutput,
1101
+ pageCount: pageCount
1102
+ });
1103
+ } catch (e) {
1104
+ console.error("Error in PDF creation:", e);
1105
+ reject("Error creating PDF: " + e.message);
1106
+ }
1107
+ };
1108
 
1109
+ jspdf.onerror = function() {
1110
+ reject("Failed to load jsPDF library");
1111
+ };
 
 
1112
 
1113
+ // Use a reliable CDN for jsPDF
1114
+ jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.3.2/jspdf.min.js';
1115
+ document.body.appendChild(jspdf);
1116
+ });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1117
  } catch (e) {
1118
+ console.error("Overall error:", e);
1119
+ return { success: false, error: e.message };
1120
  }
1121
  }
1122
+
1123
+ return downloadPDFWithJsPDF();
1124
  """)
1125
 
1126
+ # Step 5: Process the result
1127
+ if not pdf_result or not isinstance(pdf_result, dict) or not pdf_result.get('success'):
1128
+ error_msg = pdf_result.get('error') if isinstance(pdf_result, dict) else "Unknown error"
1129
+ logger.error(f"Failed to create PDF: {error_msg}")
1130
+ return False
1131
+
1132
+ # Extract base64 data
1133
+ pdf_data = pdf_result.get('data')
1134
+ if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
1135
+ logger.error("Invalid PDF data returned")
1136
+ return False
1137
 
1138
+ # Save the PDF
1139
  try:
1140
+ base64_data = pdf_data.replace('data:application/pdf;base64,', '')
1141
  pdf_bytes = base64.b64decode(base64_data)
1142
 
1143
  with open(save_path, 'wb') as f:
1144
  f.write(pdf_bytes)
1145
 
1146
+ page_count = pdf_result.get('pageCount', 0)
1147
+ logger.info(f"Successfully saved PDF with {page_count} pages to {save_path}")
1148
+
1149
+ # Verify file
1150
  if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
 
1151
  return True
1152
  else:
1153
+ logger.error("Generated PDF file is too small or empty")
1154
+ return False
1155
  except Exception as e:
1156
+ logger.error(f"Error saving PDF file: {e}")
1157
+ return False
1158
+
1159
+ finally:
1160
+ await browser.close()
1161
+
1162
  except Exception as e:
1163
+ logger.error(f"Error in viewonly PDF download process: {e}")
1164
+ return False
 
1165
 
1166
  async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
1167
  """Download any view-only file by taking screenshots"""