Update app.py
Browse files
app.py
CHANGED
@@ -920,177 +920,248 @@ class DownloadManager:
|
|
920 |
return False
|
921 |
|
922 |
async def download_viewonly_pdf_with_js(self, file_id, save_path):
|
923 |
-
"""
|
924 |
try:
|
925 |
-
|
926 |
-
|
927 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
928 |
|
929 |
-
#
|
930 |
-
|
931 |
-
|
|
|
|
|
932 |
|
933 |
-
#
|
934 |
-
|
935 |
|
936 |
-
# CRITICAL
|
937 |
-
await page.evaluate("""
|
938 |
-
async function
|
939 |
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
|
940 |
-
const container = document.querySelector('.drive-viewer-paginated-scrollable');
|
941 |
-
if (!container) return false;
|
942 |
|
943 |
-
//
|
944 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
945 |
const viewportHeight = container.clientHeight;
|
|
|
|
|
|
|
|
|
|
|
946 |
|
947 |
-
//
|
948 |
-
|
949 |
-
|
950 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
951 |
}
|
952 |
|
953 |
-
//
|
954 |
-
container.scrollTo(0,
|
955 |
await delay(1500);
|
956 |
|
957 |
// Scroll back to top for PDF creation
|
958 |
container.scrollTo(0, 0);
|
959 |
-
await delay(
|
960 |
|
961 |
return true;
|
962 |
}
|
963 |
-
return
|
964 |
""")
|
965 |
|
966 |
-
|
967 |
-
|
968 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
969 |
try {
|
970 |
-
// Create jsPDF script
|
971 |
-
|
972 |
let jspdf = document.createElement("script");
|
973 |
-
jspdf.onload = ()
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
-
|
979 |
-
|
980 |
-
|
981 |
-
|
982 |
-
|
983 |
-
|
984 |
-
|
985 |
-
|
986 |
-
|
987 |
-
|
988 |
-
|
989 |
-
|
990 |
-
|
991 |
-
|
992 |
-
|
993 |
-
|
994 |
-
|
995 |
-
|
996 |
-
|
997 |
-
|
998 |
-
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
-
|
1004 |
-
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
-
|
1010 |
-
|
1011 |
-
|
1012 |
-
|
1013 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1014 |
|
1015 |
-
|
1016 |
-
|
1017 |
-
|
1018 |
-
canvasElement.width = img.width;
|
1019 |
-
canvasElement.height = img.height;
|
1020 |
|
1021 |
-
//
|
1022 |
-
|
1023 |
-
|
1024 |
-
|
1025 |
-
let imgData = canvasElement.toDataURL("image/jpeg", 0.95);
|
1026 |
-
|
1027 |
-
// Add a new page for each page after the first
|
1028 |
-
if (pageCount > 0) {
|
1029 |
-
pdf.addPage();
|
1030 |
-
}
|
1031 |
-
|
1032 |
-
// Calculate dimensions to fit the page
|
1033 |
-
const pageWidth = pdf.internal.pageSize.getWidth();
|
1034 |
-
const pageHeight = pdf.internal.pageSize.getHeight();
|
1035 |
-
const imgRatio = img.height / img.width;
|
1036 |
-
|
1037 |
-
let imgWidth = pageWidth;
|
1038 |
-
let imgHeight = imgWidth * imgRatio;
|
1039 |
-
|
1040 |
-
if (imgHeight > pageHeight) {
|
1041 |
-
imgHeight = pageHeight;
|
1042 |
-
imgWidth = imgHeight / imgRatio;
|
1043 |
-
}
|
1044 |
-
|
1045 |
-
// Center on page
|
1046 |
-
const x = (pageWidth - imgWidth) / 2;
|
1047 |
-
const y = (pageHeight - imgHeight) / 2;
|
1048 |
-
|
1049 |
-
pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
|
1050 |
-
pageCount++;
|
1051 |
-
}
|
1052 |
-
|
1053 |
-
if (pageCount === 0) {
|
1054 |
-
return null; // No pages found
|
1055 |
-
}
|
1056 |
-
|
1057 |
-
// Return as base64
|
1058 |
-
return pdf.output('datauristring');
|
1059 |
} catch (e) {
|
1060 |
-
console.error("
|
1061 |
-
return
|
1062 |
}
|
1063 |
}
|
1064 |
-
|
|
|
1065 |
""")
|
1066 |
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1070 |
|
1071 |
-
# Save the PDF
|
1072 |
try:
|
1073 |
-
base64_data =
|
1074 |
pdf_bytes = base64.b64decode(base64_data)
|
1075 |
|
1076 |
with open(save_path, 'wb') as f:
|
1077 |
f.write(pdf_bytes)
|
1078 |
|
1079 |
-
|
|
|
|
|
|
|
1080 |
if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
|
1081 |
-
logger.info(f"Successfully saved PDF to {save_path}")
|
1082 |
return True
|
1083 |
else:
|
1084 |
-
logger.
|
1085 |
-
return
|
1086 |
except Exception as e:
|
1087 |
-
logger.error(f"Error saving PDF: {e}")
|
1088 |
-
return
|
1089 |
-
|
|
|
|
|
|
|
1090 |
except Exception as e:
|
1091 |
-
logger.error(f"Error in
|
1092 |
-
|
1093 |
-
return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
|
1094 |
|
1095 |
async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
|
1096 |
"""Download any view-only file by taking screenshots"""
|
|
|
920 |
return False
|
921 |
|
922 |
async def download_viewonly_pdf_with_js(self, file_id, save_path):
    """Rebuild a view-only Google Drive PDF from the page images shown in the viewer.

    The method opens the Drive viewer for ``file_id`` in a fresh headless
    Chromium instance, scrolls through the document so the viewer renders
    every page, then runs jsPDF inside the page to pack the rendered
    ``blob:`` images into a PDF that is written to ``save_path``.

    Args:
        file_id: Google Drive file id used to build the ``/file/d/<id>/view`` URL.
        save_path: Filesystem path the generated PDF is written to.

    Returns:
        True when a PDF larger than 1000 bytes was written to ``save_path``;
        False on any failure (errors are logged, never raised).
    """
    view_url = f"https://drive.google.com/file/d/{file_id}/view"
    data_uri_prefix = 'data:application/pdf;base64,'

    # Both scripts are written as async arrow functions: Playwright invokes an
    # expression that evaluates to a function and awaits the returned promise.
    # A function declaration followed by a top-level `return` is not a valid
    # expression and fails to evaluate.
    scroll_script = """
    async () => {
        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

        // Try multiple container selectors that might exist in Google Drive
        const container = document.querySelector('.drive-viewer-paginated-scrollable') ||
                          document.querySelector('.drive-viewer-container');

        if (!container) {
            console.log('No scroll container found');
            return false;
        }

        // Get total height to scroll
        const totalHeight = container.scrollHeight;
        const viewportHeight = container.clientHeight;
        console.log(`Document height: ${totalHeight}px, Viewport: ${viewportHeight}px`);

        // First scroll quickly to the bottom to trigger loading all content
        container.scrollTo(0, totalHeight);
        await delay(2000);

        // Then scroll gradually to ensure everything is properly loaded
        const scrollSteps = 20; // Number of steps to divide the scroll
        const stepSize = totalHeight / scrollSteps;

        for (let i = 0; i < scrollSteps; i++) {
            const targetPos = i * stepSize;
            container.scrollTo(0, targetPos);
            console.log(`Scrolled to ${targetPos}px`);
            await delay(300); // Wait between scrolls
        }

        // Final scroll to the very bottom
        container.scrollTo(0, totalHeight);
        await delay(1500);

        // Scroll back to top for PDF creation
        container.scrollTo(0, 0);
        await delay(1000);

        return true;
    }
    """

    pdf_script = """
    async () => {
        try {
            // Load jsPDF and wait for it; awaiting here lets the outer
            // try/catch turn a load failure into a structured result.
            await new Promise((resolve, reject) => {
                let jspdf = document.createElement("script");
                jspdf.onload = () => resolve();
                jspdf.onerror = () => reject(new Error("Failed to load jsPDF library"));
                jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.3.2/jspdf.min.js';
                document.body.appendChild(jspdf);
            });

            let pdf = new jsPDF();
            let elements = document.getElementsByTagName("img");
            let pageCount = 0;

            // First collect the page images
            let validImages = [];
            for (let i = 0; i < elements.length; i++) {
                let img = elements[i];
                // Only include blob images (PDF page images)
                if (!/^blob:/.test(img.src)) {
                    continue;
                }
                // Exclude small images (usually icons)
                if (img.width < 100 || img.height < 100) {
                    continue;
                }
                validImages.push(img);
            }

            // Sort by position from top to bottom
            validImages.sort((a, b) => {
                let rectA = a.getBoundingClientRect();
                let rectB = b.getBoundingClientRect();
                return rectA.top - rectB.top;
            });

            console.log(`Found ${validImages.length} valid page images`);
            if (validImages.length === 0) {
                return { success: false, error: "No valid PDF page images found" };
            }

            // Process each image
            for (let i = 0; i < validImages.length; i++) {
                let img = validImages[i];

                // Create canvas and draw image
                let canvasElement = document.createElement('canvas');
                let con = canvasElement.getContext('2d');
                canvasElement.width = img.width;
                canvasElement.height = img.height;

                try {
                    con.drawImage(img, 0, 0, img.width, img.height);

                    // Convert to JPEG
                    let imgData = canvasElement.toDataURL("image/jpeg", 1.0);

                    // Add a new page for each page after the first
                    if (pageCount > 0) {
                        pdf.addPage();
                    }

                    pdf.addImage(imgData, 'JPEG', 0, 0,
                                 pdf.internal.pageSize.getWidth(),
                                 pdf.internal.pageSize.getHeight());
                    pageCount++;
                } catch (e) {
                    // A single bad image must not abort the whole document
                    console.error("Error processing image:", e);
                }
            }

            if (pageCount === 0) {
                return { success: false, error: "Failed to add any pages to PDF" };
            }

            // Return PDF as data URL
            return {
                success: true,
                data: pdf.output('datauristring'),
                pageCount: pageCount
            };
        } catch (e) {
            console.error("Overall error:", e);
            return { success: false, error: "Error creating PDF: " + e.message };
        }
    }
    """

    try:
        browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-web-security'
            ]
        )

        # Everything after launch sits inside try/finally so the browser is
        # closed even when context or page creation fails.
        try:
            # Use high DPI for better quality
            context = await browser.new_context(
                viewport={'width': 1600, 'height': 1200},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                device_scale_factor=2.0
            )
            # new_context() takes no timeout argument; the longer default
            # timeout is configured on the context instead.
            context.set_default_timeout(120000)

            page = await context.new_page()

            logger.info(f"Opening view-only PDF: {view_url}")

            # Step 1: Navigate to the PDF and wait for it to load fully
            await page.goto(view_url, timeout=60000)
            await page.wait_for_load_state('networkidle')
            await page.wait_for_timeout(3000)  # Additional wait for JavaScript to initialize

            # Check if we have a PDF viewer
            viewer_loaded = await page.query_selector('.drive-viewer-paginated-scrollable, .drive-viewer-paginated-page')
            if not viewer_loaded:
                # Continue anyway, as it might just be a different CSS class
                logger.warning("PDF viewer not detected. This might not be a PDF or might be using a different viewer.")

            # Step 2: Scroll through the entire document so all pages are rendered
            logger.info("Scrolling through document to load all pages into cache...")
            scroll_success = await page.evaluate(scroll_script)
            if not scroll_success:
                logger.warning("Scrolling may not have completed successfully. Will try to download anyway.")

            # Step 3: Wait to ensure all content is properly loaded after scrolling
            await page.wait_for_timeout(2000)

            # Step 4: Build the PDF inside the page with jsPDF
            logger.info("Executing jsPDF script to create and download PDF...")
            pdf_result = await page.evaluate(pdf_script)

            # Step 5: Process the result
            if not isinstance(pdf_result, dict) or not pdf_result.get('success'):
                error_msg = pdf_result.get('error') if isinstance(pdf_result, dict) else "Unknown error"
                logger.error(f"Failed to create PDF: {error_msg}")
                return False

            pdf_data = pdf_result.get('data')
            if not pdf_data or not pdf_data.startswith(data_uri_prefix):
                logger.error("Invalid PDF data returned")
                return False

            # Save the PDF
            try:
                # Strip only the leading data-URI header, not later matches
                pdf_bytes = base64.b64decode(pdf_data[len(data_uri_prefix):])

                with open(save_path, 'wb') as f:
                    f.write(pdf_bytes)

                # Verify the file before reporting success
                if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
                    page_count = pdf_result.get('pageCount', 0)
                    logger.info(f"Successfully saved PDF with {page_count} pages to {save_path}")
                    return True

                logger.error("Generated PDF file is too small or empty")
                return False
            except Exception as e:
                logger.error(f"Error saving PDF file: {e}")
                return False

        finally:
            await browser.close()

    except Exception as e:
        logger.error(f"Error in viewonly PDF download process: {e}")
        return False
|
|
|
1165 |
|
1166 |
async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
|
1167 |
"""Download any view-only file by taking screenshots"""
|