Spaces:

euler314
/

craw_web

Running

App Files Files Community

euler314 committed on Mar 8

Commit

82c1030

verified ·

1 Parent(s): 907ffd6

Update app.py

Browse files

Files changed (1) hide show

app.py +202 -131

app.py CHANGED Viewed

@@ -920,177 +920,248 @@ class DownloadManager:
         return False
     async def download_viewonly_pdf_with_js(self, file_id, save_path):
-        """Download view-only PDF using JavaScript approach - improved version"""
         # NOTE(review): this is the removed ("-") side of the commit diff, shown for
         # history only. Flow: open the Drive viewer for file_id, scroll the viewer
         # container, build a PDF from the page's blob: images with jsPDF inside the
         # page, decode the returned data URI and write it to save_path. Every failure
         # path below delegates to self.download_viewonly_with_screenshots(..., 'pdf').
         try:
             # NOTE(review): `async with` requires an async context manager. Presumably
             # self.context.new_page() is a Playwright coroutine -- confirm; if so this
             # statement raises and the outer handler always takes the fallback.
-            async with self.context.new_page() as page:
-                # Set viewport size to ensure we capture full pages
-                await page.set_viewport_size({"width": 1200, "height": 1600})
-                # Visit the file
-                view_url = f"https://drive.google.com/file/d/{file_id}/view"
-                await page.goto(view_url, wait_until='networkidle', timeout=60000)
-                # Wait for initial rendering
-                await page.wait_for_timeout(2000)
-                # CRITICAL: Scroll through entire document to ensure all content is cached
                 # NOTE(review): the script is a function declaration followed by a
                 # top-level `return`; confirm page.evaluate accepts that form. The
                 # boolean it would return is not captured by this call.
-                await page.evaluate("""
-                    async function scrollThroughDocument() {
                         const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-                        const container = document.querySelector('.drive-viewer-paginated-scrollable');
-                        if (!container) return false;
-                        // Get total scroll height
-                        const scrollHeight = container.scrollHeight;
                         const viewportHeight = container.clientHeight;
-                        // Scroll down in increments to load all pages
-                        for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += viewportHeight) {
-                            container.scrollTo(0, scrollPos);
-                            await delay(800); // Wait for content to load
                         }
-                        // One final scroll to bottom to ensure everything is loaded
-                        container.scrollTo(0, scrollHeight);
                         await delay(1500);
                         // Scroll back to top for PDF creation
                         container.scrollTo(0, 0);
-                        await delay(800);
                         return true;
                     }
-                    return scrollThroughDocument();
                 """)
-                # Use simplified script similar to the one provided
                 # The script resolves to a data-URI string on success and to null when
                 # no page image qualifies or an error is caught inside the page.
-                pdf_base64 = await page.evaluate("""
-                    async function createPDF() {
                         try {
-                            // Create jsPDF script element
-                            const loadJsPDF = () => new Promise((resolve, reject) => {
                                 let jspdf = document.createElement("script");
-                                jspdf.onload = () => resolve();
-                                jspdf.onerror = () => reject(new Error("Failed to load jsPDF"));
-                                jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
-                                document.body.appendChild(jspdf);
-                            });
-                            await loadJsPDF();
-                            // Create PDF
-                            const { jsPDF } = window.jspdf;
-                            let pdf = new jsPDF();
-                            let elements = document.getElementsByTagName("img");
-                            let pageCount = 0;
-                            // First pass to find and sort all valid page images
-                            let pageImages = [];
-                            for (let i = 0; i < elements.length; i++) {
-                                let img = elements[i];
-                                // Only process blob images (these are the PDF pages)
-                                if (!/^blob:/.test(img.src)) continue;
-                                // Skip tiny images (usually icons, not content)
-                                if (img.width < 100 || img.height < 100) continue;
-                                pageImages.push(img);
-                            }
-                            // Sort images by their position if possible
-                            try {
-                                pageImages.sort((a, b) => {
-                                    const rectA = a.getBoundingClientRect();
-                                    const rectB = b.getBoundingClientRect();
-                                    return rectA.top - rectB.top;
-                                });
-                            } catch (e) {
-                                console.error("Error sorting images:", e);
-                            }
-                            // Process each image as a page
-                            for (let i = 0; i < pageImages.length; i++) {
-                                let img = pageImages[i];
-                                // Create canvas to draw the image
-                                let canvasElement = document.createElement('canvas');
-                                let con = canvasElement.getContext("2d");
-                                canvasElement.width = img.width;
-                                canvasElement.height = img.height;
-                                // Draw image to canvas
-                                con.drawImage(img, 0, 0, img.width, img.height);
-                                // Add image to PDF
-                                let imgData = canvasElement.toDataURL("image/jpeg", 0.95);
-                                // Add a new page for each page after the first
-                                if (pageCount > 0) {
-                                    pdf.addPage();
-                                }
-                                // Calculate dimensions to fit the page
-                                const pageWidth = pdf.internal.pageSize.getWidth();
-                                const pageHeight = pdf.internal.pageSize.getHeight();
-                                const imgRatio = img.height / img.width;
-                                let imgWidth = pageWidth;
-                                let imgHeight = imgWidth * imgRatio;
-                                if (imgHeight > pageHeight) {
-                                    imgHeight = pageHeight;
-                                    imgWidth = imgHeight / imgRatio;
-                                }
-                                // Center on page
-                                const x = (pageWidth - imgWidth) / 2;
-                                const y = (pageHeight - imgHeight) / 2;
-                                pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
-                                pageCount++;
-                            }
-                            if (pageCount === 0) {
-                                return null; // No pages found
-                            }
-                            // Return as base64
-                            return pdf.output('datauristring');
                         } catch (e) {
-                            console.error("PDF creation error:", e);
-                            return null;
                         }
                     }
-                    return createPDF();
                 """)
                 # Only a result starting with exactly 'data:application/pdf;base64,' is
                 # accepted; anything else goes to the fallback.
                 # NOTE(review): confirm the prefix jsPDF 2.5.1 emits for 'datauristring'.
-                if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
-                    logger.warning("PDF creation script failed, trying fallback method")
-                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
-                # Save the PDF from base64
                 try:
-                    base64_data = pdf_base64.replace('data:application/pdf;base64,', '')
                     pdf_bytes = base64.b64decode(base64_data)
                     with open(save_path, 'wb') as f:
                         f.write(pdf_bytes)
-                    # Verify file is not empty
                     # Files of 1000 bytes or fewer are treated as failed output.
                     if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
-                        logger.info(f"Successfully saved PDF to {save_path}")
                         return True
                     else:
-                        logger.warning(f"Generated PDF is too small, using fallback method")
-                        return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
                 except Exception as e:
-                    logger.error(f"Error saving PDF: {e}")
-                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
         except Exception as e:
-            logger.error(f"Error in view-only PDF download: {e}")
-            # Try fallback method
-            return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
     async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
         """Download any view-only file by taking screenshots"""

         return False
     async def download_viewonly_pdf_with_js(self, file_id, save_path):
         """Rebuild a view-only Google Drive PDF from the page images shown in its viewer.

         A dedicated headless Chromium instance opens the Drive viewer for
         ``file_id``, scrolls through the document so the viewer renders its
         pages as ``blob:`` images, assembles those images into a PDF with
         jsPDF inside the page, and writes the decoded result to ``save_path``.

         Args:
             file_id: Google Drive file ID of the view-only PDF.
             save_path: Destination path for the generated PDF.

         Returns:
             True if a PDF larger than 1000 bytes was written to ``save_path``,
             otherwise False. Exceptions are logged and reported as False.
         """
         view_url = f"https://drive.google.com/file/d/{file_id}/view"
         data_uri_prefix = 'data:application/pdf;base64,'

         # Both scripts are function *expressions*: page.evaluate invokes a string
         # that evaluates to a function and awaits the promise it returns. A
         # declaration followed by a top-level `return` is not a valid expression.
         scroll_script = """
             async () => {
                 const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                 // Try multiple container selectors that might exist in Google Drive
                 const container = document.querySelector('.drive-viewer-paginated-scrollable') ||
                                   document.querySelector('.drive-viewer-container');
                 if (!container) {
                     console.log('No scroll container found');
                     return false;
                 }
                 const totalHeight = container.scrollHeight;
                 const viewportHeight = container.clientHeight;
                 console.log(`Document height: ${totalHeight}px, Viewport: ${viewportHeight}px`);
                 // First jump to the bottom to trigger loading of all content
                 container.scrollTo(0, totalHeight);
                 await delay(2000);
                 // Then walk down gradually: at least 20 steps, but never more than one
                 // viewport per step, so no page is jumped over in long documents.
                 const coarseStep = totalHeight / 20;
                 const stepSize = Math.max(
                     100,
                     viewportHeight > 0 ? Math.min(viewportHeight, coarseStep) : coarseStep
                 );
                 // scrollHeight is re-read each pass because it can grow while pages load
                 for (let pos = 0; pos < container.scrollHeight; pos += stepSize) {
                     container.scrollTo(0, pos);
                     console.log(`Scrolled to ${pos}px`);
                     await delay(300);
                 }
                 // Final scroll to the very bottom
                 container.scrollTo(0, container.scrollHeight);
                 await delay(1500);
                 // Scroll back to top for PDF creation
                 container.scrollTo(0, 0);
                 await delay(1000);
                 return true;
             }
         """
         pdf_script = """
             async () => {
                 const fail = message => ({ success: false, error: message });
                 try {
                     // Load jsPDF into the page
                     await new Promise((resolve, reject) => {
                         const script = document.createElement("script");
                         script.onload = () => resolve();
                         script.onerror = () => reject(new Error("Failed to load jsPDF library"));
                         script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.3.2/jspdf.min.js';
                         document.body.appendChild(script);
                     });
                     // 1.x builds expose window.jsPDF, 2.x UMD builds window.jspdf.jsPDF
                     const JsPDF = window.jsPDF || (window.jspdf && window.jspdf.jsPDF);
                     if (!JsPDF) {
                         return fail("jsPDF loaded but exposed no constructor");
                     }

                     // Page renders are blob: images; small images are usually icons
                     const pageImages = Array.from(document.getElementsByTagName("img"))
                         .filter(img => /^blob:/.test(img.src) && img.width >= 100 && img.height >= 100)
                         .sort((a, b) => a.getBoundingClientRect().top - b.getBoundingClientRect().top);
                     console.log(`Found ${pageImages.length} valid page images`);
                     if (pageImages.length === 0) {
                         return fail("No valid PDF page images found");
                     }

                     const pdf = new JsPDF();
                     // Older jsPDF builds expose plain width/height instead of getters
                     const pageSize = pdf.internal.pageSize;
                     const pageWidth = typeof pageSize.getWidth === 'function'
                         ? pageSize.getWidth() : pageSize.width;
                     const pageHeight = typeof pageSize.getHeight === 'function'
                         ? pageSize.getHeight() : pageSize.height;

                     let pageCount = 0;
                     for (const img of pageImages) {
                         try {
                             // Use the intrinsic pixel size so the high-DPI render is kept
                             const pixelWidth = img.naturalWidth || img.width;
                             const pixelHeight = img.naturalHeight || img.height;
                             const canvas = document.createElement('canvas');
                             canvas.width = pixelWidth;
                             canvas.height = pixelHeight;
                             canvas.getContext('2d').drawImage(img, 0, 0, pixelWidth, pixelHeight);
                             const imgData = canvas.toDataURL("image/jpeg", 1.0);

                             // Fit inside the page preserving aspect ratio, centred
                             let drawWidth = pageWidth;
                             let drawHeight = drawWidth * pixelHeight / pixelWidth;
                             if (drawHeight > pageHeight) {
                                 drawHeight = pageHeight;
                                 drawWidth = drawHeight * pixelWidth / pixelHeight;
                             }
                             // Add a new page for each page after the first
                             if (pageCount > 0) {
                                 pdf.addPage();
                             }
                             pdf.addImage(
                                 imgData, 'JPEG',
                                 (pageWidth - drawWidth) / 2, (pageHeight - drawHeight) / 2,
                                 drawWidth, drawHeight
                             );
                             pageCount++;
                         } catch (e) {
                             console.error("Error processing image:", e);
                         }
                     }
                     if (pageCount === 0) {
                         return fail("Failed to add any pages to PDF");
                     }
                     return {
                         success: true,
                         data: pdf.output('datauristring'),
                         pageCount: pageCount
                     };
                 } catch (e) {
                     console.error("Error in PDF creation:", e);
                     return fail("Error creating PDF: " + (e && e.message ? e.message : e));
                 }
             }
         """

         try:
             # Create a fresh browser for this download
             browser = await self.playwright.chromium.launch(
                 headless=True,
                 args=[
                     '--no-sandbox',
                     '--disable-setuid-sandbox',
                     '--disable-dev-shm-usage',
                     '--disable-web-security'
                 ]
             )
             # Everything after launch sits inside try/finally so the browser is
             # closed even when context or page creation fails.
             try:
                 # Use high DPI for better quality
                 context = await browser.new_context(
                     viewport={'width': 1600, 'height': 1200},
                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                     device_scale_factor=2.0
                 )
                 # new_context() has no timeout option; the longer default timeout
                 # is configured on the context instead.
                 context.set_default_timeout(120000)
                 page = await context.new_page()

                 logger.info(f"Opening view-only PDF: {view_url}")
                 # Step 1: Navigate to the PDF and wait for it to load fully
                 await page.goto(view_url, timeout=60000)
                 await page.wait_for_load_state('networkidle')
                 await page.wait_for_timeout(3000)  # Additional wait for JavaScript to initialize

                 # Check if we have a PDF viewer
                 viewer_loaded = await page.query_selector('.drive-viewer-paginated-scrollable, .drive-viewer-paginated-page')
                 if not viewer_loaded:
                     # Continue anyway, as it might just be a different CSS class
                     logger.warning("PDF viewer not detected. This might not be a PDF or might be using a different viewer.")

                 # Step 2: Scroll through the entire document so all pages are rendered
                 logger.info("Scrolling through document to load all pages into cache...")
                 scroll_success = await page.evaluate(scroll_script)
                 if not scroll_success:
                     logger.warning("Scrolling may not have completed successfully. Will try to download anyway.")

                 # Step 3: Wait to ensure all content is properly loaded after scrolling
                 await page.wait_for_timeout(2000)

                 # Step 4: Build the PDF inside the page with jsPDF
                 logger.info("Executing jsPDF script to create and download PDF...")
                 pdf_result = await page.evaluate(pdf_script)

                 # Step 5: Process the result
                 if not isinstance(pdf_result, dict) or not pdf_result.get('success'):
                     error_msg = pdf_result.get('error') if isinstance(pdf_result, dict) else None
                     logger.error(f"Failed to create PDF: {error_msg or 'Unknown error'}")
                     return False

                 # Extract base64 data
                 pdf_data = pdf_result.get('data')
                 if not isinstance(pdf_data, str) or not pdf_data.startswith(data_uri_prefix):
                     logger.error("Invalid PDF data returned")
                     return False

                 # Save the PDF
                 try:
                     pdf_bytes = base64.b64decode(pdf_data[len(data_uri_prefix):])
                     with open(save_path, 'wb') as f:
                         f.write(pdf_bytes)
                     # Verify the file before reporting success
                     if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
                         page_count = pdf_result.get('pageCount', 0)
                         logger.info(f"Successfully saved PDF with {page_count} pages to {save_path}")
                         return True
                     logger.error("Generated PDF file is too small or empty")
                     return False
                 except Exception as e:
                     logger.error(f"Error saving PDF file: {e}")
                     return False
             finally:
                 await browser.close()
         except Exception as e:
             logger.error(f"Error in viewonly PDF download process: {e}")
             return False
     async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
         """Download any view-only file by taking screenshots"""