Spaces:

euler314
/

craw_web

Sleeping

App Files Files Community

euler314 committed on Mar 6

Commit

ed38edb

verified ·

1 Parent(s): b9e60db

Update app.py

Browse files

Files changed (1) hide show

app.py +249 -78

app.py CHANGED Viewed

@@ -352,29 +352,26 @@ class DownloadManager:
                             file_id = match.group(1)
                             break
                     if file_id:
-                        # We'll detect file type during download, so just use the ID for filename initially
                         filename = f"gdrive_{file_id}"
-                        try:
-                            # Get file info to determine type and size
-                            file_type, is_view_only = await self.get_google_drive_file_info(file_id)
-                            if file_type:
-                                filename = f"{filename}.{file_type}"
-                            found_files.append({
-                                'url': href,  # Use original URL, as we'll process it specially
-                                'filename': filename,
-                                'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"),
-                                'metadata': {'view_only': is_view_only, 'file_type': file_type, 'file_id': file_id}
-                            })
-                        except Exception as e:
-                            logger.error(f"Error processing Google Drive link: {e}")
-                            # Fallback if we can't get info
-                            found_files.append({
-                                'url': href,
-                                'filename': filename,
-                                'size': "Unknown Size",
-                                'metadata': {'file_id': file_id}
-                            })
             seen_urls = set()
             unique_files = []
@@ -397,13 +394,33 @@ class DownloadManager:
             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
             counter += 1
         os.makedirs(save_dir, exist_ok=True)
         try:
             # Special handling for Google Drive files
             if "drive.google.com" in file_url or "docs.google.com" in file_url:
-                # Use enhanced Google Drive downloader
                 success = await self.download_from_google_drive(file_url, path)
-                return path if success else None
             # Original code for non-Google Drive downloads
             async with self.context.new_page() as page:
                 headers = {
@@ -424,6 +441,213 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
     async def download_from_google_drive(self, url, save_path):
         """Enhanced method to download from Google Drive with multiple fallback approaches"""
         # Extract the file ID from different URL formats
@@ -531,60 +755,7 @@ class DownloadManager:
         except Exception as e:
             logger.warning(f"Requests session download failed: {e}")
-        # If all methods failed for view-only file, try one last approach
-        if is_view_only:
-            try:
-                # Try a direct headless browser download
-                async with self.context.new_page() as page:
-                    await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
-                    # Try to capture the content directly from viewer
-                    file_content = await page.evaluate("""
-                        () => {
-                            // Try to find the actual viewer content
-                            const viewerContent = document.querySelector('.drive-viewer-paginated-content');
-                            if (viewerContent) {
-                                return viewerContent.innerHTML;
-                            }
-                            return document.documentElement.innerHTML;
-                        }
-                    """)
-                    if file_content:
-                        # Save as HTML and then we can convert it if needed
-                        html_path = f"{base}.html"
-                        with open(html_path, 'w', encoding='utf-8') as f:
-                            f.write(f"""
-                            <!DOCTYPE html>
-                            <html>
-                            <head><title>Google Drive Extracted Content</title></head>
-                            <body>
-                            {file_content}
-                            </body>
-                            </html>
-                            """)
-                        # If requested a PDF, convert HTML to PDF
-                        if file_type == 'pdf' or ext.lower() == '.pdf':
-                            try:
-                                import pdfkit
-                                pdfkit.from_file(html_path, save_path)
-                                os.remove(html_path)  # Clean up HTML file
-                                return True
-                            except Exception as pdf_err:
-                                logger.warning(f"Error converting HTML to PDF: {pdf_err}")
-                                # Keep the HTML file as fallback
-                                shutil.copy(html_path, save_path)
-                                return True
-                        else:
-                            # Just use the HTML file
-                            shutil.copy(html_path, save_path)
-                            return True
-            except Exception as e:
-                logger.warning(f"Final direct browser capture failed: {e}")
-        # All methods failed
-        logger.error(f"All download approaches failed for Google Drive file: {file_id}")
         return False
     async def get_google_drive_file_info(self, file_id):

                             file_id = match.group(1)
                             break
                     if file_id:
+                        # Get file info to determine type and view-only status
+                        file_type, is_view_only = await self.get_google_drive_file_info(file_id)
+                        # Create a more informative filename based on info
                         filename = f"gdrive_{file_id}"
+                        if file_type:
+                            filename = f"{filename}.{file_type}"
+                        size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
+                        found_files.append({
+                            'url': href,  # Use original URL
+                            'filename': filename,
+                            'size': size_str,
+                            'metadata': {
+                                'view_only': is_view_only,
+                                'file_type': file_type,
+                                'file_id': file_id
+                            }
+                        })
             seen_urls = set()
             unique_files = []
             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
             counter += 1
         os.makedirs(save_dir, exist_ok=True)
         try:
             # Special handling for Google Drive files
             if "drive.google.com" in file_url or "docs.google.com" in file_url:
+                # Check if it's marked as view-only in metadata
+                is_view_only = file_info.get('metadata', {}).get('view_only', False)
+                # For view-only files, try our most robust approach first
+                if is_view_only:
+                    logger.info(f"Attempting to download view-only file: {file_url}")
+                    result_path = await self.force_download_viewonly(file_info, path)
+                    if result_path:
+                        return result_path
+                    # If that failed, try the regular download approach
+                    logger.info("Primary method failed, trying fallback methods")
+                # Try regular download methods
                 success = await self.download_from_google_drive(file_url, path)
+                if success:
+                    return path
+                # If all methods failed for Google Drive, try one last approach
+                logger.warning("All standard methods failed, attempting force download")
+                result_path = await self.force_download_viewonly(file_info, path)
+                return result_path if result_path else None
             # Original code for non-Google Drive downloads
             async with self.context.new_page() as page:
                 headers = {
             logger.error(f"Error downloading {file_url}: {e}")
             return None
+    async def force_download_viewonly(self, file_info, save_path):
+        """Last-resort method to download view-only Google Drive files"""
+        try:
+            # Extract file ID from URL
+            file_id = None
+            url = file_info['url']
+            for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                match = re.search(pattern, url)
+                if match:
+                    file_id = match.group(1)
+                    break
+            if not file_id:
+                logger.error("Could not extract file ID")
+                return None
+            logger.info(f"Force downloading view-only file with ID: {file_id}")
+            # Make sure we have the proper file extension
+            base, ext = os.path.splitext(save_path)
+            if not ext:
+                # Determine file type from metadata or set default to PDF
+                file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
+                save_path = f"{base}.{file_type}"
+            # Launch a new browser context with higher resolution
+            browser = await self.playwright.chromium.launch(
+                headless=True,
+                args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
+            )
+            context = await browser.new_context(
+                viewport={'width': 1600, 'height': 1200},
+                user_agent=get_random_user_agent(),
+                device_scale_factor=2.0  # Higher resolution for better quality
+            )
+            page = await context.new_page()
+            # Navigate to the file
+            try:
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
+                               wait_until='networkidle',
+                               timeout=60000)
+                # Wait for content to load fully
+                await page.wait_for_timeout(5000)
+                # Check if it's a PDF
+                is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
+                if is_pdf:
+                    # For PDFs: Screenshot each page approach
+                    logger.info("Detected PDF, using page-by-page screenshot approach")
+                    # Scroll through document to ensure all pages are loaded
+                    await page.evaluate("""
+                        async function scrollDocument() {
+                            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                            const container = document.querySelector('.drive-viewer-paginated-scrollable');
+                            if (!container) return;
+                            // First scroll to bottom to load all pages
+                            container.scrollTo(0, container.scrollHeight);
+                            await delay(2000);
+                            // Then back to top
+                            container.scrollTo(0, 0);
+                            await delay(1000);
+                        }
+                        return scrollDocument();
+                    """)
+                    # Count pages
+                    page_count = await page.evaluate("""
+                        () => {
+                            const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                            return pages.length;
+                        }
+                    """)
+                    if page_count == 0:
+                        logger.warning("No pages found, trying alternative method")
+                        # Take a screenshot of the entire page
+                        temp_dir = tempfile.mkdtemp()
+                        screenshot_path = os.path.join(temp_dir, "page.png")
+                        await page.screenshot(path=screenshot_path, full_page=True)
+                        # Convert screenshot to PDF
+                        from PIL import Image
+                        from reportlab.pdfgen import canvas as pdf_canvas
+                        img = Image.open(screenshot_path)
+                        width, height = img.size
+                        c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
+                        c.drawImage(screenshot_path, 0, 0, width, height)
+                        c.save()
+                        # Clean up
+                        os.remove(screenshot_path)
+                        os.rmdir(temp_dir)
+                    else:
+                        # Create temp directory for page screenshots
+                        temp_dir = tempfile.mkdtemp()
+                        screenshots = []
+                        # Take screenshot of each page
+                        for i in range(page_count):
+                            # Scroll to page
+                            await page.evaluate(f"""
+                                async () => {{
+                                    const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                                    if (pages.length <= {i}) return;
+                                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                                    pages[{i}].scrollIntoView();
+                                    await delay(500);
+                                }}
+                            """)
+                            # Take screenshot
+                            screenshot_path = os.path.join(temp_dir, f"page_{i+1}.png")
+                            # Position page for best screenshot
+                            await page.evaluate(f"""
+                                () => {{
+                                    const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                                    const page = pages[{i}];
+                                    const viewer = document.querySelector('.drive-viewer-paginated-scrollable');
+                                    if (page && viewer) {{
+                                        // Center the page in the viewport
+                                        const rect = page.getBoundingClientRect();
+                                        viewer.scrollBy(0, rect.top - 100);
+                                    }}
+                                }}
+                            """)
+                            await page.screenshot(path=screenshot_path)
+                            screenshots.append(screenshot_path)
+                        # Combine screenshots into PDF
+                        from reportlab.lib.pagesizes import letter
+                        from reportlab.pdfgen import canvas as pdf_canvas
+                        from PIL import Image
+                        # Use the first image dimensions to determine page size
+                        img = Image.open(screenshots[0])
+                        img_width, img_height = img.size
+                        c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
+                        for screenshot in screenshots:
+                            img = Image.open(screenshot)
+                            c.drawImage(screenshot, 0, 0, img_width, img_height)
+                            c.showPage()
+                        c.save()
+                        # Clean up
+                        for screenshot in screenshots:
+                            os.remove(screenshot)
+                        os.rmdir(temp_dir)
+                else:
+                    # For other file types: Take a single screenshot
+                    temp_dir = tempfile.mkdtemp()
+                    screenshot_path = os.path.join(temp_dir, "screenshot.png")
+                    await page.screenshot(path=screenshot_path, full_page=True)
+                    # Determine final file type
+                    base, ext = os.path.splitext(save_path)
+                    if ext.lower() == '.pdf':
+                        # Convert to PDF
+                        from PIL import Image
+                        from reportlab.pdfgen import canvas as pdf_canvas
+                        img = Image.open(screenshot_path)
+                        width, height = img.size
+                        c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
+                        c.drawImage(screenshot_path, 0, 0, width, height)
+                        c.save()
+                    else:
+                        # Just copy the screenshot with the appropriate extension
+                        shutil.copy(screenshot_path, save_path)
+                    # Clean up
+                    os.remove(screenshot_path)
+                    os.rmdir(temp_dir)
+                # Close browser
+                await browser.close()
+                # Verify file exists and is not empty
+                if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                    logger.info(f"Successfully downloaded view-only file to {save_path}")
+                    return save_path
+                else:
+                    logger.error(f"Failed to create valid file at {save_path}")
+                    return None
+            except Exception as e:
+                logger.error(f"Error during force download: {e}")
+                if browser:
+                    await browser.close()
+                return None
+        except Exception as e:
+            logger.error(f"Force download failed: {e}")
+            return None
     async def download_from_google_drive(self, url, save_path):
         """Enhanced method to download from Google Drive with multiple fallback approaches"""
         # Extract the file ID from different URL formats
         except Exception as e:
             logger.warning(f"Requests session download failed: {e}")
+        logger.warning("Standard download methods failed")
         return False
     async def get_google_drive_file_info(self, file_id):