euler314 committed
Commit b9e60db · verified · 1 Parent(s): 0f88c1d

Update app.py

Files changed (1)
  1. app.py +644 -212
app.py CHANGED
@@ -38,6 +38,10 @@ from reportlab.pdfgen import canvas
38
  from sklearn.cluster import KMeans
39
  import numpy as np
40
  import base64
41
  # -------------------- Logging Setup --------------------
42
  logging.basicConfig(
43
  filename='advanced_download_log.txt',
@@ -348,23 +352,29 @@ class DownloadManager:
348
  file_id = match.group(1)
349
  break
350
  if file_id:
351
- direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
352
- filename = file_id
353
  try:
354
- response = await self.page.request.head(direct_url, timeout=15000)
355
- cd = response.headers.get("Content-Disposition", "")
356
- if cd:
357
- mt = re.search(r'filename\*?="?([^";]+)', cd)
358
- if mt:
359
- filename = mt.group(1).strip('"').strip()
360
  found_files.append({
361
- 'url': direct_url,
362
  'filename': filename,
363
- 'size': await self.get_file_size(direct_url),
364
- 'metadata': {}
365
  })
366
  except Exception as e:
367
  logger.error(f"Error processing Google Drive link: {e}")
368
 
369
  seen_urls = set()
370
  unique_files = []
@@ -388,6 +398,7 @@ class DownloadManager:
388
  counter += 1
389
  os.makedirs(save_dir, exist_ok=True)
390
  try:
 
391
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
392
  # Use enhanced Google Drive downloader
393
  success = await self.download_from_google_drive(file_url, path)
@@ -435,251 +446,672 @@ class DownloadManager:
435
  logger.error(f"Could not extract file ID from URL: {url}")
436
  return False
437
 
438
- # Approach 1: Try with gdown first (when it works)
439
- try:
440
- import gdown
441
- output = gdown.download(url, save_path, quiet=False, fuzzy=True)
442
- if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
443
- logger.info(f"Successfully downloaded with gdown: {url}")
444
- return True
445
- except Exception as e:
446
- logger.warning(f"gdown download failed: {e}")
447
 
448
- # Approach 2: Use Playwright session with cookies
449
- try:
450
- async with self.context.new_page() as page:
451
- # Visit the file viewing page to get cookies
452
- view_url = f"https://drive.google.com/file/d/{file_id}/view"
453
- await page.goto(view_url, wait_until='networkidle', timeout=60000)
454
-
455
- # Check for view-only permissions
456
- if await page.query_selector('text="the owner has not granted you permission to download this file"'):
457
- logger.warning("File has view-only permissions, attempting workaround")
458
-
459
- # Check if it's a PDF (we can use the JS method)
460
- is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
461
- if is_pdf:
462
- # Try JavaScript PDF capture approach for PDFs
463
- success = await self.download_viewonly_pdf_with_js(page, save_path)
464
- if success:
465
- return True
466
-
467
- # Try direct download attempt for view-only files
468
- cookies = await page.context.cookies()
469
- cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
470
-
471
- # Try download URL with custom headers and cookies
472
- download_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
473
- await page.goto(download_url, wait_until='networkidle', timeout=60000)
474
-
475
- headers = {
476
- 'User-Agent': get_random_user_agent(),
477
- 'Cookie': cookie_str,
478
- 'Accept': '*/*',
479
- }
480
-
481
- response = await page.request.get(download_url, headers=headers)
482
- if response.status == 200:
483
- content = await response.body()
484
- with open(save_path, 'wb') as f:
485
- f.write(content)
486
- return True
487
-
488
- # Standard download flow for files with download permission
489
- download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
490
- await page.goto(download_url, wait_until='networkidle', timeout=60000)
491
-
492
- # Handle large files with confirmation
493
- confirm_form = await page.query_selector('form#download-form')
494
- if confirm_form:
495
- await confirm_form.evaluate('form => form.submit()')
496
- await page.wait_for_load_state('networkidle')
497
-
498
- # Get cookies after confirmation
499
- cookies = await page.context.cookies()
500
- cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
501
 
502
- # Get final download URL with confirmation token
503
- download_url = f"https://drive.google.com/uc?export=download&id={file_id}&confirm=t"
504
 
505
- response = await page.request.get(download_url, headers={'Cookie': cookie_str})
506
- if response.status == 200:
507
- content = await response.body()
508
- with open(save_path, 'wb') as f:
509
- f.write(content)
510
  return True
511
  except Exception as e:
512
- logger.warning(f"Playwright download approach failed: {e}")
513
 
514
- # Approach 3: Try with requests and session cookies
515
  try:
516
- import requests
517
-
518
  session = requests.Session()
519
  session.headers.update({'User-Agent': get_random_user_agent()})
520
 
521
- # Get the initial page to obtain cookies
522
  url = f"https://drive.google.com/uc?id={file_id}&export=download"
523
  response = session.get(url, stream=True, timeout=30)
524
 
525
- # Check for the download confirmation
526
  confirmation_token = None
527
  for k, v in response.cookies.items():
528
  if k.startswith('download_warning'):
529
  confirmation_token = v
530
  break
531
 
532
- # Use the confirmation token if found
533
  if confirmation_token:
534
- url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm={confirmation_token}"
535
-
536
- # Download the file
537
- response = session.get(url, stream=True, timeout=60)
538
- with open(save_path, 'wb') as f:
539
- for chunk in response.iter_content(chunk_size=1024*1024):
540
- if chunk:
541
- f.write(chunk)
542
 
543
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
544
- return True
545
  except Exception as e:
546
  logger.warning(f"Requests session download failed: {e}")
547
 
548
- # All approaches failed
549
- logger.error(f"All download attempts failed for: {url}")
550
  return False
551
 
552
- async def download_viewonly_pdf_with_js(self, page, save_path):
553
- """Use JavaScript approach to download view-only PDFs from Google Drive"""
554
  try:
555
- logger.info("Attempting to download view-only PDF using JavaScript method")
556
-
557
- # Scroll to ensure all pages are loaded
558
- await page.evaluate("""
559
- async function scrollToBottom() {
560
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
561
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
562
- if (!container) return;
563
-
564
- const scrollHeight = container.scrollHeight;
565
- const viewportHeight = container.clientHeight;
566
- const scrollStep = viewportHeight / 2;
567
-
568
- for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
569
- container.scrollTo(0, scrollPos);
570
  await delay(500);
571
  }
572
-
573
- // Final scroll to ensure we reached the bottom
574
- container.scrollTo(0, scrollHeight);
575
- await delay(1000);
576
- }
577
 
578
- return scrollToBottom();
579
- """)
580
-
581
- # Wait for a moment to ensure all images are loaded
582
- await page.wait_for_timeout(3000)
583
-
584
- # Inject the jsPDF library
585
- await page.evaluate("""
586
- return new Promise((resolve, reject) => {
587
- const script = document.createElement('script');
588
- script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
589
- script.onload = () => resolve(true);
590
- script.onerror = () => reject(new Error('Failed to load jsPDF'));
591
- document.head.appendChild(script);
592
- });
593
- """)
594
-
595
- # Wait for the library to load
596
- await page.wait_for_timeout(1000)
597
-
598
- # Execute the PDF creation script
599
- pdf_data = await page.evaluate("""
600
- return new Promise(async (resolve) => {
601
- // Make sure jsPDF is loaded
602
- if (typeof window.jspdf === 'undefined') {
603
- window.jspdf = window.jspdf || {};
604
  }
605
 
606
- // Use the jsPDF library
607
- const { jsPDF } = window.jspdf;
608
- const pdf = new jsPDF();
609
-
610
- const images = Array.from(document.querySelectorAll('img')).filter(img =>
611
- img.src.startsWith('blob:') && img.width > 100 && img.height > 100
612
- );
613
 
614
- if (images.length === 0) {
615
- resolve(null);
616
- return;
617
  }
618
-
619
- for (let i = 0; i < images.length; i++) {
620
- const img = images[i];
621
-
622
- // Create canvas and draw image
623
- const canvas = document.createElement('canvas');
624
- canvas.width = img.width;
625
- canvas.height = img.height;
626
- const ctx = canvas.getContext('2d');
627
- ctx.drawImage(img, 0, 0, img.width, img.height);
628
-
629
- // Add image to PDF
630
- const imgData = canvas.toDataURL('image/jpeg', 1.0);
631
-
632
- // Add a new page for each image except the first one
633
- if (i > 0) {
634
- pdf.addPage();
635
  }
636
 
637
- // Calculate dimensions to fit page
638
- const pageWidth = pdf.internal.pageSize.getWidth();
639
- const pageHeight = pdf.internal.pageSize.getHeight();
640
- const imgRatio = img.height / img.width;
641
-
642
- let imgWidth = pageWidth;
643
- let imgHeight = imgWidth * imgRatio;
644
 
645
- // If height exceeds page, scale down
646
- if (imgHeight > pageHeight) {
647
- imgHeight = pageHeight;
648
- imgWidth = imgHeight / imgRatio;
649
- }
650
 
651
- // Center image on page
652
- const x = (pageWidth - imgWidth) / 2;
653
- const y = (pageHeight - imgHeight) / 2;
654
 
655
- pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
656
- }
657
 
658
- // Get the PDF as base64
659
- const pdfBase64 = pdf.output('datauristring');
660
- resolve(pdfBase64);
661
- });
662
- """)
663
-
664
- if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
665
- logger.warning("Failed to generate PDF with JavaScript method")
666
- return False
667
 
668
- # Extract the base64 data and save to file
669
- base64_data = pdf_data.replace('data:application/pdf;base64,', '')
670
- pdf_bytes = base64.b64decode(base64_data)
671
 
672
- with open(save_path, 'wb') as f:
673
- f.write(pdf_bytes)
674
 
675
- if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
676
- logger.info("Successfully downloaded view-only PDF using JavaScript method")
677
- return True
678
- else:
679
- return False
680
-
681
  except Exception as e:
682
- logger.error(f"Error in JavaScript PDF download method: {e}")
683
  return False
684
 
685
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
 
38
  from sklearn.cluster import KMeans
39
  import numpy as np
40
  import base64
41
+ import shutil
42
+ from PIL import Image # Make sure to pip install Pillow
43
+ from reportlab.pdfgen import canvas
44
+
45
  # -------------------- Logging Setup --------------------
46
  logging.basicConfig(
47
  filename='advanced_download_log.txt',
 
352
  file_id = match.group(1)
353
  break
354
  if file_id:
355
+ # We'll detect file type during download, so just use the ID for filename initially
356
+ filename = f"gdrive_{file_id}"
357
  try:
358
+ # Get file info to determine type and size
359
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
360
+ if file_type:
361
+ filename = f"{filename}.{file_type}"
362
+
 
363
  found_files.append({
364
+ 'url': href, # Use original URL, as we'll process it specially
365
  'filename': filename,
366
+ 'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"),
367
+ 'metadata': {'view_only': is_view_only, 'file_type': file_type, 'file_id': file_id}
368
  })
369
  except Exception as e:
370
  logger.error(f"Error processing Google Drive link: {e}")
371
+ # Fallback if we can't get info
372
+ found_files.append({
373
+ 'url': href,
374
+ 'filename': filename,
375
+ 'size': "Unknown Size",
376
+ 'metadata': {'file_id': file_id}
377
+ })
378
 
379
  seen_urls = set()
380
  unique_files = []
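For reference, everything in the block above keys off the Drive file ID pulled out of the scraped link. A minimal standalone sketch of that extraction step; the patterns below are common Drive URL shapes and are not necessarily the exact regexes app.py uses:

    import re

    def extract_drive_file_id(url):
        # Illustrative only: match the ID segment in the usual Drive URL shapes.
        patterns = [
            r"/file/d/([a-zA-Z0-9_-]+)",   # https://drive.google.com/file/d/<id>/view
            r"[?&]id=([a-zA-Z0-9_-]+)",    # https://drive.google.com/uc?id=<id>&export=download
            r"/folders/([a-zA-Z0-9_-]+)",  # shared folder links
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    # extract_drive_file_id("https://drive.google.com/file/d/abc123/view") -> "abc123"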
 
398
  counter += 1
399
  os.makedirs(save_dir, exist_ok=True)
400
  try:
401
+ # Special handling for Google Drive files
402
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
403
  # Use enhanced Google Drive downloader
404
  success = await self.download_from_google_drive(file_url, path)
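As a side note, the routing above boils down to a host substring check; a trivial standalone version (the function name is illustrative, not from app.py):

    def is_google_drive_url(url):
        # Same substring test as the branch above; no URL parsing needed here.
        return "drive.google.com" in url or "docs.google.com" in url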
 
446
  logger.error(f"Could not extract file ID from URL: {url}")
447
  return False
448
 
449
+ # Determine file type first (important for handling different file types)
450
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
451
+ logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
452
 
453
+ base, ext = os.path.splitext(save_path)
454
+ if not ext and file_type:
455
+ # Add the correct extension if missing
456
+ save_path = f"{base}.{file_type}"
457
+
458
+ # For view-only files, use specialized approaches
459
+ if is_view_only:
460
+ # Approach 1: For PDFs, use the JS method
461
+ if file_type == 'pdf':
462
+ success = await self.download_viewonly_pdf_with_js(file_id, save_path)
463
+ if success:
464
+ return True
465
 
466
+ # Approach 2: For Google Docs, Sheets, etc., use export API
467
+ if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
468
+ success = await self.export_google_doc(file_id, file_type, save_path)
469
+ if success:
470
+ return True
471
 
472
+ # Approach 3: Try the direct screenshot method for any view-only file
473
+ success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
474
+ if success:
475
+ return True
476
+
477
+ # Try standard approaches for non-view-only files
478
+ try:
479
+ # Try with gdown first
480
+ import gdown
481
+ output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
482
+ if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
483
+ with open(save_path, 'rb') as f:
484
+ content = f.read(100) # Read first 100 bytes
485
+ if b'<!DOCTYPE html>' not in content: # Check not HTML error page
486
+ logger.info(f"Successfully downloaded with gdown: {url}")
487
  return True
488
  except Exception as e:
489
+ logger.warning(f"gdown download failed: {e}")
490
 
491
+ # Try with requests and session cookies
492
  try:
493
  session = requests.Session()
494
  session.headers.update({'User-Agent': get_random_user_agent()})
495
 
496
+ # Visit the page first to get cookies
497
+ session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
498
+
499
+ # Try download
500
  url = f"https://drive.google.com/uc?id={file_id}&export=download"
501
  response = session.get(url, stream=True, timeout=30)
502
 
503
+ # Check for confirmation token
504
  confirmation_token = None
505
  for k, v in response.cookies.items():
506
  if k.startswith('download_warning'):
507
  confirmation_token = v
508
  break
509
 
510
+ # Use confirmation token if found
511
  if confirmation_token:
512
+ url = f"{url}&confirm={confirmation_token}"
513
+ response = session.get(url, stream=True, timeout=60)
514
 
515
+ # Check if we're getting HTML instead of the file
516
+ content_type = response.headers.get('Content-Type', '')
517
+ if 'text/html' in content_type:
518
+ logger.warning("Received HTML instead of file - likely download restriction")
519
+ else:
520
+ with open(save_path, 'wb') as f:
521
+ for chunk in response.iter_content(chunk_size=1024*1024):
522
+ if chunk:
523
+ f.write(chunk)
524
+
525
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
526
+ with open(save_path, 'rb') as f:
527
+ content = f.read(100)
528
+ if b'<!DOCTYPE html>' not in content:
529
+ logger.info("Successfully downloaded with requests session")
530
+ return True
531
  except Exception as e:
532
  logger.warning(f"Requests session download failed: {e}")
533
 
534
+ # If all methods failed for view-only file, try one last approach
535
+ if is_view_only:
536
+ try:
537
+ # Try a direct headless browser download
538
+ async with self.context.new_page() as page:
539
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
540
+
541
+ # Try to capture the content directly from viewer
542
+ file_content = await page.evaluate("""
543
+ () => {
544
+ // Try to find the actual viewer content
545
+ const viewerContent = document.querySelector('.drive-viewer-paginated-content');
546
+ if (viewerContent) {
547
+ return viewerContent.innerHTML;
548
+ }
549
+ return document.documentElement.innerHTML;
550
+ }
551
+ """)
552
+
553
+ if file_content:
554
+ # Save as HTML and then we can convert it if needed
555
+ html_path = f"{base}.html"
556
+ with open(html_path, 'w', encoding='utf-8') as f:
557
+ f.write(f"""
558
+ <!DOCTYPE html>
559
+ <html>
560
+ <head><title>Google Drive Extracted Content</title></head>
561
+ <body>
562
+ {file_content}
563
+ </body>
564
+ </html>
565
+ """)
566
+
567
+ # If requested a PDF, convert HTML to PDF
568
+ if file_type == 'pdf' or ext.lower() == '.pdf':
569
+ try:
570
+ import pdfkit
571
+ pdfkit.from_file(html_path, save_path)
572
+ os.remove(html_path) # Clean up HTML file
573
+ return True
574
+ except Exception as pdf_err:
575
+ logger.warning(f"Error converting HTML to PDF: {pdf_err}")
576
+ # Keep the HTML file as fallback
577
+ shutil.copy(html_path, save_path)
578
+ return True
579
+ else:
580
+ # Just use the HTML file
581
+ shutil.copy(html_path, save_path)
582
+ return True
583
+ except Exception as e:
584
+ logger.warning(f"Final direct browser capture failed: {e}")
585
+
586
+ # All methods failed
587
+ logger.error(f"All download approaches failed for Google Drive file: {file_id}")
588
  return False
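The method above is essentially a fallback chain: gdown, then a requests session, then a headless-browser capture. A generic sketch of that pattern, with hypothetical strategy names in the usage comment:

    import logging

    async def try_strategies_in_order(strategies, *args):
        # Run each downloader until one reports success; log and move on when
        # an individual strategy raises.
        for strategy in strategies:
            try:
                if await strategy(*args):
                    return True
            except Exception as exc:
                logging.getLogger(__name__).warning(
                    "%s failed: %s", getattr(strategy, "__name__", strategy), exc)
        return False

    # Hypothetical wiring, mirroring the order used above:
    # ok = await try_strategies_in_order(
    #     [download_with_gdown, download_with_requests, download_with_browser],
    #     file_id, save_path,
    # )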
589
 
590
+ async def get_google_drive_file_info(self, file_id):
591
+ """Get file type and view-only status from Google Drive"""
592
+ file_type = None
593
+ is_view_only = False
594
+
595
  try:
596
+ async with self.context.new_page() as page:
597
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
598
+
599
+ # Check if view-only
600
+ view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
601
+ is_view_only = view_only_text is not None
602
+
603
+ # Check for Google Docs viewer
604
+ gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
605
+ gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
606
+ gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
607
+
608
+ if gdocs_viewer:
609
+ file_type = 'docx'
610
+ elif gsheets_viewer:
611
+ file_type = 'xlsx'
612
+ elif gslides_viewer:
613
+ file_type = 'pptx'
614
+ else:
615
+ # Check for PDF viewer
616
+ pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
617
+ if pdf_viewer:
618
+ file_type = 'pdf'
619
+ else:
620
+ # Check for image viewer
621
+ img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
622
+ if img_viewer:
623
+ # Get image type from src
624
+ img_src = await img_viewer.get_attribute('src')
625
+ if 'jpg' in img_src or 'jpeg' in img_src:
626
+ file_type = 'jpg'
627
+ elif 'png' in img_src:
628
+ file_type = 'png'
629
+ else:
630
+ file_type = 'jpg' # Default to jpg
631
+ else:
632
+ # Generic file type fallback
633
+ file_type = 'pdf' # Default to PDF
634
+
635
+ # If still no type, check filename
636
+ if not file_type:
637
+ title_element = await page.query_selector('div[role="heading"]')
638
+ if title_element:
639
+ title = await title_element.text_content()
640
+ if title:
641
+ ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
642
+ if ext_match:
643
+ file_type = ext_match.group(1).lower()
644
+
645
+ except Exception as e:
646
+ logger.error(f"Error getting Google Drive file info: {e}")
647
+ file_type = 'pdf' # Default to PDF if we can't determine
648
+
649
+ return file_type, is_view_only
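One way to read the tuple returned here is as a dispatch key for the downloaders added in this commit. The sketch below shows that idea only; it is not how app.py itself wires the calls:

    async def pick_download_strategy(manager, file_id, file_type, is_view_only, save_path):
        # Illustrative routing on (file_type, is_view_only); "manager" stands in
        # for the DownloadManager instance.
        if is_view_only and file_type == 'pdf':
            return await manager.download_viewonly_pdf_with_js(file_id, save_path)
        if is_view_only and file_type in ('docx', 'xlsx', 'pptx'):
            return await manager.export_google_doc(file_id, file_type, save_path)
        if is_view_only:
            return await manager.download_viewonly_with_screenshots(file_id, save_path, file_type)
        return await manager.download_from_google_drive(
            f"https://drive.google.com/uc?id={file_id}&export=download", save_path)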
650
+
651
+ async def download_viewonly_pdf_with_js(self, file_id, save_path):
652
+ """Download view-only PDF using JavaScript approach - improved version"""
653
+ try:
654
+ async with self.context.new_page() as page:
655
+ # Set viewport size to ensure we capture full pages
656
+ await page.set_viewport_size({"width": 1200, "height": 1600})
657
+
658
+ # Visit the file
659
+ view_url = f"https://drive.google.com/file/d/{file_id}/view"
660
+ await page.goto(view_url, wait_until='networkidle', timeout=60000)
661
+
662
+ # Wait for rendering
663
+ await page.wait_for_timeout(2000)
664
+
665
+ # Inject required libraries - use CDN for jsPDF
666
+ await page.evaluate("""
667
+ async function injectLibraries() {
668
+ // Add jsPDF
669
+ return new Promise((resolve) => {
670
+ const jspdfScript = document.createElement('script');
671
+ jspdfScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
672
+ jspdfScript.onload = () => resolve(true);
673
+ document.head.appendChild(jspdfScript);
674
+ });
675
+ }
676
+ return injectLibraries();
677
+ """)
678
+
679
+ # Wait for libraries to load
680
+ await page.wait_for_timeout(2000)
681
+
682
+ # Scroll through document to load all pages
683
+ await page.evaluate("""
684
+ async function scrollThroughDocument() {
685
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
686
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
687
+ if (!container) return false;
688
+
689
+ const scrollHeight = container.scrollHeight;
690
+ const viewportHeight = container.clientHeight;
691
+ const scrollStep = viewportHeight / 2;
692
+
693
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
694
+ container.scrollTo(0, scrollPos);
695
+ await delay(500);
696
+ }
697
+
698
+ // One final scroll to bottom to ensure everything is loaded
699
+ container.scrollTo(0, scrollHeight);
700
+ await delay(1000);
701
+
702
+ // Scroll back to top for PDF creation
703
+ container.scrollTo(0, 0);
704
  await delay(500);
705
+
706
+ return true;
707
  }
708
+ return scrollThroughDocument();
709
+ """)
710
 
711
+ # Wait after scrolling
712
+ await page.wait_for_timeout(2000)
713
+
714
+ # Use the improved PDF creation script that captures all pages
715
+ pdf_base64 = await page.evaluate("""
716
+ async function createPDF() {
717
+ try {
718
+ // Make sure jsPDF is loaded
719
+ if (typeof window.jspdf === 'undefined') {
720
+ console.error('jsPDF not loaded');
721
+ return null;
722
+ }
723
+
724
+ const { jsPDF } = window.jspdf;
725
+ const pdf = new jsPDF();
726
+
727
+ // Get all page elements
728
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
729
+ console.log('Found pages:', pages.length);
730
+
731
+ if (pages.length === 0) {
732
+ // Alternative: try to find images directly
733
+ const images = Array.from(document.querySelectorAll('img')).filter(img =>
734
+ img.src.startsWith('blob:') && img.width > 100 && img.height > 100
735
+ );
736
+
737
+ console.log('Found images:', images.length);
738
+
739
+ if (images.length === 0) {
740
+ return null;
741
+ }
742
+
743
+ // Process each image
744
+ for (let i = 0; i < images.length; i++) {
745
+ const img = images[i];
746
+
747
+ if (i > 0) {
748
+ pdf.addPage();
749
+ }
750
+
751
+ // Create canvas and draw image
752
+ const canvas = document.createElement('canvas');
753
+ canvas.width = img.width;
754
+ canvas.height = img.height;
755
+ const ctx = canvas.getContext('2d');
756
+ ctx.drawImage(img, 0, 0, img.width, img.height);
757
+
758
+ // Add to PDF
759
+ const imgData = canvas.toDataURL('image/jpeg', 0.95);
760
+
761
+ // Calculate dimensions
762
+ const pageWidth = pdf.internal.pageSize.getWidth();
763
+ const pageHeight = pdf.internal.pageSize.getHeight();
764
+ const imgRatio = img.height / img.width;
765
+
766
+ let imgWidth = pageWidth - 10;
767
+ let imgHeight = imgWidth * imgRatio;
768
+
769
+ if (imgHeight > pageHeight - 10) {
770
+ imgHeight = pageHeight - 10;
771
+ imgWidth = imgHeight / imgRatio;
772
+ }
773
+
774
+ // Center on page
775
+ const x = (pageWidth - imgWidth) / 2;
776
+ const y = (pageHeight - imgHeight) / 2;
777
+
778
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
779
+ }
780
+ } else {
781
+ // Process each page
782
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
783
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
784
+
785
+ for (let i = 0; i < pages.length; i++) {
786
+ // Add a new page for each page after the first
787
+ if (i > 0) {
788
+ pdf.addPage();
789
+ }
790
+
791
+ // Scroll to the page and wait for it to render
792
+ pages[i].scrollIntoView();
793
+ await delay(300);
794
+
795
+ // Find the image element inside the page
796
+ const pageImages = pages[i].querySelectorAll('img');
797
+ let targetImage = null;
798
+
799
+ for (const img of pageImages) {
800
+ if (img.src.startsWith('blob:') && img.width > 50 && img.height > 50) {
801
+ targetImage = img;
802
+ break;
803
+ }
804
+ }
805
+
806
+ if (!targetImage) {
807
+ // If no image found, try taking a screenshot of the page instead
808
+ const pageCanvas = document.createElement('canvas');
809
+ pageCanvas.width = pages[i].clientWidth;
810
+ pageCanvas.height = pages[i].clientHeight;
811
+ const ctx = pageCanvas.getContext('2d');
812
+
813
+ // Draw the page background
814
+ ctx.fillStyle = 'white';
815
+ ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
816
+
817
+ // Use html2canvas approach
818
+ try {
819
+ await delay(100);
820
+ // Just draw what we can see
821
+ const allElements = pages[i].querySelectorAll('*');
822
+ for (const el of allElements) {
823
+ if (el.tagName === 'IMG' && el.complete && el.src) {
824
+ const rect = el.getBoundingClientRect();
825
+ try {
826
+ ctx.drawImage(el, rect.left, rect.top, rect.width, rect.height);
827
+ } catch (e) {
828
+ console.error('Draw error:', e);
829
+ }
830
+ }
831
+ }
832
+ } catch (e) {
833
+ console.error('Canvas error:', e);
834
+ }
835
+
836
+ // Add the canvas to the PDF
837
+ const imgData = pageCanvas.toDataURL('image/jpeg', 0.95);
838
+
839
+ // Calculate dimensions
840
+ const pageWidth = pdf.internal.pageSize.getWidth();
841
+ const pageHeight = pdf.internal.pageSize.getHeight();
842
+ const imgRatio = pageCanvas.height / pageCanvas.width;
843
+
844
+ let imgWidth = pageWidth - 10;
845
+ let imgHeight = imgWidth * imgRatio;
846
+
847
+ if (imgHeight > pageHeight - 10) {
848
+ imgHeight = pageHeight - 10;
849
+ imgWidth = imgHeight / imgRatio;
850
+ }
851
+
852
+ // Center on page
853
+ const x = (pageWidth - imgWidth) / 2;
854
+ const y = (pageHeight - imgHeight) / 2;
855
+
856
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
857
+ } else {
858
+ // Use the found image
859
+ const canvas = document.createElement('canvas');
860
+ canvas.width = targetImage.naturalWidth || targetImage.width;
861
+ canvas.height = targetImage.naturalHeight || targetImage.height;
862
+ const ctx = canvas.getContext('2d');
863
+
864
+ // Draw image to canvas
865
+ try {
866
+ ctx.drawImage(targetImage, 0, 0, canvas.width, canvas.height);
867
+ } catch (e) {
868
+ console.error('Error drawing image:', e);
869
+ continue;
870
+ }
871
+
872
+ // Add to PDF
873
+ const imgData = canvas.toDataURL('image/jpeg', 0.95);
874
+
875
+ // Calculate dimensions
876
+ const pageWidth = pdf.internal.pageSize.getWidth();
877
+ const pageHeight = pdf.internal.pageSize.getHeight();
878
+ const imgRatio = canvas.height / canvas.width;
879
+
880
+ let imgWidth = pageWidth - 10;
881
+ let imgHeight = imgWidth * imgRatio;
882
+
883
+ if (imgHeight > pageHeight - 10) {
884
+ imgHeight = pageHeight - 10;
885
+ imgWidth = imgHeight / imgRatio;
886
+ }
887
+
888
+ // Center on page
889
+ const x = (pageWidth - imgWidth) / 2;
890
+ const y = (pageHeight - imgHeight) / 2;
891
+
892
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
893
+ }
894
+ }
895
+ }
896
+
897
+ // Return as base64
898
+ return pdf.output('datauristring');
899
+ } catch (e) {
900
+ console.error('PDF creation error:', e);
901
+ return null;
902
+ }
903
  }
904
+ return createPDF();
905
+ """)
906
+
907
+ if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
908
+ # If script method failed, try screenshot approach
909
+ logger.warning("PDF creation script failed, trying fallback method")
910
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
911
+
912
+ # Save the PDF from base64
913
+ try:
914
+ base64_data = pdf_base64.replace('data:application/pdf;base64,', '')
915
+ pdf_bytes = base64.b64decode(base64_data)
916
 
917
+ with open(save_path, 'wb') as f:
918
+ f.write(pdf_bytes)
919
 
920
+ # Verify file is not empty
921
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
922
+ logger.info(f"Successfully saved PDF to {save_path}")
923
+ return True
924
+ else:
925
+ logger.warning(f"Generated PDF is too small, using fallback method")
926
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
927
+ except Exception as e:
928
+ logger.error(f"Error saving PDF: {e}")
929
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
930
+
931
+ except Exception as e:
932
+ logger.error(f"Error in view-only PDF download: {e}")
933
+ # Try fallback method
934
+ return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
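The page.evaluate() call above hands back jsPDF's output('datauristring'), i.e. a data:application/pdf;base64,... string. The decode-and-save step, isolated into a small helper with an illustrative name:

    import base64

    def save_datauri_pdf(data_uri, save_path):
        prefix = 'data:application/pdf;base64,'
        if not data_uri or not data_uri.startswith(prefix):
            return False
        pdf_bytes = base64.b64decode(data_uri[len(prefix):])
        with open(save_path, 'wb') as f:
            f.write(pdf_bytes)
        # Same sanity threshold as above: a handful of bytes is not a real PDF.
        return len(pdf_bytes) > 1000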
935
+
936
+ async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
937
+ """Download any view-only file by taking screenshots"""
938
+ try:
939
+ async with self.context.new_page() as page:
940
+ # Set high-resolution viewport
941
+ await page.set_viewport_size({"width": 1600, "height": 1200})
942
+
943
+ # Navigate to the file
944
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
945
+
946
+ # Make sure the file is loaded
947
+ await page.wait_for_load_state('networkidle')
948
+ await page.wait_for_timeout(3000) # Extra time for rendering
949
+
950
+ # Create directory for screenshots if multiple pages
951
+ base_dir = os.path.dirname(save_path)
952
+ base_name = os.path.splitext(os.path.basename(save_path))[0]
953
+ screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
954
+ os.makedirs(screenshots_dir, exist_ok=True)
955
+
956
+ # Check if it's a multi-page document
957
+ is_multi_page = await page.evaluate("""
958
+ () => {
959
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
960
+ return pages.length > 1;
961
  }
962
+ """)
963
+
964
+ if is_multi_page and file_type == 'pdf':
965
+ # For multi-page PDFs, take screenshots of each page
966
+ page_count = await page.evaluate("""
967
+ async () => {
968
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
969
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
970
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
971
+
972
+ if (!container || pages.length === 0) return 0;
973
+
974
+ // Scroll through to make sure all pages are loaded
975
+ const scrollHeight = container.scrollHeight;
976
+ const viewportHeight = container.clientHeight;
977
+ const scrollStep = viewportHeight;
978
+
979
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
980
+ container.scrollTo(0, scrollPos);
981
+ await delay(300);
982
+ }
983
+
984
+ // Scroll back to top
985
+ container.scrollTo(0, 0);
986
+ await delay(300);
987
+
988
+ return pages.length;
989
  }
990
+ """)
991
+
992
+ logger.info(f"Found {page_count} pages in document")
993
+
994
+ # Take screenshots of each page
995
+ screenshots = []
996
+ for i in range(page_count):
997
+ # Scroll to page
998
+ await page.evaluate(f"""
999
+ async () => {{
1000
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
1001
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1002
+ if (pages.length <= {i}) return false;
1003
+
1004
+ pages[{i}].scrollIntoView();
1005
+ await delay(500);
1006
+ return true;
1007
+ }}
1008
+ """)
1009
 
1010
+ # Take screenshot
1011
+ screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
1012
+ await page.screenshot(path=screenshot_path, clip={
1013
+ 'x': 0,
1014
+ 'y': 0,
1015
+ 'width': 1600,
1016
+ 'height': 1200
1017
+ })
1018
+ screenshots.append(screenshot_path)
1019
+
1020
+ # Combine screenshots into PDF
1021
+ from PIL import Image
1022
+ from reportlab.pdfgen import canvas
1023
+
1024
+ c = canvas.Canvas(save_path)
1025
+ for screenshot in screenshots:
1026
+ img = Image.open(screenshot)
1027
+ width, height = img.size
1028
 
1029
+ # Add page to PDF
1030
+ c.setPageSize((width, height))
1031
+ c.drawImage(screenshot, 0, 0, width, height)
1032
+ c.showPage()
1033
+
1034
+ c.save()
1035
+
1036
+ # Clean up screenshots
1037
+ for screenshot in screenshots:
1038
+ os.remove(screenshot)
1039
+ os.rmdir(screenshots_dir)
1040
+
1041
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1042
+ else:
1043
+ # For single-page or non-PDF files, just take one screenshot
1044
+ screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
1045
+ await page.screenshot(path=screenshot_path, full_page=True)  # Playwright's Python API spells this full_page
1046
+
1047
+ # Convert to requested format if needed
1048
+ if file_type == 'pdf':
1049
+ from PIL import Image
1050
+ from reportlab.pdfgen import canvas
1051
 
1052
+ # Create PDF from screenshot
1053
+ img = Image.open(screenshot_path)
1054
+ width, height = img.size
1055
 
1056
+ c = canvas.Canvas(save_path, pagesize=(width, height))
1057
+ c.drawImage(screenshot_path, 0, 0, width, height)
1058
+ c.save()
1059
+ else:
1060
+ # Just copy the screenshot to the destination with proper extension
1061
+ shutil.copy(screenshot_path, save_path)
1062
 
1063
+ # Clean up
1064
+ os.remove(screenshot_path)
1065
+ os.rmdir(screenshots_dir)
1066
+
1067
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1068
+
1069
+ except Exception as e:
1070
+ logger.error(f"Error taking screenshots: {e}")
1071
+ return False
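The screenshot branch above stitches the per-page PNGs together with a reportlab canvas. If Pillow alone is acceptable, a shorter alternative is its native multi-page PDF writer; a sketch, assuming the screenshots already exist on disk:

    from PIL import Image

    def screenshots_to_pdf(png_paths, pdf_path):
        if not png_paths:
            raise ValueError("no screenshots to combine")
        # Pillow writes a multi-page PDF once every page is converted to RGB.
        pages = [Image.open(p).convert('RGB') for p in png_paths]
        pages[0].save(pdf_path, save_all=True, append_images=pages[1:])

    # screenshots_to_pdf(["page_1.png", "page_2.png"], "document.pdf")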
1072
+
1073
+ async def export_google_doc(self, file_id, file_type, save_path):
1074
+ """Export Google Docs/Sheets/Slides to downloadable formats"""
1075
+ try:
1076
+ # Map file types to export formats
1077
+ export_formats = {
1078
+ 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
1079
+ 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
1080
+ 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
1081
+ 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
1082
+ 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
1083
+ 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
1084
+ 'pdf': 'application/pdf',
1085
+ }
1086
 
1087
+ export_format = export_formats.get(file_type, 'application/pdf')
1088
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
 
1089
 
1090
+ if 'sheet' in file_type or 'xlsx' in file_type:
1091
+ export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
1092
+ elif 'ppt' in file_type or 'presentation' in file_type:
1093
+ export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
1094
+ elif file_type == 'pdf':
1095
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
1096
 
1097
+ async with self.context.new_page() as page:
1098
+ # Get cookies from the main view page first
1099
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
1100
+
1101
+ # Now try the export
1102
+ response = await page.goto(export_url, wait_until='networkidle')
1103
+
1104
+ if response.status == 200:
1105
+ content = await response.body()
1106
+ with open(save_path, 'wb') as f:
1107
+ f.write(content)
1108
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1109
+ else:
1110
+ logger.warning(f"Export failed with status {response.status}")
1111
+ return False
1112
+
1113
  except Exception as e:
1114
+ logger.error(f"Error exporting Google Doc: {e}")
1115
  return False
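The export URLs chosen above follow the per-editor /export routes. The same selection, collapsed into one lookup helper (the name is illustrative):

    def drive_export_url(file_id, file_type):
        if file_type in ('sheet', 'xlsx'):
            return f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
        if file_type in ('ppt', 'pptx', 'presentation'):
            return f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
        if file_type == 'pdf':
            return f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
        # Everything else falls through to the document export endpoint with the requested format.
        return f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"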
1116
 
1117
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):