Spaces:

euler314
/

craw_web

Running

App Files Community

euler314 committed on Mar 6

Commit

b9d5bbe

verified ·

1 Parent(s): dca120b

Update app.py

Browse files

Files changed (1) hide show

app.py +249 -300

app.py CHANGED Viewed

@@ -442,377 +442,326 @@ class DownloadManager:
             return None
     async def force_download_viewonly(self, file_info, save_path):
-        """Last-resort method to download view-only Google Drive files - improved for multi-page PDFs"""
         try:
-            # Extract file ID from URL
-            file_id = None
-            url = file_info['url']
-            for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
-                match = re.search(pattern, url)
-                if match:
-                    file_id = match.group(1)
-                    break
             if not file_id:
                 logger.error("Could not extract file ID")
                 return None
-            logger.info(f"Force downloading view-only file with ID: {file_id}")
-            # Make sure we have the proper file extension
             base, ext = os.path.splitext(save_path)
             if not ext:
-                # Determine file type from metadata or set default to PDF
-                file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
                 save_path = f"{base}.{file_type}"
-            # Launch a new browser context with higher resolution
             browser = await self.playwright.chromium.launch(
                 headless=True,
-                args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security']
             )
-            # Use a larger viewport for better quality
             context = await browser.new_context(
-                viewport={'width': 1920, 'height': 1080},
-                user_agent=get_random_user_agent(),
-                device_scale_factor=2.0  # Higher resolution for better quality
             )
             page = await context.new_page()
-            # Navigate to the file
             try:
-                logger.info(f"Opening view-only file: https://drive.google.com/file/d/{file_id}/view")
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view",
-                               wait_until='networkidle',
-                               timeout=90000)  # Longer timeout for large PDFs
-                # Wait for content to load fully
-                await page.wait_for_timeout(5000)
-                # Detect if it's a PDF
-                is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
-                if is_pdf:
-                    # For PDFs: Multi-page capture approach
-                    logger.info("Detected PDF, using multi-page capture approach")
-                    # First, try to find the viewer container
-                    viewer_container = await page.query_selector('.drive-viewer-paginated-scrollable')
-                    if not viewer_container:
-                        logger.warning("Could not find standard PDF viewer container, trying alternatives")
-                        viewer_container = await page.query_selector('.drive-viewer-content') or \
-                                          await page.query_selector('#drive-pdf-viewer') or \
-                                          await page.query_selector('.drive-viewer')
-                    if not viewer_container:
-                        # Take a single screenshot as fallback
-                        logger.warning("Could not find any PDF viewer container, using fallback")
-                        screenshot_path = os.path.join(tempfile.gettempdir(), "gdrive_pdf_fallback.png")
-                        await page.screenshot(path=screenshot_path, full_page=True)
                         # Convert to PDF
                         from PIL import Image
                         from reportlab.pdfgen import canvas as pdf_canvas
                         img = Image.open(screenshot_path)
                         width, height = img.size
                         c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
                         c.drawImage(screenshot_path, 0, 0, width, height)
                         c.save()
                         os.remove(screenshot_path)
-                        return save_path
-                    # Scroll through to load all pages first
-                    logger.info("Pre-loading all PDF pages...")
-                    await page.evaluate("""
-                        async function preloadAllPages() {
-                            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-                            const container = document.querySelector('.drive-viewer-paginated-scrollable');
-                            if (!container) return;
-                            // Scroll to bottom to force all pages to load
-                            const initialScroll = container.scrollTop;
-                            container.scrollTo(0, container.scrollHeight);
-                            await delay(3000); // Wait for loading
-                            // Scroll back to top
-                            container.scrollTo(0, 0);
-                            await delay(1000);
-                        }
-                        return preloadAllPages();
-                    """)
-                    # Count visible pages - critical step that needs to be fixed
-                    page_count = await page.evaluate("""
-                        () => {
-                            // Try multiple selectors for pages
-                            const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                            if (pages.length > 0) return pages.length;
-                            // Alternative selectors if standard one fails
-                            const altPages = document.querySelectorAll('.drive-viewer-page');
-                            if (altPages.length > 0) return altPages.length;
-                            // Try to find page numbers in navigation
-                            const pageNav = document.querySelector('.drive-viewer-paginated-counter');
-                            if (pageNav) {
-                                const text = pageNav.textContent || '';
-                                const match = text.match(/(\d+)\s*\/\s*(\d+)/);
-                                if (match && match[2]) return parseInt(match[2]);
-                            }
-                            return 0; // Fallback
                         }
                     """)
-                    # If no pages found but we know it's a PDF, manually check for page counter
-                    if page_count == 0:
-                        # Try to find the page counter text and extract total pages
-                        page_counter_text = await page.evaluate("""
-                            () => {
-                                const elements = Array.from(document.querySelectorAll('*'));
-                                for (const el of elements) {
-                                    const text = el.textContent || '';
-                                    if (text.match(/\d+\s*\/\s*\d+/)) return text;
-                                }
-                                return '';
-                            }
-                        """)
-                        if page_counter_text:
-                            match = re.search(r'(\d+)\s*\/\s*(\d+)', page_counter_text)
-                            if match and match.group(2):
-                                page_count = int(match.group(2))
-                                logger.info(f"Detected {page_count} pages from page counter")
-                    # If we still have no page count, default to a reasonable number
-                    if page_count == 0:
-                        logger.warning("Could not detect page count, defaulting to 50 pages to be safe")
-                        page_count = 50  # Try to capture up to 50 pages by default
-                    logger.info(f"Found {page_count} pages in PDF")
-                    # Create a temporary directory for screenshots
-                    temp_dir = tempfile.mkdtemp()
-                    screenshots = []
-                    # Function to scroll to a specific page and take a screenshot
-                    async def capture_page(page_num):
-                        # Scroll to the page
-                        success = await page.evaluate(f"""
-                            async function scrollToPage(pageNum) {{
-                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-                                // Try multiple selectors for pages
-                                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                                if (pages.length > 0 && pageNum < pages.length) {{
-                                    pages[pageNum].scrollIntoView({{behavior: 'instant', block: 'center'}});
-                                    await delay(500);
-                                    return true;
-                                }}
-                                // Alternative: try to use page navigation buttons
-                                const pageInput = document.querySelector('input[aria-label="Page"]');
-                                if (pageInput) {{
-                                    // Set page number in input
-                                    const nativeInputValueSetter = Object.getOwnPropertyDescriptor(window.HTMLInputElement.prototype, "value").set;
-                                    nativeInputValueSetter.call(pageInput, {page_num + 1});
-                                    // Dispatch events
-                                    const ev1 = new Event('input', {{ bubbles: true }});
-                                    const ev2 = new Event('change', {{ bubbles: true }});
-                                    pageInput.dispatchEvent(ev1);
-                                    pageInput.dispatchEvent(ev2);
-                                    // Press Enter to navigate
-                                    const keyEvent = new KeyboardEvent('keydown', {{
-                                        key: 'Enter',
-                                        code: 'Enter',
-                                        keyCode: 13,
-                                        which: 13,
-                                        bubbles: true
-                                    }});
-                                    pageInput.dispatchEvent(keyEvent);
-                                    await delay(1000); // Wait for navigation
-                                    return true;
-                                }}
-                                // Alternative: use page selector dropdown if available
-                                const pageSelector = document.querySelector('.drive-viewer-paginated-page-selector');
-                                if (pageSelector) {{
-                                    pageSelector.click();
-                                    await delay(300);
-                                    // Find and click the specific page option
-                                    const options = document.querySelectorAll('.drive-viewer-paginated-page-selector-option');
-                                    if (options.length > pageNum) {{
-                                        options[pageNum].click();
-                                        await delay(1000);
-                                        return true;
-                                    }}
-                                }}
-                                return false;
-                            }}
-                            return scrollToPage({page_num});
-                        """)
-                        if not success:
-                            # Alternative: Try using the page navigation buttons
-                            logger.info(f"Using alternative navigation for page {page_num + 1}")
-                            # Find navigation buttons
-                            next_button = await page.query_selector('button[aria-label="Next page"]')
-                            prev_button = await page.query_selector('button[aria-label="Previous page"]')
-                            # If we're not on the first page, go back to first page
-                            if page_num == 0 and prev_button:
-                                for _ in range(50):  # Limit to avoid infinite loop
-                                    is_disabled = await prev_button.get_attribute('disabled')
-                                    if is_disabled:
-                                        break
-                                    await prev_button.click()
-                                    await page.wait_for_timeout(300)
-                            # Now navigate forward to desired page
-                            if page_num > 0 and next_button:
-                                for _ in range(page_num):
-                                    await next_button.click()
-                                    await page.wait_for_timeout(500)
-                            # Wait for the page content to load
-                            await page.wait_for_timeout(1000)
-                        # Wait for page to stabilize
-                        await page.wait_for_timeout(500)
-                        # Take the screenshot
-                        screenshot_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
-                        # Determine what to screenshot based on the viewer
-                        current_page_element = await page.evaluate("""
-                            () => {
-                                // First try getting the current visible page
-                                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                                for (const page of pages) {
-                                    const rect = page.getBoundingClientRect();
-                                    if (rect.top < window.innerHeight && rect.bottom > 0) {
-                                        return {
-                                            x: Math.max(0, rect.left),
-                                            y: Math.max(0, rect.top),
-                                            width: Math.min(window.innerWidth, rect.width),
-                                            height: Math.min(window.innerHeight, rect.bottom - rect.top)
-                                        };
-                                    }
-                                }
-                                // Fallback: try to find the container
-                                const container = document.querySelector('.drive-viewer-paginated-scrollable');
-                                if (container) {
-                                    const rect = container.getBoundingClientRect();
-                                    return {
-                                        x: Math.max(0, rect.left),
-                                        y: Math.max(0, rect.top),
-                                        width: Math.min(window.innerWidth, rect.width),
-                                        height: Math.min(window.innerHeight, rect.bottom - rect.top)
-                                    };
-                                }
-                                // Last resort: screenshot the visible area
-                                return null;
-                            }
-                        """)
-                        if current_page_element:
-                            # Screenshot the specific page element
-                            await page.screenshot(path=screenshot_path, clip=current_page_element)
                         else:
-                            # Screenshot the entire visible area
-                            await page.screenshot(path=screenshot_path)
-                        return screenshot_path
-                    # Capture all pages
-                    for i in range(page_count):
-                        logger.info(f"Capturing page {i+1} of {page_count}")
-                        screenshot_path = await capture_page(i)
-                        screenshots.append(screenshot_path)
-                        # Add progress indicator
-                        if (i+1) % 5 == 0 or i+1 == page_count:
-                            logger.info(f"Progress: {i+1}/{page_count} pages captured")
-                    # Combine screenshots into a PDF
                     from PIL import Image
                     from reportlab.lib.pagesizes import letter
                     from reportlab.pdfgen import canvas as pdf_canvas
-                    logger.info(f"Combining {len(screenshots)} screenshots into PDF")
-                    # Use the first image dimensions to set PDF size if available
                     if screenshots:
-                        img = Image.open(screenshots[0])
-                        img_width, img_height = img.size
-                        c = pdf_canvas.Canvas(save_path, pagesize=(img_width, img_height))
-                        for screenshot in screenshots:
-                            # Check if file exists and has content
-                            if os.path.exists(screenshot) and os.path.getsize(screenshot) > 0:
-                                img = Image.open(screenshot)
-                                c.drawImage(screenshot, 0, 0, img_width, img_height)
-                                c.showPage()
-                        c.save()
-                    # Clean up screenshots
-                    for screenshot in screenshots:
-                        if os.path.exists(screenshot):
-                            os.remove(screenshot)
-                    os.rmdir(temp_dir)
-                    # Verify the PDF was created successfully
-                    if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                        logger.info(f"Successfully created PDF with {len(screenshots)} pages")
-                        return save_path
                     else:
-                        logger.error("Failed to create PDF from screenshots")
-                        return None
                 else:
-                    # For non-PDF files: take a single screenshot
-                    logger.info("Non-PDF file detected, taking single screenshot")
-                    screenshot_path = os.path.join(tempfile.gettempdir(), "screenshot.png")
-                    await page.screenshot(path=screenshot_path, full_page=True)
-                    # Convert to requested format if needed
-                    if save_path.lower().endswith('.pdf'):
-                        # Convert to PDF
-                        from PIL import Image
-                        from reportlab.pdfgen import canvas as pdf_canvas
-                        img = Image.open(screenshot_path)
-                        width, height = img.size
-                        c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
-                        c.drawImage(screenshot_path, 0, 0, width, height)
-                        c.save()
                     else:
-                        # Just copy the screenshot with the appropriate extension
                         shutil.copy(screenshot_path, save_path)
-                    # Clean up
                     os.remove(screenshot_path)
                 # Close browser
                 await browser.close()
-                # Verify file exists and is not empty
-                if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
                     logger.info(f"Successfully downloaded file to {save_path}")
                     return save_path
                 else:
-                    logger.error(f"Failed to create valid file at {save_path}")
                     return None
             except Exception as e:
                 logger.error(f"Error during force download: {e}")
                 if browser:
@@ -820,7 +769,7 @@ class DownloadManager:
                 return None
         except Exception as e:
-            logger.error(f"Force download failed: {e}")
             return None
     async def download_from_google_drive(self, url, save_path):

             return None
     async def force_download_viewonly(self, file_info, save_path):
+        """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
         try:
+            # Extract file ID
+            file_id = file_info.get('metadata', {}).get('file_id')
+            if not file_id:
+                url = file_info['url']
+                for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                    match = re.search(pattern, url)
+                    if match:
+                        file_id = match.group(1)
+                        break
             if not file_id:
                 logger.error("Could not extract file ID")
                 return None
+            file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
             base, ext = os.path.splitext(save_path)
             if not ext:
                 save_path = f"{base}.{file_type}"
+            logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
+            # Create a dedicated browser instance with better resolution
             browser = await self.playwright.chromium.launch(
                 headless=True,
+                args=[
+                    '--no-sandbox',
+                    '--disable-setuid-sandbox',
+                    '--disable-dev-shm-usage',
+                    '--disable-web-security',
+                    '--disable-features=IsolateOrigins,site-per-process',
+                    '--disable-site-isolation-trials'
+                ]
             )
+            # Use higher resolution for better quality
             context = await browser.new_context(
+                viewport={'width': 1600, 'height': 1200},
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                device_scale_factor=2.0
             )
             page = await context.new_page()
             try:
+                # Go to the file view page
+                logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
+                await page.wait_for_load_state('networkidle')
+                await page.wait_for_timeout(5000)  # Wait longer for everything to load
+                # Create temp directory
+                temp_dir = tempfile.mkdtemp()
+                # Special handling for PDFs
+                if file_type.lower() == 'pdf':
+                    # Check if there's a pagination control
+                    pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
+                    # Try multiple methods to extract total pages
+                    total_pages = await page.evaluate("""
+                    () => {
+                        // Method 1: Check page counter text
+                        const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
+                            const text = el.textContent || '';
+                            return /\\d+\\s*\\/\\s*\\d+/.test(text);
+                        });
+                        if (pageCounters.length > 0) {
+                            const text = pageCounters[0].textContent || '';
+                            const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
+                            if (match && match[2]) return parseInt(match[2]);
+                        }
+                        // Method 2: Check actual page elements
+                        const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
+                        if (pageElements.length > 0) return pageElements.length;
+                        // Method 3: Look for page thumbnails
+                        const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
+                        if (thumbnails.length > 0) return thumbnails.length;
+                        // Fallback: conservative guess based on UI
+                        return 50; // Safe default when we can't determine
+                    }
+                    """)
+                    logger.info(f"Detected {total_pages} pages in PDF")
+                    if total_pages <= 1:
+                        # Additional check - sometimes the page count detection fails
+                        # Let's double-check by looking for next/previous buttons
+                        next_button = await page.query_selector('button[aria-label="Next page"]')
+                        if next_button:
+                            disabled = await next_button.get_attribute('disabled')
+                            if not disabled:
+                                logger.info("Found next button that's not disabled, document has multiple pages")
+                                total_pages = 100  # Set a high number, we'll stop when we can't go further
+                    # If we still think it's a single page, use a more direct approach
+                    if total_pages <= 1:
+                        # Single page approach
+                        logger.info("Using single-page capture approach")
+                        # Take a screenshot of the current view (should be the full document or first page)
+                        screenshot_path = os.path.join(temp_dir, "page.png")
+                        # Try to screenshot just the document area if we can find it
+                        document_area = await page.query_selector('.drive-viewer-paginated-page')
+                        if document_area:
+                            await document_area.screenshot(path=screenshot_path)
+                        else:
+                            # Otherwise take a full screenshot
+                            await page.screenshot(path=screenshot_path)
                         # Convert to PDF
                         from PIL import Image
                         from reportlab.pdfgen import canvas as pdf_canvas
                         img = Image.open(screenshot_path)
                         width, height = img.size
                         c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
                         c.drawImage(screenshot_path, 0, 0, width, height)
                         c.save()
                         os.remove(screenshot_path)
+                        os.rmdir(temp_dir)
+                        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                            return save_path
+                        return None
+                    # Multi-page approach
+                    logger.info(f"Using multi-page capture approach for {total_pages} pages")
+                    # CRITICAL: We need to go to the first page first
+                    # Check if we need to reset to first page
+                    current_page_text = await page.evaluate("""
+                    () => {
+                        const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
+                            const text = el.textContent || '';
+                            return /\\d+\\s*\\/\\s*\\d+/.test(text);
+                        });
+                        if (pageCounters.length > 0) {
+                            return pageCounters[0].textContent || '';
                         }
+                        return '';
+                    }
                     """)
+                    current_page = 1
+                    if current_page_text:
+                        match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
+                        if match:
+                            current_page = int(match.group(1))
+                    # If we're not on page 1, go back to first page
+                    if current_page > 1:
+                        logger.info(f"Currently on page {current_page}, navigating back to page 1")
+                        # Look for an input field where we can directly set the page number
+                        page_input = await page.query_selector('input[aria-label="Page"]')
+                        if page_input:
+                            await page_input.fill("1")
+                            await page_input.press("Enter")
+                            await page.wait_for_timeout(1000)
+                        else:
+                            # Use prev button to go back to first page
+                            prev_button = await page.query_selector('button[aria-label="Previous page"]')
+                            if prev_button:
+                                # Keep clicking until we can't anymore
+                                for _ in range(current_page - 1):
+                                    try:
+                                        await prev_button.click()
+                                        await page.wait_for_timeout(500)
+                                    except Exception as e:
+                                        logger.warning(f"Error clicking prev button: {e}")
+                                        break
+                    # Capture each page
+                    screenshots = []
+                    page_num = 1
+                    max_tries = min(total_pages + 10, 200)  # Set a reasonable limit
+                    next_button = await page.query_selector('button[aria-label="Next page"]')
+                    # Maximize the PDF view if possible
+                    await page.evaluate("""
+                    () => {
+                        // Try to find and click any "full page" or "maximize" buttons
+                        const fullViewButtons = Array.from(document.querySelectorAll('button'))
+                                              .filter(b => b.textContent?.includes('Full') ||
+                                                          b.getAttribute('aria-label')?.includes('Full') ||
+                                                          b.getAttribute('aria-label')?.includes('fit page'));
+                        if (fullViewButtons.length > 0) {
+                            fullViewButtons[0].click();
+                        }
+                    }
+                    """)
+                    await page.wait_for_timeout(1000)  # Wait for view to adjust
+                    while page_num <= max_tries:
+                        # Wait for the page to be fully loaded
+                        await page.wait_for_timeout(800)
+                        # Take a screenshot of the current page
+                        screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
+                        # Try different methods to identify and capture just the page content
+                        page_content = await page.query_selector('.drive-viewer-paginated-page')
+                        if page_content:
+                            # Found the specific page element
+                            await page_content.screenshot(path=screenshot_path)
+                        else:
+                            # Fall back to screenshot of visible viewport
+                            await page.screenshot(path=screenshot_path)
+                        screenshots.append(screenshot_path)
+                        logger.info(f"Captured page {page_num}")
+                        # Check if we have a disabled next button (reached the end)
+                        if next_button:
+                            is_disabled = await next_button.get_attribute('disabled')
+                            if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
+                                logger.info(f"Reached end of document after {page_num} pages")
+                                break
+                            # Click the next button
+                            try:
+                                await next_button.click()
+                                await page.wait_for_timeout(800)  # Wait for page transition
+                                page_num += 1
+                            except Exception as e:
+                                logger.error(f"Error clicking next button: {e}")
+                                # Try to get a fresh reference to the button
+                                next_button = await page.query_selector('button[aria-label="Next page"]')
+                                if not next_button:
+                                    logger.warning("Next button disappeared, assuming end of document")
+                                    break
                         else:
+                            # Try to find the next button again
+                            next_button = await page.query_selector('button[aria-label="Next page"]')
+                            if not next_button:
+                                logger.warning("Could not find next button, stopping navigation")
+                                break
+                        # Double-check if we've reached the expected total
+                        if page_num >= total_pages:
+                            logger.info(f"Reached expected total of {total_pages} pages")
+                            break
+                    # Combine screenshots into PDF
+                    logger.info(f"Creating PDF from {len(screenshots)} captured pages")
                     from PIL import Image
                     from reportlab.lib.pagesizes import letter
                     from reportlab.pdfgen import canvas as pdf_canvas
+                    # Use the size of the first screenshot to set PDF dimensions
                     if screenshots:
+                        try:
+                            img = Image.open(screenshots[0])
+                            width, height = img.size
+                            c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
+                            for screenshot in screenshots:
+                                try:
+                                    if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
+                                        img = Image.open(screenshot)
+                                        c.drawImage(screenshot, 0, 0, width, height)
+                                        c.showPage()
+                                except Exception as e:
+                                    logger.error(f"Error adding page to PDF: {e}")
+                            c.save()
+                            # Clean up screenshots
+                            for screenshot in screenshots:
+                                if os.path.exists(screenshot):
+                                    os.remove(screenshot)
+                            logger.info(f"Successfully created PDF with {len(screenshots)} pages")
+                        except Exception as e:
+                            logger.error(f"Error creating PDF: {e}")
                     else:
+                        logger.error("No screenshots captured to create PDF")
                 else:
+                    # Non-PDF file handling
+                    screenshot_path = os.path.join(temp_dir, "file.png")
+                    await page.screenshot(path=screenshot_path)
+                    if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
+                        # For document types, try to export directly
+                        await self.export_google_doc(file_id, file_type, save_path)
                     else:
+                        # For other types, save the screenshot with appropriate extension
                         shutil.copy(screenshot_path, save_path)
                     os.remove(screenshot_path)
+                # Clean up temp directory
+                try:
+                    os.rmdir(temp_dir)
+                except:
+                    pass
                 # Close browser
                 await browser.close()
+                # Verify file exists and has content
+                if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
                     logger.info(f"Successfully downloaded file to {save_path}")
                     return save_path
                 else:
+                    logger.error(f"Generated file is too small or missing: {save_path}")
                     return None
             except Exception as e:
                 logger.error(f"Error during force download: {e}")
                 if browser:
                 return None
         except Exception as e:
+            logger.error(f"Force download preparation failed: {e}")
             return None
     async def download_from_google_drive(self, url, save_path):