Update app.py
app.py
CHANGED
@@ -284,1525 +284,148 @@ class DownloadManager:
             logger.error(f"Error extracting real download URL: {e}")
             return url
 
-    async def extract_downloadable_files(self, url, custom_ext_list):
-        found_files = []
-        try:
-            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
-            if not response:
-                return []
-
-            final_url = self.page.url
-            if '.php' in final_url or 'download' in final_url:
-                real_url = await self.extract_real_download_url(final_url)
-                if real_url != final_url:
-                    found_files.append({
-                        'url': real_url,
-                        'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
-                        'size': await self.get_file_size(real_url),
-                        'metadata': {}
-                    })
-                    return found_files
-
-            await self.page.wait_for_load_state('networkidle', timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
-                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
-                            '.pptx', '.odt', '.txt']
-            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
-
-            parsed_base = urlparse(final_url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-            path_base = os.path.dirname(parsed_base.path)
-
-            # Process all anchor tags
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-
-                if '.php' in href.lower() or 'download' in href.lower():
-                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
-                    real_url = await self.extract_real_download_url(full_url)
-                    if real_url and real_url != full_url:
-                        found_files.append({
-                            'url': real_url,
-                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
-                            'size': await self.get_file_size(real_url),
-                            'metadata': {}
-                        })
-                    continue
-
-                if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
-                    size_str = await self.get_file_size(file_url)
-                    meta = {}
-                    if file_url.lower().endswith('.pdf'):
-                        meta = await self.get_pdf_metadata(file_url)
-                    found_files.append({
-                        'url': file_url,
-                        'filename': os.path.basename(file_url.split('?')[0]),
-                        'size': size_str,
-                        'metadata': meta
-                    })
-
-                # Handle Google Drive links
-                elif ("drive.google.com" in href) or ("docs.google.com" in href):
-                    file_id = None
-                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
-                        match = re.search(pattern, href)
-                        if match:
-                            file_id = match.group(1)
-                            break
-                    if file_id:
-                        # Get file info to determine type and view-only status
-                        file_type, is_view_only = await self.get_google_drive_file_info(file_id)
-
-                        # Create a more informative filename based on info
-                        filename = f"gdrive_{file_id}"
-                        if file_type:
-                            filename = f"{filename}.{file_type}"
-
-                        size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
-
-                        found_files.append({
-                            'url': href,  # Use original URL
-                            'filename': filename,
-                            'size': size_str,
-                            'metadata': {
-                                'view_only': is_view_only,
-                                'file_type': file_type,
-                                'file_id': file_id
-                            }
-                        })
-
-            # Also check for files in other elements (iframe, embed, object, etc.)
-            other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
-            for elem in other_elements:
-                src = elem.get('src') or elem.get('data')
-                if src and any(src.lower().endswith(ext) for ext in all_exts):
-                    file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
-                    size_str = await self.get_file_size(file_url)
-                    meta = {}
-                    if file_url.lower().endswith('.pdf'):
-                        meta = await self.get_pdf_metadata(file_url)
-                    found_files.append({
-                        'url': file_url,
-                        'filename': os.path.basename(file_url.split('?')[0]),
-                        'size': size_str,
-                        'metadata': meta
-                    })
-
-            # Check for file links in onclick attributes
-            onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
-            for elem in onclick_elements:
-                onclick = await elem.get_attribute('onclick')
-                urls = re.findall(r'(https?://[^\'"]+)', onclick)
-                for url_match in urls:
-                    if any(url_match.lower().endswith(ext) for ext in all_exts):
-                        size_str = await self.get_file_size(url_match)
-                        meta = {}
-                        if url_match.lower().endswith('.pdf'):
-                            meta = await self.get_pdf_metadata(url_match)
-                        found_files.append({
-                            'url': url_match,
-                            'filename': os.path.basename(url_match.split('?')[0]),
-                            'size': size_str,
-                            'metadata': meta
-                        })
-
-            seen_urls = set()
-            unique_files = []
-            for f in found_files:
-                if f['url'] not in seen_urls:
-                    seen_urls.add(f['url'])
-                    unique_files.append(f)
-            return unique_files
-        except Exception as e:
-            logger.error(f"Error extracting files from {url}: {e}")
-            return []
-
-    async def download_file(self, file_info, save_dir, referer):
-        file_url = file_info['url']
-        fname = file_info['filename']
-        path = os.path.join(save_dir, fname)
-        base, ext = os.path.splitext(fname)
-        counter = 1
-        while os.path.exists(path):
-            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-            counter += 1
-        os.makedirs(save_dir, exist_ok=True)
-
-        try:
-            # Special handling for Google Drive files
-            if "drive.google.com" in file_url or "docs.google.com" in file_url:
-                # Check if it's marked as view-only in metadata
-                is_view_only = file_info.get('metadata', {}).get('view_only', False)
-
-                # For view-only files, try our most robust approach first
-                if is_view_only:
-                    logger.info(f"Attempting to download view-only file: {file_url}")
-                    result_path = await self.force_download_viewonly(file_info, path)
-                    if result_path:
-                        return result_path
-
-                    # If that failed, try the regular download approach
-                    logger.info("Primary method failed, trying fallback methods")
-
-                # Try regular download methods
-                success = await self.download_from_google_drive(file_url, path)
-                if success:
-                    return path
-
-                # If all methods failed for Google Drive, try one last approach
-                logger.warning("All standard methods failed, attempting force download")
-                result_path = await self.force_download_viewonly(file_info, path)
-                return result_path if result_path else None
-
-            # Original code for non-Google Drive downloads
-            async with self.context.new_page() as page:
-                headers = {
-                    'Accept': '*/*',
-                    'Accept-Encoding': 'gzip, deflate, br',
-                    'Referer': referer
-                }
-                response = await page.request.get(file_url, headers=headers, timeout=30000)
-                if response.status == 200:
-                    content = await response.body()
-                    with open(path, 'wb') as f:
-                        f.write(content)
-                    return path
-                else:
-                    logger.error(f"Download failed with status {response.status}: {file_url}")
-                    return None
-        except Exception as e:
-            logger.error(f"Error downloading {file_url}: {e}")
-            return None
-
-    async def force_download_viewonly(self, file_info, save_path):
-        """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
-        try:
-            # Extract file ID
-            file_id = file_info.get('metadata', {}).get('file_id')
-            if not file_id:
-                url = file_info['url']
-                for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
-                    match = re.search(pattern, url)
-                    if match:
-                        file_id = match.group(1)
-                        break
-
-            if not file_id:
-                logger.error("Could not extract file ID")
-                return None
-
-            file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
-            base, ext = os.path.splitext(save_path)
-            if not ext:
-                save_path = f"{base}.{file_type}"
-
-            logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
-
-            # Create a dedicated browser instance with better resolution
-            browser = await self.playwright.chromium.launch(
-                headless=True,
-                args=[
-                    '--no-sandbox',
-                    '--disable-setuid-sandbox',
-                    '--disable-dev-shm-usage',
-                    '--disable-web-security',
-                    '--disable-features=IsolateOrigins,site-per-process',
-                    '--disable-site-isolation-trials'
-                ]
-            )
-
-            # Use higher resolution for better quality
-            context = await browser.new_context(
-                viewport={'width': 1600, 'height': 1200},
-                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                device_scale_factor=2.0
-            )
-
-            page = await context.new_page()
-
-            try:
-                # Go to the file view page
-                logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
-                await page.wait_for_load_state('networkidle')
-                await page.wait_for_timeout(5000)  # Wait longer for everything to load
-
-                # Create temp directory
-                temp_dir = tempfile.mkdtemp()
-
-                # Special handling for PDFs
-                if file_type.lower() == 'pdf':
-                    # Check if there's a pagination control
-                    pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
-
-                    # Try multiple methods to extract total pages
-                    total_pages = await page.evaluate("""
-                        () => {
-                            // Method 1: Check page counter text
-                            const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
-                                const text = el.textContent || '';
-                                return /\\d+\\s*\\/\\s*\\d+/.test(text);
-                            });
-
-                            if (pageCounters.length > 0) {
-                                const text = pageCounters[0].textContent || '';
-                                const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
-                                if (match && match[2]) return parseInt(match[2]);
-                            }
-
-                            // Method 2: Check actual page elements
-                            const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
-                            if (pageElements.length > 0) return pageElements.length;
-
-                            // Method 3: Look for page thumbnails
-                            const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
-                            if (thumbnails.length > 0) return thumbnails.length;
-
-                            // Fallback: conservative guess based on UI
-                            return 50; // Safe default when we can't determine
-                        }
-                    """)
-
-                    logger.info(f"Detected {total_pages} pages in PDF")
-
-                    if total_pages <= 1:
-                        # Additional check - sometimes the page count detection fails
-                        # Let's double-check by looking for next/previous buttons
-                        next_button = await page.query_selector('button[aria-label="Next page"]')
-                        if next_button:
-                            disabled = await next_button.get_attribute('disabled')
-                            if not disabled:
-                                logger.info("Found next button that's not disabled, document has multiple pages")
-                                total_pages = 100  # Set a high number, we'll stop when we can't go further
-
-                    # If we still think it's a single page, use a more direct approach
-                    if total_pages <= 1:
-                        # Single page approach
-                        logger.info("Using single-page capture approach")
-
-                        # Take a screenshot of the current view (should be the full document or first page)
-                        screenshot_path = os.path.join(temp_dir, "page.png")
-
-                        # Try to screenshot just the document area if we can find it
-                        document_area = await page.query_selector('.drive-viewer-paginated-page')
-                        if document_area:
-                            await document_area.screenshot(path=screenshot_path)
-                        else:
-                            # Otherwise take a full screenshot
-                            await page.screenshot(path=screenshot_path)
-
-                        # Convert to PDF
-                        from PIL import Image
-                        from reportlab.pdfgen import canvas as pdf_canvas
-
-                        img = Image.open(screenshot_path)
-                        width, height = img.size
-                        c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
-                        c.drawImage(screenshot_path, 0, 0, width, height)
-                        c.save()
-
-                        os.remove(screenshot_path)
-                        os.rmdir(temp_dir)
-
-                        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                            return save_path
-                        return None
-
-                    # Multi-page approach
-                    logger.info(f"Using multi-page capture approach for {total_pages} pages")
-
-                    # CRITICAL: We need to go to the first page first
-                    # Check if we need to reset to first page
-                    current_page_text = await page.evaluate("""
-                        () => {
-                            const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
-                                const text = el.textContent || '';
-                                return /\\d+\\s*\\/\\s*\\d+/.test(text);
-                            });
-
-                            if (pageCounters.length > 0) {
-                                return pageCounters[0].textContent || '';
-                            }
-                            return '';
-                        }
-                    """)
-
-                    current_page = 1
-                    if current_page_text:
-                        match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
-                        if match:
-                            current_page = int(match.group(1))
-
-                    # If we're not on page 1, go back to first page
-                    if current_page > 1:
-                        logger.info(f"Currently on page {current_page}, navigating back to page 1")
-
-                        # Look for an input field where we can directly set the page number
-                        page_input = await page.query_selector('input[aria-label="Page"]')
-                        if page_input:
-                            await page_input.fill("1")
-                            await page_input.press("Enter")
-                            await page.wait_for_timeout(1000)
-                        else:
-                            # Use prev button to go back to first page
-                            prev_button = await page.query_selector('button[aria-label="Previous page"]')
-                            if prev_button:
-                                # Keep clicking until we can't anymore
-                                for _ in range(current_page - 1):
-                                    try:
-                                        await prev_button.click()
-                                        await page.wait_for_timeout(500)
-                                    except Exception as e:
-                                        logger.warning(f"Error clicking prev button: {e}")
-                                        break
-
-                    # Capture each page
-                    screenshots = []
-                    page_num = 1
-                    max_tries = min(total_pages + 10, 200)  # Set a reasonable limit
-                    next_button = await page.query_selector('button[aria-label="Next page"]')
-
-                    # Maximize the PDF view if possible
-                    await page.evaluate("""
-                        () => {
-                            // Try to find and click any "full page" or "maximize" buttons
-                            const fullViewButtons = Array.from(document.querySelectorAll('button'))
-                                .filter(b => b.textContent?.includes('Full') ||
-                                             b.getAttribute('aria-label')?.includes('Full') ||
-                                             b.getAttribute('aria-label')?.includes('fit page'));
-                            if (fullViewButtons.length > 0) {
-                                fullViewButtons[0].click();
-                            }
-                        }
-                    """)
-
-                    await page.wait_for_timeout(1000)  # Wait for view to adjust
-
-                    while page_num <= max_tries:
-                        # Wait for the page to be fully loaded
-                        await page.wait_for_timeout(800)
-
-                        # Take a screenshot of the current page
-                        screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
-
-                        # Try different methods to identify and capture just the page content
-                        page_content = await page.query_selector('.drive-viewer-paginated-page')
-                        if page_content:
-                            # Found the specific page element
-                            await page_content.screenshot(path=screenshot_path)
-                        else:
-                            # Fall back to screenshot of visible viewport
-                            await page.screenshot(path=screenshot_path)
-
-                        screenshots.append(screenshot_path)
-                        logger.info(f"Captured page {page_num}")
-
-                        # Check if we have a disabled next button (reached the end)
-                        if next_button:
-                            is_disabled = await next_button.get_attribute('disabled')
-                            if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
-                                logger.info(f"Reached end of document after {page_num} pages")
-                                break
-
-                            # Click the next button
-                            try:
-                                await next_button.click()
-                                await page.wait_for_timeout(800)  # Wait for page transition
-                                page_num += 1
-                            except Exception as e:
-                                logger.error(f"Error clicking next button: {e}")
-                                # Try to get a fresh reference to the button
-                                next_button = await page.query_selector('button[aria-label="Next page"]')
-                                if not next_button:
-                                    logger.warning("Next button disappeared, assuming end of document")
-                                    break
-                        else:
-                            # Try to find the next button again
-                            next_button = await page.query_selector('button[aria-label="Next page"]')
-                            if not next_button:
-                                logger.warning("Could not find next button, stopping navigation")
-                                break
-
-                        # Double-check if we've reached the expected total
-                        if page_num >= total_pages:
-                            logger.info(f"Reached expected total of {total_pages} pages")
-                            break
-
-                    # Combine screenshots into PDF
-                    logger.info(f"Creating PDF from {len(screenshots)} captured pages")
-
-                    from PIL import Image
-                    from reportlab.lib.pagesizes import letter
-                    from reportlab.pdfgen import canvas as pdf_canvas
-
-                    # Use the size of the first screenshot to set PDF dimensions
-                    if screenshots:
-                        try:
-                            img = Image.open(screenshots[0])
-                            width, height = img.size
-
-                            c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
-
-                            for screenshot in screenshots:
-                                try:
-                                    if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
-                                        img = Image.open(screenshot)
-                                        c.drawImage(screenshot, 0, 0, width, height)
-                                        c.showPage()
-                                except Exception as e:
-                                    logger.error(f"Error adding page to PDF: {e}")
-
-                            c.save()
-
-                            # Clean up screenshots
-                            for screenshot in screenshots:
-                                if os.path.exists(screenshot):
-                                    os.remove(screenshot)
-
-                            logger.info(f"Successfully created PDF with {len(screenshots)} pages")
-                        except Exception as e:
-                            logger.error(f"Error creating PDF: {e}")
-                    else:
-                        logger.error("No screenshots captured to create PDF")
-                else:
-                    # Non-PDF file handling
-                    screenshot_path = os.path.join(temp_dir, "file.png")
-                    await page.screenshot(path=screenshot_path)
-
-                    if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
-                        # For document types, try to export directly
-                        await self.export_google_doc(file_id, file_type, save_path)
-                    else:
-                        # For other types, save the screenshot with appropriate extension
-                        shutil.copy(screenshot_path, save_path)
-
-                    os.remove(screenshot_path)
-
-                # Clean up temp directory
-                try:
-                    os.rmdir(temp_dir)
-                except:
-                    pass
-
-                # Close browser
-                await browser.close()
-
-                # Verify file exists and has content
-                if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
-                    logger.info(f"Successfully downloaded file to {save_path}")
-                    return save_path
-                else:
-                    logger.error(f"Generated file is too small or missing: {save_path}")
-                    return None
-
-            except Exception as e:
-                logger.error(f"Error during force download: {e}")
-                if browser:
-                    await browser.close()
-                return None
-
-        except Exception as e:
-            logger.error(f"Force download preparation failed: {e}")
-            return None
-
-    async def download_from_google_drive(self, url, save_path):
-        """Enhanced method to download from Google Drive with multiple fallback approaches"""
-        # Extract the file ID from different URL formats
-        file_id = None
-        url_patterns = [
-            r'drive\.google\.com/file/d/([^/]+)',
-            r'drive\.google\.com/open\?id=([^&]+)',
-            r'docs\.google\.com/\w+/d/([^/]+)',
-            r'id=([^&]+)',
-            r'drive\.google\.com/uc\?id=([^&]+)',
-        ]
-
-        for pattern in url_patterns:
-            match = re.search(pattern, url)
-            if match:
-                file_id = match.group(1)
-                break
-
-        if not file_id:
-            logger.error(f"Could not extract file ID from URL: {url}")
-            return False
-
-        # Determine file type first (important for handling different file types)
-        file_type, is_view_only = await self.get_google_drive_file_info(file_id)
-        logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
-
-        base, ext = os.path.splitext(save_path)
-        if not ext and file_type:
-            # Add the correct extension if missing
-            save_path = f"{base}.{file_type}"
-
-        # For view-only files, use specialized approaches
-        if is_view_only:
-            # Approach 1: For PDFs, use the JS method
-            if file_type == 'pdf':
-                success = await self.download_viewonly_pdf_with_js(file_id, save_path)
-                if success:
-                    return True
-
-            # Approach 2: For Google Docs, Sheets, etc., use export API
-            if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
-                success = await self.export_google_doc(file_id, file_type, save_path)
-                if success:
-                    return True
-
-            # Approach 3: Try the direct screenshot method for any view-only file
-            success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
-            if success:
-                return True
-
-        # Try standard approaches for non-view-only files
-        try:
-            # Try with gdown first
-            import gdown
-            output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
-            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                with open(save_path, 'rb') as f:
-                    content = f.read(100)  # Read first 100 bytes
-                    if b'<!DOCTYPE html>' not in content:  # Check not HTML error page
-                        logger.info(f"Successfully downloaded with gdown: {url}")
-                        return True
-        except Exception as e:
-            logger.warning(f"gdown download failed: {e}")
-
-        # Try with requests and session cookies
-        try:
-            session = requests.Session()
-            session.headers.update({'User-Agent': get_random_user_agent()})
-
-            # Visit the page first to get cookies
-            session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
-
-            # Try download
-            url = f"https://drive.google.com/uc?id={file_id}&export=download"
-            response = session.get(url, stream=True, timeout=30)
-
-            # Check for confirmation token
-            confirmation_token = None
-            for k, v in response.cookies.items():
-                if k.startswith('download_warning'):
-                    confirmation_token = v
-                    break
-
-            # Use confirmation token if found
-            if confirmation_token:
-                url = f"{url}&confirm={confirmation_token}"
-                response = session.get(url, stream=True, timeout=60)
-
-            # Check if we're getting HTML instead of the file
-            content_type = response.headers.get('Content-Type', '')
-            if 'text/html' in content_type:
-                logger.warning("Received HTML instead of file - likely download restriction")
-            else:
-                with open(save_path, 'wb') as f:
-                    for chunk in response.iter_content(chunk_size=1024*1024):
-                        if chunk:
-                            f.write(chunk)
-
-                if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                    with open(save_path, 'rb') as f:
-                        content = f.read(100)
-                        if b'<!DOCTYPE html>' not in content:
-                            logger.info("Successfully downloaded with requests session")
-                            return True
-        except Exception as e:
-            logger.warning(f"Requests session download failed: {e}")
-
-        logger.warning("Standard download methods failed")
-        return False
-
-    async def download_viewonly_pdf_with_js(self, file_id, save_path):
-        """Improved method that replicates the manual process for downloading view-only PDFs"""
-        try:
-            # Create a fresh browser context with extended timeout
-            browser = await self.playwright.chromium.launch(
-                headless=True,
-                args=[
-                    '--no-sandbox',
-                    '--disable-setuid-sandbox',
-                    '--disable-dev-shm-usage',
-                    '--disable-web-security'
-                ]
-            )
-
-            # Use high DPI for better quality
-            context = await browser.new_context(
-                viewport={'width': 1600, 'height': 1200},
-                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                device_scale_factor=2.0,
-                timeout=120000  # Longer timeout
-            )
-
-            page = await context.new_page()
-
-            try:
-                logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
-
-                # Step 1: Navigate to the PDF and wait for it to load fully
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
-                await page.wait_for_load_state('networkidle')
-                await page.wait_for_timeout(3000)  # Additional wait for JavaScript to initialize
-
-                # Check if we have a PDF viewer
-                viewer_loaded = await page.query_selector('.drive-viewer-paginated-scrollable, .drive-viewer-paginated-page')
-                if not viewer_loaded:
-                    logger.warning("PDF viewer not detected. This might not be a PDF or might be using a different viewer.")
-                    # Continue anyway, as it might just be a different CSS class
-
-                # Step 2: Scroll through the entire document to ensure all pages are loaded
-                logger.info("Scrolling through document to load all pages into cache...")
-
-                # This is CRITICAL - scroll all the way down to ensure all pages are loaded and cached
-                scroll_success = await page.evaluate("""
-                    async function scrollThroughEntireDocument() {
-                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-
-                        // Try multiple container selectors that might exist in Google Drive
-                        const container = document.querySelector('.drive-viewer-paginated-scrollable') ||
-                                          document.querySelector('.drive-viewer-container');
-
-                        if (!container) {
-                            console.log('No scroll container found');
-                            return false;
-                        }
-
-                        // Get total height to scroll
-                        const totalHeight = container.scrollHeight;
-                        const viewportHeight = container.clientHeight;
-                        console.log(`Document height: ${totalHeight}px, Viewport: ${viewportHeight}px`);
-
-                        // First scroll quickly to the bottom to trigger loading all content
-                        container.scrollTo(0, totalHeight);
-                        await delay(2000);
-
-                        // Then scroll gradually to ensure everything is properly loaded
-                        const scrollSteps = 20; // Number of steps to divide the scroll
-                        const stepSize = totalHeight / scrollSteps;
-
-                        // Scroll down in steps
-                        for (let i = 0; i < scrollSteps; i++) {
-                            const targetPos = i * stepSize;
-                            container.scrollTo(0, targetPos);
-                            console.log(`Scrolled to ${targetPos}px`);
-                            await delay(300); // Wait between scrolls
-                        }
-
-                        // Final scroll to the very bottom
-                        container.scrollTo(0, totalHeight);
-                        await delay(1500);
-
-                        // Scroll back to top for PDF creation
-                        container.scrollTo(0, 0);
-                        await delay(1000);
-
-                        return true;
-                    }
-                    return scrollThroughEntireDocument();
-                """)
-
-                if not scroll_success:
-                    logger.warning("Scrolling may not have completed successfully. Will try to download anyway.")
-
-                # Step 3: Wait to ensure all content is properly loaded after scrolling
-                await page.wait_for_timeout(2000)
-
-                # Step 4: Execute the jsPDF script, similar to the manual process
-                logger.info("Executing jsPDF script to create and download PDF...")
-
-                pdf_result = await page.evaluate("""
-                    async function downloadPDFWithJsPDF() {
-                        try {
-                            // Create and load jsPDF script
-                            return new Promise((resolve, reject) => {
-                                let jspdf = document.createElement("script");
-                                jspdf.onload = function() {
-                                    try {
-                                        // This is the core PDF creation logic
-                                        let pdf = new jsPDF();
-                                        let elements = document.getElementsByTagName("img");
-                                        let pageCount = 0;
-
-                                        // First collect and sort the images
-                                        let validImages = [];
-                                        for (let i = 0; i < elements.length; i++) {
-                                            let img = elements[i];
-                                            // Only include blob images (PDF page images)
-                                            if (!/^blob:/.test(img.src)) {
-                                                continue;
-                                            }
-                                            // Exclude small images (usually icons)
-                                            if (img.width < 100 || img.height < 100) {
-                                                continue;
-                                            }
-                                            validImages.push(img);
-                                        }
-
-                                        // Sort by position from top to bottom
-                                        validImages.sort((a, b) => {
-                                            let rectA = a.getBoundingClientRect();
-                                            let rectB = b.getBoundingClientRect();
-                                            return rectA.top - rectB.top;
-                                        });
-
-                                        console.log(`Found ${validImages.length} valid page images`);
-                                        if (validImages.length === 0) {
-                                            reject("No valid PDF page images found");
-                                            return;
-                                        }
-
-                                        // Process each image
-                                        for (let i = 0; i < validImages.length; i++) {
-                                            let img = validImages[i];
-
-                                            // Create canvas and draw image
-                                            let canvasElement = document.createElement('canvas');
-                                            let con = canvasElement.getContext('2d');
-                                            canvasElement.width = img.width;
-                                            canvasElement.height = img.height;
-
-                                            try {
-                                                // Draw the image to canvas
-                                                con.drawImage(img, 0, 0, img.width, img.height);
-
-                                                // Convert to JPEG
-                                                let imgData = canvasElement.toDataURL("image/jpeg", 1.0);
-
-                                                // Add a new page for each page after the first
-                                                if (pageCount > 0) {
-                                                    pdf.addPage();
-                                                }
-
-                                                // Add image to PDF
-                                                pdf.addImage(imgData, 'JPEG', 0, 0, pdf.internal.pageSize.getWidth(), pdf.internal.pageSize.getHeight());
-                                                pageCount++;
-                                            } catch (e) {
-                                                console.error("Error processing image:", e);
-                                            }
-                                        }
-
-                                        if (pageCount === 0) {
-                                            reject("Failed to add any pages to PDF");
-                                            return;
-                                        }
-
-                                        // Return PDF as data URL
-                                        let pdfOutput = pdf.output('datauristring');
-                                        resolve({
-                                            success: true,
-                                            data: pdfOutput,
-                                            pageCount: pageCount
-                                        });
-                                    } catch (e) {
-                                        console.error("Error in PDF creation:", e);
-                                        reject("Error creating PDF: " + e.message);
-                                    }
-                                };
-
-                                jspdf.onerror = function() {
-                                    reject("Failed to load jsPDF library");
-                                };
-
-                                // Use a reliable CDN for jsPDF
-                                jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.3.2/jspdf.min.js';
-                                document.body.appendChild(jspdf);
-                            });
-                        } catch (e) {
-                            console.error("Overall error:", e);
-                            return { success: false, error: e.message };
-                        }
-                    }
-
-                    return downloadPDFWithJsPDF();
-                """)
-
-                # Step 5: Process the result
-                if not pdf_result or not isinstance(pdf_result, dict) or not pdf_result.get('success'):
-                    error_msg = pdf_result.get('error') if isinstance(pdf_result, dict) else "Unknown error"
-                    logger.error(f"Failed to create PDF: {error_msg}")
-                    return False
-
-                # Extract base64 data
-                pdf_data = pdf_result.get('data')
-                if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
-                    logger.error("Invalid PDF data returned")
-                    return False
-
-                # Save the PDF
-                try:
-                    base64_data = pdf_data.replace('data:application/pdf;base64,', '')
-                    pdf_bytes = base64.b64decode(base64_data)
-
-                    with open(save_path, 'wb') as f:
-                        f.write(pdf_bytes)
-
-                    page_count = pdf_result.get('pageCount', 0)
-                    logger.info(f"Successfully saved PDF with {page_count} pages to {save_path}")
-
-                    # Verify file
-                    if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
-                        return True
-                    else:
-                        logger.error("Generated PDF file is too small or empty")
-                        return False
-                except Exception as e:
-                    logger.error(f"Error saving PDF file: {e}")
-                    return False
-
-            finally:
-                await browser.close()
-
-        except Exception as e:
-            logger.error(f"Error in viewonly PDF download process: {e}")
-            return False
-
-    async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
-        """Download any view-only file by taking screenshots"""
-        try:
-            async with self.context.new_page() as page:
-                # Set high-resolution viewport
-                await page.set_viewport_size({"width": 1600, "height": 1200})
-
-                # Navigate to the file
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
-
-                # Make sure the file is loaded
-                await page.wait_for_load_state('networkidle')
-                await page.wait_for_timeout(3000)  # Extra time for rendering
-
-                # Create directory for screenshots if multiple pages
-                base_dir = os.path.dirname(save_path)
-                base_name = os.path.splitext(os.path.basename(save_path))[0]
-                screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
-                os.makedirs(screenshots_dir, exist_ok=True)
-
-                # Check if it's a multi-page document
-                is_multi_page = await page.evaluate("""
-                    () => {
-                        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                        return pages.length > 1;
-                    }
-                """)
-
-                if is_multi_page and file_type == 'pdf':
-                    # For multi-page PDFs, take screenshots of each page
-                    page_count = await page.evaluate("""
-                        async () => {
-                            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-                            const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                            const container = document.querySelector('.drive-viewer-paginated-scrollable');
-
-                            if (!container || pages.length === 0) return 0;
-
-                            // Scroll through to make sure all pages are loaded
-                            const scrollHeight = container.scrollHeight;
-                            const viewportHeight = container.clientHeight;
-                            const scrollStep = viewportHeight;
-
-                            for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
-                                container.scrollTo(0, scrollPos);
-                                await delay(300);
-                            }
-
-                            // Scroll back to top
-                            container.scrollTo(0, 0);
-                            await delay(300);
-
-                            return pages.length;
-                        }
-                    """)
-
-                    logger.info(f"Found {page_count} pages in document")
-
-                    # Take screenshots of each page
-                    screenshots = []
-                    for i in range(page_count):
-                        # Scroll to page
-                        await page.evaluate(f"""
-                            async () => {{
-                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-                                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                                if (pages.length <= {i}) return false;
-
-                                pages[{i}].scrollIntoView();
-                                await delay(500);
-                                return true;
-                            }}
-                        """)
-
-                        # Take screenshot
-                        screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
-                        await page.screenshot(path=screenshot_path, clip={
-                            'x': 0,
-                            'y': 0,
-                            'width': 1600,
-                            'height': 1200
-                        })
-                        screenshots.append(screenshot_path)
-
-                    # Combine screenshots into PDF
-                    from PIL import Image
-                    from reportlab.pdfgen import canvas
-
-                    c = canvas.Canvas(save_path)
-                    for screenshot in screenshots:
-                        img = Image.open(screenshot)
-                        width, height = img.size
-
-                        # Add page to PDF
-                        c.setPageSize((width, height))
-                        c.drawImage(screenshot, 0, 0, width, height)
-                        c.showPage()
-
-                    c.save()
-
-                    # Clean up screenshots
-                    for screenshot in screenshots:
-                        os.remove(screenshot)
-                    os.rmdir(screenshots_dir)
-
-                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
-                else:
-                    # For single-page or non-PDF files, just take one screenshot
-                    screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
-                    await page.screenshot(path=screenshot_path, fullPage=True)
-
-                    # Convert to requested format if needed
-                    if file_type == 'pdf':
-                        from PIL import Image
-                        from reportlab.pdfgen import canvas
-
-                        # Create PDF from screenshot
-                        img = Image.open(screenshot_path)
-                        width, height = img.size
-
-                        c = canvas.Canvas(save_path, pagesize=(width, height))
-                        c.drawImage(screenshot_path, 0, 0, width, height)
-                        c.save()
-                    else:
-                        # Just copy the screenshot to the destination with proper extension
-                        shutil.copy(screenshot_path, save_path)
-
-                    # Clean up
-                    os.remove(screenshot_path)
-                    os.rmdir(screenshots_dir)
-
-                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
-
-        except Exception as e:
-            logger.error(f"Error taking screenshots: {e}")
-            return False
-
-    async def export_google_doc(self, file_id, file_type, save_path):
-        """Export Google Docs/Sheets/Slides to downloadable formats"""
-        try:
-            # Map file types to export formats
-            export_formats = {
-                'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # docx
-                'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-                'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',  # xlsx
-                'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-                'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',  # pptx
-                'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-                'pdf': 'application/pdf',
-            }
-
-            export_format = export_formats.get(file_type, 'application/pdf')
-            export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
-
-            if 'sheet' in file_type or 'xlsx' in file_type:
-                export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
-            elif 'ppt' in file_type or 'presentation' in file_type:
-                export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
-            elif file_type == 'pdf':
-                export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
-
-            async with self.context.new_page() as page:
-                # Get cookies from the main view page first
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
-
-                # Now try the export
-                response = await page.goto(export_url, wait_until='networkidle')
-
-                if response.status == 200:
-                    content = await response.body()
-                    with open(save_path, 'wb') as f:
-                        f.write(content)
-                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
-                else:
-                    logger.warning(f"Export failed with status {response.status}")
-                    return False
-
-        except Exception as e:
-            logger.error(f"Error exporting Google Doc: {e}")
-            return False
-
-    async def get_google_drive_file_info(self, file_id):
-        """Get file type and view-only status from Google Drive"""
-        file_type = None
-        is_view_only = False
-
-        try:
-            async with self.context.new_page() as page:
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
-
-                # Check if view-only
-                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
-                is_view_only = view_only_text is not None
-
-                # Check for Google Docs viewer
-                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
-                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
-                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
-
-                if gdocs_viewer:
-                    file_type = 'docx'
-                elif gsheets_viewer:
-                    file_type = 'xlsx'
-                elif gslides_viewer:
-                    file_type = 'pptx'
-                else:
-                    # Check for PDF viewer
-                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
-                    if pdf_viewer:
-                        file_type = 'pdf'
-                    else:
-                        # Check for image viewer
-                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
-                        if img_viewer:
-                            # Get image type from src
-                            img_src = await img_viewer.get_attribute('src')
-                            if 'jpg' in img_src or 'jpeg' in img_src:
-                                file_type = 'jpg'
-                            elif 'png' in img_src:
-                                file_type = 'png'
-                            else:
-                                file_type = 'jpg'  # Default to jpg
-                        else:
-                            # Generic file type fallback
-                            file_type = 'pdf'  # Default to PDF
-
-                # If still no type, check filename
-                if not file_type:
-                    title_element = await page.query_selector('div[role="heading"]')
-                    if title_element:
-                        title = await title_element.text_content()
-                        if title:
-                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
-                            if ext_match:
-                                file_type = ext_match.group(1).lower()
-
-        except Exception as e:
-            logger.error(f"Error getting Google Drive file info: {e}")
-            file_type = 'pdf'  # Default to PDF if we can't determine
-
-        return file_type, is_view_only
-
-    async def get_sublinks(self, url, limit=10000):
-        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
-        links = set()
-        try:
-            logger.info(f"Fetching sublinks from: {url}")
-
-            # Go to page and wait for full load
-            await self.page.goto(url, timeout=30000, wait_until='networkidle')
-
-            # Get base URL for resolving relative links
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-            path_base = os.path.dirname(parsed_base.path)
-
-            # Check if page has ASP.NET elements which might need special handling
-            is_aspnet = await self.page.evaluate('''
-                () => {
-                    return document.querySelector('form#aspnetForm') !== null ||
-                           document.querySelector('input[name="__VIEWSTATE"]') !== null;
-                }
-            ''')
-
-            if is_aspnet:
-                logger.info("Detected ASP.NET page, using enhanced extraction method")
-
-                # Try to interact with ASP.NET controls that might reveal more links
-                # Look for dropdowns, buttons, and grid elements
-                dropdowns = await self.page.query_selector_all('select')
-                buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
-
-                # Try interacting with dropdowns first
-                for dropdown in dropdowns:
-                    try:
-                        # Get all options
-                        options = await self.page.evaluate('''
-                            (dropdown) => {
-                                return Array.from(dropdown.options).map(o => o.value);
-                            }
-                        ''', dropdown)
-
-                        # Try selecting each option
-                        for option in options:
-                            if option:
-                                await dropdown.select_option(value=option)
-                                await self.page.wait_for_timeout(1000)
-                                await self.page.wait_for_load_state('networkidle', timeout=5000)
-
-                                # Extract any new links that appeared
-                                await self.extract_all_link_types(links, base_url, path_base)
-                    except Exception as e:
-                        logger.warning(f"Error interacting with dropdown: {e}")
-
-                # Try clicking buttons (but avoid dangerous ones like "delete")
-                safe_buttons = []
-                for button in buttons:
-                    button_text = await button.text_content() or ""
-                    button_value = await button.get_attribute("value") or ""
-                    button_id = await button.get_attribute("id") or ""
-                    combined_text = (button_text + button_value + button_id).lower()
-
-                    # Skip potentially destructive buttons
-                    if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
-                        continue
-
-                    # Prioritize buttons that might show more content
-                    if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
-                        safe_buttons.append(button)
-
-                # Click the safe buttons
-                for button in safe_buttons[:5]:  # Limit to first 5 to avoid too many clicks
-                    try:
-                        await button.click()
-                        await self.page.wait_for_timeout(1000)
-                        await self.page.wait_for_load_state('networkidle', timeout=5000)
-
-                        # Extract any new links that appeared
-                        await self.extract_all_link_types(links, base_url, path_base)
-                    except Exception as e:
-                        logger.warning(f"Error clicking button: {e}")
-
-            # Extract links from the initial page state
-            await self.extract_all_link_types(links, base_url, path_base)
-
-            # Look specifically for links inside grid/table views which are common in ASP.NET applications
-            grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
-            for cell in grid_cells:
-                try:
-                    href = await cell.get_attribute('href')
-                    if href:
-                        full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
-                        links.add(full_url)
-                except Exception as e:
-                    logger.warning(f"Error extracting grid link: {e}")
-
-            # Extract links from onclick attributes and javascript:__doPostBack calls
-            postback_links = await self.page.evaluate('''
-                () => {
-                    const results = [];
-                    // Find elements with onclick containing __doPostBack
-                    const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
-                    for (const el of elements) {
-                        // Extract the postback target
-                        const onclick = el.getAttribute('onclick') || '';
-                        const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
-                        if (match && match[1]) {
-                            // Get the visible text to use as description
-                            const text = el.innerText || el.textContent || 'Link';
-                            results.push({
-                                id: match[1],
-                                text: text.trim()
-                            });
-                        }
-                    }
-                    return results;
-                }
-            ''')
-
-            # Try interacting with some of the postback links
-            for postback in postback_links[:10]:  # Limit to first 10 to avoid too many interactions
-                try:
-                    logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
-                    await self.page.evaluate(f'''
-                        () => {{
-                            if (typeof __doPostBack === 'function') {{
-                                __doPostBack('{postback["id"]}', '');
-                            }}
-                        }}
-                    ''')
-                    await self.page.wait_for_timeout(1500)
-                    await self.page.wait_for_load_state('networkidle', timeout=5000)
-
-                    # Extract any new links that appeared
-                    await self.extract_all_link_types(links, base_url, path_base)
-                except Exception as e:
-                    logger.warning(f"Error with postback: {e}")
-
-            logger.info(f"Found {len(links)} sublinks")
-            return list(links)[:limit]
-
-        except Exception as e:
-            logger.error(f"Error getting sublinks from {url}: {e}")
-            return list(links)[:limit]  # Return what we have so far
-
-    async def extract_all_link_types(self, links_set, base_url, path_base):
-        """Extract all types of links from the current page"""
-        # Get all <a> tag links
-        a_links = await self.page.query_selector_all('a[href]')
-        for a in a_links:
-            try:
-                href = await a.get_attribute('href')
-                if href and not href.startswith('javascript:') and not href.startswith('#'):
-                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
-                    links_set.add(full_url)
-            except Exception:
-                pass
-
-        # Get iframe sources
-        iframes = await self.page.query_selector_all('iframe[src]')
-        for iframe in iframes:
-            try:
-                src = await iframe.get_attribute('src')
-                if src and not src.startswith('javascript:') and not src.startswith('about:'):
-                    full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
-                    links_set.add(full_url)
-            except Exception:
-                pass
-
-        # Get links from onclick attributes that reference URLs
-        onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
-        for el in onclick_elements:
-            try:
-                onclick = await el.get_attribute('onclick')
-                urls = re.findall(r'(https?://[^\'"]+)', onclick)
-                for url in urls:
-                    links_set.add(url)
-            except Exception:
-                pass
-
-        # Look for URLs in data-* attributes
-        data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
-        for el in data_elements:
-            for attr in ['data-url', 'data-href', 'data-src']:
-                try:
-                    value = await el.get_attribute(attr)
-                    if value and not value.startswith('javascript:'):
-                        full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
-                        links_set.add(full_url)
-                except Exception:
-                    pass
-
-        # Look for special anchor links that might not have href attributes
-        special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
-        for anchor in special_anchors:
-            try:
-                href = await anchor.get_attribute('href')
-                if href and not href.startswith('javascript:') and not href.startswith('#'):
-                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
-                    links_set.add(full_url)
-            except Exception:
-                pass
-
-    def resolve_relative_url(self, relative_url, base_url, path_base):
-        """Properly resolve relative URLs considering multiple formats"""
-        if relative_url.startswith('/'):
-            # Absolute path relative to domain
-            return f"{base_url}{relative_url}"
-        elif relative_url.startswith('./'):
-            # Explicit relative path
-            return f"{base_url}{path_base}/{relative_url[2:]}"
-        elif relative_url.startswith('../'):
-            # Parent directory
-            parent_path = '/'.join(path_base.split('/')[:-1])
-            return f"{base_url}{parent_path}/{relative_url[3:]}"
-        else:
-            # Regular relative path
-            return f"{base_url}{path_base}/{relative_url}"
-
-    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
-        if not custom_ext_list:
-            custom_ext_list = []
-        progress_text = st.empty()
-        progress_bar = st.progress(0)
-        file_count_text = st.empty()
-
         try:
-            is_aspnet = False
-            try:
-                await self.page.goto(url, timeout=30000, wait_until='networkidle')
-                is_aspnet = await self.page.evaluate('''
-                    () => {
-                        return document.querySelector('form#aspnetForm') !== null ||
-                               document.querySelector('input[name="__VIEWSTATE"]') !== null;
-                    }
-                ''')
-            except Exception:
-                pass
 
-            # Get all sublinks from the page
-            sublinks = await self.get_sublinks(url, sublink_limit)
-            total_links = len(sublinks)
-            progress_text.text(f"Found {total_links} sublinks to process")
 
-            if not sublinks:
-                return []
 
-            # Process each sublink
-            all_files = []
-            for i, sublink in enumerate(sublinks):
-                progress_text.text(f"Processing sublink {i+1}/{total_links}: {sublink}")
-                progress_bar.progress((i + 1) / total_links)
-                try:
-                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
-                    all_files.extend(sub_files)
-                    file_count_text.text(f"Found {len(all_files)} total files")
-                except Exception as e:
-                    logger.warning(f"Error processing sublink {sublink}: {e}")
 
-            # Deduplicate files by URL
-            seen_urls = set()
-            unique_files = []
-            for f in all_files:
-                if f['url'] not in seen_urls:
-                    seen_urls.add(f['url'])
-                    unique_files.append(f)
 
-            final_count = len(unique_files)
-            progress_text.text(f"Deep search complete!")
-            file_count_text.text(f"Found {final_count} unique files")
-            progress_bar.progress(1.0)
-            return unique_files
-
-        except Exception as e:
-            logger.error(f"Deep search error: {e}")
-            progress_text.text(f"Error during deep search: {str(e)}")
-            return []
-
-        finally:
-            await asyncio.sleep(2)
-            if not st.session_state.get('keep_progress', False):
-                progress_text.empty()
progress_bar.empty()class DownloadManager:
|
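# Aside: the deduplication step of the removed deep_search is collapsed in the
# diff view above. Given the surrounding names (all_files, unique_files,
# final_count), a plausible shape for it is a URL-keyed pass; the exact removed
# logic is an assumption here, not recovered from the diff:
def dedupe_by_url(all_files):
    # Keep the first entry seen for each URL; insertion order is preserved.
    seen = set()
    unique_files = []
    for info in all_files:
        if info['url'] not in seen:
            seen.add(info['url'])
            unique_files.append(info)
    return unique_files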
-    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
-        self.use_proxy = use_proxy
-        self.proxy = proxy
-        self.query = query
-        self.num_results = num_results
-        self.playwright = None
-        self.browser = None
-        self.context = None
-        self.page = None
-
-    async def __aenter__(self):
-        self.playwright = await async_playwright().start()
-        opts = {
-            "headless": True,
-            "args": [
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-dev-shm-usage',
-                '--disable-gpu',
-                '--no-zygote',
-                '--single-process'
-            ]
-        }
-        if self.use_proxy and self.proxy:
-            opts["proxy"] = {"server": self.proxy}
-        self.browser = await self.playwright.chromium.launch(**opts)
-        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
-        self.page = await self.context.new_page()
-        await self.page.set_extra_http_headers({
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Referer': 'https://www.bing.com/'
-        })
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self.browser:
-            await self.browser.close()
-        if self.playwright:
-            await self.playwright.stop()
-
-    async def search_bing(self):
-        urls = []
-        try:
-            search_url = f"https://www.bing.com/search?q={self.query}"
-            await self.page.goto(search_url, timeout=30000)
-            await self.page.wait_for_load_state('networkidle')
-            links = await self.page.query_selector_all("li.b_algo h2 a")
-            for link in links[:self.num_results]:
-                href = await link.get_attribute('href')
-                if href:
-                    urls.append(href)
-            return urls
         except Exception as e:
-            logger.error(f"Error searching Bing: {e}")
             return []
 
-    async def get_file_size(self, url):
-        try:
-            async with self.context.new_page() as page:
-                response = await page.request.head(url, timeout=15000)
-                length = response.headers.get('Content-Length', None)
-                if length:
-                    return sizeof_fmt(int(length))
-                else:
-                    return "Unknown Size"
-        except Exception:
-            return "Unknown Size"
-
-    async def get_pdf_metadata(self, url):
-        try:
-            async with self.context.new_page() as page:
-                resp = await page.request.get(url, timeout=15000)
-                if resp.ok:
-                    content = await resp.body()
-                    pdf = BytesIO(content)
-                    reader = PdfReader(pdf)
-                    return {
-                        'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
-                        'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
-                        'Pages': len(reader.pages),
-                    }
-                else:
-                    return {}
-        except Exception:
-            return {}
-
-    async def extract_real_download_url(self, url):
-        try:
-            async with self.context.new_page() as page:
-                response = await page.goto(url, wait_until='networkidle', timeout=30000)
-                if response and response.headers.get('location'):
-                    return response.headers['location']
-                return page.url
-        except Exception as e:
-            logger.error(f"Error extracting real download URL: {e}")
-            return url
-
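# Aside: get_file_size above reads the Content-Length header from a HEAD
# request. The same probe works without a browser. A minimal sketch, assuming
# the sizeof_fmt helper used above and the requests library imported elsewhere
# in this file:
import requests

def head_file_size(url, timeout=15):
    # Follow redirects so the header describes the final file, not a bounce page.
    try:
        resp = requests.head(url, timeout=timeout, allow_redirects=True)
        length = resp.headers.get('Content-Length')
        return sizeof_fmt(int(length)) if length else "Unknown Size"
    except requests.RequestException:
        return "Unknown Size"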
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []
@@ -2862,7 +1485,20 @@ class DownloadManager:
         try:
             logger.info(f"Fetching sublinks from: {url}")
 
-            # … (removed comment and code collapsed in the diff view) …
             await self.page.goto(url, timeout=30000, wait_until='networkidle')
 
             # Get base URL for resolving relative links
@@ -3152,7 +1788,7 @@ class DownloadManager:
         await asyncio.sleep(2)
         if not st.session_state.get('keep_progress', False):
             progress_text.empty()
-            progress_bar.empty()
 # Utility Functions for New Features
 def extract_keywords(text, n=5):
     doc = nlp_model(text)
         logger.error(f"Error extracting real download URL: {e}")
         return url
 
+    async def get_edu_exam_links(self, url):
+        """Specialized method for educational exam websites that follows a common pattern."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
try:
|
290 |
+
logger.info(f"Fetching exam links from {url}")
|
291 |
+
links = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
+
# Use requests for a faster initial scan
|
294 |
+
import requests
|
295 |
+
from bs4 import BeautifulSoup
|
296 |
+
from urllib.parse import urljoin, urlparse
|
297 |
|
298 |
+
headers = {"User-Agent": get_random_user_agent()}
|
299 |
+
response = requests.get(url, headers=headers, timeout=30)
|
|
|
|
|
|
|
300 |
|
301 |
+
if response.status_code != 200:
|
302 |
+
logger.warning(f"Failed to fetch page: {response.status_code}")
|
303 |
+
return []
|
304 |
|
305 |
+
# Parse with BeautifulSoup first for efficiency
|
306 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
307 |
+
parsed_base = urlparse(url)
|
308 |
+
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
309 |
+
|
310 |
+
# Look for all links
|
311 |
+
for a in soup.find_all("a", href=True):
|
312 |
+
href = a["href"]
|
313 |
+
full_url = urljoin(url, href)
|
314 |
|
315 |
+
# Special patterns for exam sites
|
316 |
+
for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
317 |
+
"/test/", "/download/", "/files/", "/assignments/"]:
|
318 |
+
if pattern in full_url.lower():
|
319 |
+
links.add(full_url)
|
320 |
+
break
|
|
|
|
|
|
|
|
|
|
|
321 |
|
322 |
+
# If we didn't find many links with direct approach, use Playwright for more thorough extraction
|
323 |
+
if len(links) < 5:
|
324 |
+
logger.info("Using browser for enhanced link extraction")
|
325 |
+
await self.page.goto(url, timeout=30000, wait_until='networkidle')
|
326 |
+
|
327 |
+
# Check for ASP.NET specific elements that might contain exam links
|
328 |
+
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
|
329 |
+
if grid_elements:
|
330 |
+
for grid in grid_elements:
|
331 |
+
grid_links = await grid.query_selector_all('a[href]')
|
332 |
+
for a in grid_links:
|
333 |
+
href = await a.get_attribute('href')
|
334 |
+
if href:
|
335 |
+
full_url = href if href.startswith('http') else urljoin(url, href)
|
336 |
+
links.add(full_url)
|
337 |
+
|
338 |
+
# Try clicking any controls that might reveal more exam links
|
339 |
+
show_buttons = await self.page.query_selector_all('input[type="button"], button')
|
340 |
+
for button in show_buttons:
|
341 |
+
button_text = await button.text_content() or ""
|
342 |
+
button_value = await button.get_attribute("value") or ""
|
343 |
+
if any(keyword in (button_text + button_value).lower() for keyword in
|
344 |
+
["show", "view", "display", "list", "exam", "paper", "test"]):
|
345 |
+
try:
|
346 |
+
await button.click()
|
347 |
+
await self.page.wait_for_timeout(1000)
|
348 |
+
await self.page.wait_for_load_state('networkidle', timeout=5000)
|
349 |
+
|
350 |
+
# Get any new links that appeared
|
351 |
+
new_links = await self.page.query_selector_all('a[href]')
|
352 |
+
for a in new_links:
|
353 |
+
href = await a.get_attribute('href')
|
354 |
+
if href:
|
355 |
+
full_url = href if href.startswith('http') else urljoin(url, href)
|
356 |
+
links.add(full_url)
|
357 |
+
except Exception as e:
|
358 |
+
logger.warning(f"Error clicking button: {e}")
|
359 |
+
|
360 |
+
# Filter links to likely contain exam documents
|
361 |
+
filtered_links = []
|
362 |
+
for link in links:
|
363 |
+
# Common file extensions for exam documents
|
364 |
+
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']):
|
365 |
+
filtered_links.append(link)
|
366 |
+
continue
|
367 |
+
|
368 |
+
# Common paths for exam documents
|
369 |
+
if any(pattern in link.lower() for pattern in [
|
370 |
+
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
|
371 |
+
"/pastpapers/", "/questionpapers/", "/tests/"
|
372 |
+
]):
|
373 |
+
filtered_links.append(link)
|
374 |
+
|
375 |
+
logger.info(f"Found {len(filtered_links)} potential exam document links")
|
376 |
+
return filtered_links
|
 
         except Exception as e:
+            logger.error(f"Error getting exam links: {e}")
             return []
 
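# Aside: a minimal driver for the new method, assuming the DownloadManager
# async context-manager entry shown elsewhere in this class; the URL below is
# hypothetical:
import asyncio

async def demo():
    async with DownloadManager() as dm:
        links = await dm.get_edu_exam_links("https://example.edu/eduexp/docs/pastexam")
        for link in links:
            print(link)

# asyncio.run(demo())  # uncomment to run as a script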
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
+            # Special handling for educational exam sites
+            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
+                                                       ["exam", "test", "pastpaper", "eduexp"]):
+                logger.info("Using specialized handler for educational exam site")
+
+                # Get direct links to exam files
+                exam_links = await self.get_edu_exam_links(url)
+
+                for link in exam_links:
+                    # Try to resolve any redirection
+                    real_url = await self.extract_real_download_url(link)
+                    filename = os.path.basename(urlparse(real_url).path)
+
+                    # Decode URL-encoded filenames (common on Chinese/international sites)
+                    if '%' in filename:
+                        try:
+                            from urllib.parse import unquote
+                            filename = unquote(filename)
+                        except Exception:
+                            pass
+
+                    # Get file size
+                    size_str = await self.get_file_size(real_url)
+
+                    # Get metadata for PDFs
+                    meta = {}
+                    if real_url.lower().endswith('.pdf'):
+                        try:
+                            meta = await self.get_pdf_metadata(real_url)
+                        except Exception:
+                            pass
+
+                    found_files.append({
+                        'url': real_url,
+                        'filename': filename,
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                # If the specialized method found exam files, return them
+                if found_files:
+                    return found_files
+
+            # Fall back to the standard extraction method
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []
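# Aside: the unquote step above is what turns percent-encoded filenames back
# into readable text, which matters on Chinese and other international sites.
# A standard-library illustration with a hypothetical encoded name:
from urllib.parse import unquote

encoded = "%E6%9C%9F%E6%9C%AB%E8%80%83.pdf"
print(unquote(encoded))  # -> 期末考.pdf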
         try:
             logger.info(f"Fetching sublinks from: {url}")
 
+            # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
+            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
+                                                       ["exam", "test", "pastpaper", "eduexp"]):
+                logger.info("Using specialized exam site sublink extraction")
+                edu_links = await self.get_edu_exam_links(url)
+                for link in edu_links:
+                    links.add(link)
+
+                # If the specialized method found a good number of links, return them
+                if len(links) > 5:
+                    logger.info(f"Found {len(links)} sublinks with specialized method")
+                    return list(links)[:limit]
+
+            # Standard sublink extraction for all sites
             await self.page.goto(url, timeout=30000, wait_until='networkidle')
 
             # Get base URL for resolving relative links
 
         await asyncio.sleep(2)
         if not st.session_state.get('keep_progress', False):
             progress_text.empty()
+            progress_bar.empty()
 # Utility Functions for New Features
 def extract_keywords(text, n=5):
     doc = nlp_model(text)