Spaces:

euler314
/

craw_web

Running

File size: 144,605 Bytes

5d6326a

import os
import re
import random
import asyncio
import logging
import traceback
import tempfile
import shutil
import json
import time
from urllib.parse import urlparse, urljoin, unquote, parse_qs
from io import BytesIO
from bs4 import BeautifulSoup
import PyPDF2
import requests
from PIL import Image
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

from app.utils import (
    get_random_user_agent, sizeof_fmt, get_domain, is_download_link, 
    normalize_download_url, detect_captcha, USER_AGENTS, STEALTH_SETTINGS,
    PROXY_ROTATION_CONFIG
)

logger = logging.getLogger(__name__)

class DownloadManager:
    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False):
        self.use_proxy = use_proxy
        self.proxy = proxy
        self.query = query
        self.num_results = num_results
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.use_stealth = use_stealth
        self.proxy_rotation = proxy_rotation
        self.request_count = 0
        self.captcha_detected = False
        self.download_timeout = 300  # 5 minutes timeout for downloads
        # Track visited URLs to avoid revisiting the same URL multiple times
        self.visited_urls = set()
        # Track successfully downloaded files to avoid redownloading
        self.downloaded_files = set()

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        
        # Prepare browser args with stealth settings
        browser_args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--no-zygote',
            '--single-process',
            '--disable-web-security',
            '--disable-features=IsolateOrigins',
            '--disable-site-isolation-trials'
        ]
        
        # Add stealth-specific args
        if self.use_stealth:
            browser_args.extend([
                '--disable-blink-features=AutomationControlled',
                '--disable-features=IsolateOrigins,site-per-process',
                '--disable-webgl',
                '--disable-webrtc'
            ])
        
        # Setup browser options
        opts = {
            "headless": True,
            "args": browser_args
        }
        
        # Configure proxy if specified
        if self.use_proxy and self.proxy:
            opts["proxy"] = {"server": self.proxy}
        
        # Launch browser with options
        self.browser = await self.playwright.chromium.launch(**opts)
        
        # Setup browser context with enhanced settings
        context_opts = {
            "user_agent": get_random_user_agent(),
            "viewport": {"width": 1920, "height": 1080},
            "device_scale_factor": 1,
            "has_touch": False,
            "is_mobile": False,
            "ignore_https_errors": True,
            "accept_downloads": True
        }
        
        # Apply stealth-specific settings to the context
        if self.use_stealth:
            # Apply JS-injection for enhanced stealth 
            context_opts["bypass_csp"] = True
            self.context = await self.browser.new_context(**context_opts)
            
            # Execute stealth JS to avoid detection
            await self.context.add_init_script("""
            () => {
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => false,
                });
                
                // Change navigator properties
                const newProto = navigator.__proto__;
                delete newProto.webdriver;
                
                // Overwrite the plugins
                Object.defineProperty(navigator, 'plugins', {
                    get: () => [1, 2, 3, 4, 5].map(() => ({
                        lengthComputable: true,
                        loaded: 100,
                        total: 100
                    }))
                });
                
                // Handle languages more naturally
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en', 'es']
                });
                
                // Modify hardware concurrency
                Object.defineProperty(navigator, 'hardwareConcurrency', {
                    get: () => 4
                });
                
                // Modify deviceMemory
                Object.defineProperty(navigator, 'deviceMemory', {
                    get: () => 8
                });
                
                // WebGL modifications
                const getParameter = WebGLRenderingContext.prototype.getParameter;
                WebGLRenderingContext.prototype.getParameter = function(parameter) {
                    if (parameter === 37445) {
                        return 'Intel Inc.';
                    }
                    if (parameter === 37446) {
                        return 'Intel Iris OpenGL Engine';
                    }
                    return getParameter.apply(this, arguments);
                };
            }
            """)
        else:
            # Regular context without stealth
            self.context = await self.browser.new_context(**context_opts)
        
        # Create page with enhanced headers
        self.page = await self.context.new_page()
        await self.page.set_extra_http_headers({
            'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cache-Control': 'max-age=0',
            'DNT': '1',  # Do Not Track
            'Referer': 'https://www.google.com/',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1'
        })
        
        # Add delay for mouse movements to simulate human behavior
        if self.use_stealth:
            await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500))
            await self.page.wait_for_timeout(random.randint(200, 500))
        
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def rotate_proxy_if_needed(self):
        """Rotate proxy if proxy rotation is enabled and threshold is reached"""
        if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]:
            self.request_count += 1
            if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]:
                # Get next proxy from the pool
                next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0)
                PROXY_ROTATION_CONFIG["proxies"].append(next_proxy)  # Move to end of list
                
                # Close existing context and create new one with the new proxy
                if self.context:
                    await self.context.close()
                
                # Create new context with the new proxy
                context_opts = {
                    "user_agent": get_random_user_agent(),
                    "proxy": {"server": next_proxy},
                    "accept_downloads": True
                }
                self.context = await self.browser.new_context(**context_opts)
                self.page = await self.context.new_page()
                
                # Reset counter
                self.request_count = 0
                logger.info(f"Rotated to new proxy: {next_proxy}")

    async def handle_captcha(self, page):
        """Detect and handle captchas if possible"""
        # Check for common captcha patterns
        content = await page.content()
        if detect_captcha(content):
            self.captcha_detected = True
            logger.warning("Captcha detected on page")
            
            # Strategies for handling captchas:
            # 1. For simple captchas, try to extract the image and solve it
            captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]')
            if captcha_img:
                logger.info("Found captcha image, attempting to capture")
                
                # Take screenshot of the captcha
                captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png")
                await captcha_img.screenshot(path=captcha_path)
                
                # In a real implementation, you would send this to a captcha solving service
                # For now, just log the detection
                logger.info(f"Captcha image saved to {captcha_path}")
                
                # For demonstration, we'll notify the user but not actually solve it
                return False
            
            # 2. For reCAPTCHA, special handling would be required
            recaptcha = await page.query_selector('iframe[src*="recaptcha"]')
            if recaptcha:
                logger.warning("reCAPTCHA detected, would require external solving service")
                return False
            
            # 3. Try to perform human-like actions that might bypass simple bot checks
            await self.perform_human_actions(page)
            
            # Check if captcha is still present
            content = await page.content()
            if detect_captcha(content):
                logger.warning("Captcha still present after human-like actions")
                return False
            else:
                logger.info("Captcha appears to be resolved")
                return True
                
        return True  # No captcha detected

    async def perform_human_actions(self, page):
        """Perform human-like actions on the page to possibly bypass simple bot checks"""
        try:
            # 1. Slowly scroll down the page
            for i in range(3):
                await page.evaluate(f"window.scrollTo(0, {i * 300})")
                await page.wait_for_timeout(random.randint(300, 700))
            
            # 2. Random mouse movements
            for _ in range(3):
                x = random.randint(100, 800)
                y = random.randint(100, 600)
                await page.mouse.move(x=x, y=y)
                await page.wait_for_timeout(random.randint(200, 500))
            
            # 3. Click on a non-essential part of the page
            try:
                await page.click("body", position={"x": 50, "y": 50})
            except:
                pass
            
            # 4. Wait a bit before continuing
            await page.wait_for_timeout(1000)
            
        except Exception as e:
            logger.warning(f"Error during human-like actions: {e}")

    async def search_bing(self):
        urls = []
        try:
            # Rotate proxy if needed
            await self.rotate_proxy_if_needed()
            
            search_url = f"https://www.bing.com/search?q={self.query}"
            await self.page.goto(search_url, timeout=30000)
            await self.page.wait_for_load_state('networkidle')
            
            # Check for captchas
            if not await self.handle_captcha(self.page):
                logger.warning("Captcha detected during search, results may be limited")
            
            # More natural scrolling behavior
            for i in range(3):
                await self.page.evaluate(f"window.scrollTo(0, {i * 400})")
                await self.page.wait_for_timeout(random.randint(300, 800))
            
            # Extract search results
            links = await self.page.query_selector_all("li.b_algo h2 a")
            for link in links[:self.num_results]:
                href = await link.get_attribute('href')
                if href:
                    urls.append(href)
            
            # If we didn't find enough results, try an alternative selector
            if len(urls) < self.num_results:
                alt_links = await self.page.query_selector_all(".b_caption a")
                for link in alt_links:
                    href = await link.get_attribute('href')
                    if href and href not in urls:
                        urls.append(href)
                        if len(urls) >= self.num_results:
                            break
            
            return urls
        except Exception as e:
            logger.error(f"Error searching Bing: {e}")
            return []

    async def get_file_size(self, url):
        try:
            await self.rotate_proxy_if_needed()
            
            # For complex download URLs, we need to be careful with HEAD requests
            if '?' in url or 'Action=downloadfile' in url or 'fname=' in url:
                # For these URLs, we'll try a more reliable approach using range headers
                headers = {
                    'User-Agent': get_random_user_agent(),
                    'Range': 'bytes=0-0'  # Just request the first byte to check headers
                }
                
                try:
                    with requests.get(url, headers=headers, stream=True, timeout=10) as r:
                        if 'Content-Range' in r.headers:
                            content_range = r.headers['Content-Range']
                            match = re.search(r'bytes 0-0/(\d+)', content_range)
                            if match:
                                size = int(match.group(1))
                                return sizeof_fmt(size)
                        
                        if 'Content-Length' in r.headers:
                            size = int(r.headers['Content-Length'])
                            # If size is 1, it's likely just our single requested byte
                            if size > 1:
                                return sizeof_fmt(size)
                except Exception as e:
                    logger.warning(f"Error getting file size with Range request: {e}")
                
                # Fallback to browser approach
                try:
                    async with self.context.new_page() as page:
                        response = await page.request.head(url, timeout=15000)
                        length = response.headers.get('Content-Length', None)
                        if length:
                            return sizeof_fmt(int(length))
                except Exception as e:
                    logger.warning(f"Error getting file size with browser: {e}")
                    
                return "Unknown Size"
            else:
                # Standard approach for normal URLs
                async with self.context.new_page() as page:
                    response = await page.request.head(url, timeout=15000)
                    length = response.headers.get('Content-Length', None)
                    if length:
                        return sizeof_fmt(int(length))
                    else:
                        return "Unknown Size"
        except Exception as e:
            logger.warning(f"Error getting file size: {e}")
            return "Unknown Size"

    async def get_pdf_metadata(self, url):
        try:
            await self.rotate_proxy_if_needed()
            
            async with self.context.new_page() as page:
                resp = await page.request.get(url, timeout=15000)
                if resp.ok:
                    content = await resp.body()
                    pdf = BytesIO(content)
                    reader = PyPDF2.PdfReader(pdf)
                    return {
                        'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
                        'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
                        'Pages': len(reader.pages),
                    }
                else:
                    return {}
        except Exception as e:
            logger.warning(f"Error reading PDF metadata: {e}")
            return {}

    async def extract_real_download_url(self, url):
        """Enhanced method to extract real download URL, handling complex URLs"""
        try:
            # Check if this is a complex download URL that needs special handling
            if 'Action=downloadfile' in url or 'fname=' in url:
                logger.info(f"Complex download URL detected: {url}")
                
                # For these special cases, we'll use the browser to navigate and intercept redirects
                await self.rotate_proxy_if_needed()
                
                async with self.context.new_page() as page:
                    # Set up request interception to capture redirects
                    await page.route('**', lambda route: route.continue_())
                    
                    # Listen for all responses
                    responses = []
                    page.on('response', lambda response: responses.append(response))
                    
                    try:
                        # Go to the URL
                        await page.goto(url, wait_until='networkidle', timeout=30000)
                        
                        # Check all responses for potential downloads
                        for response in responses:
                            # Look for content-disposition headers indicating a download
                            content_disposition = response.headers.get('Content-Disposition', '')
                            if 'attachment' in content_disposition or 'filename=' in content_disposition:
                                return response.url
                            
                            # Look for content-type headers indicating a file
                            content_type = response.headers.get('Content-Type', '')
                            if content_type and content_type != 'text/html' and not content_type.startswith('text/'):
                                return response.url
                        
                        # If no clear download was detected, return the final URL
                        return page.url
                    except Exception as e:
                        logger.warning(f"Error extracting real download URL: {e}")
                        return url
            else:
                # Standard approach for normal URLs
                await self.rotate_proxy_if_needed()
                
                async with self.context.new_page() as page:
                    response = await page.goto(url, wait_until='networkidle', timeout=30000)
                    if response and response.headers.get('location'):
                        return response.headers['location']
                    return page.url
        except Exception as e:
            logger.error(f"Error extracting real download URL: {e}")
            return url

    # IMPROVED: Enhanced exam links extraction method
    async def get_edu_exam_links(self, url):
        """Specialized method for educational exam websites that follows a common pattern."""
        try:
            logger.info(f"Fetching exam links from {url}")
            links = set()
            
            # First try with direct requests for speed (but with proper headers)
            headers = {
                "User-Agent": get_random_user_agent(),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": "https://www.google.com/",
                "DNT": "1"
            }
            
            try:
                response = requests.get(url, headers=headers, timeout=30)
                
                if response.status_code == 200:
                    # Parse with BeautifulSoup first for efficiency
                    soup = BeautifulSoup(response.text, "html.parser")
                    parsed_base = urlparse(url)
                    base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
                    
                    # Look for all links
                    for a in soup.find_all("a", href=True):
                        href = a["href"]
                        full_url = urljoin(url, href)
                        
                        # Look for text clues
                        link_text = a.get_text().lower()
                        
                        # Special patterns for exam sites (expanded list)
                        url_patterns = [
                            "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", 
                            "/test/", "/download/", "/files/", "/assignments/",
                            "paper_", "question_", "exam_", "test_", "past_",
                            "assignment_", "sample_", "study_material", "notes_",
                            "/resource/", "/subject/", "/course/", "/material/"
                        ]
                        
                        text_patterns = [
                            "exam", "paper", "test", "question", "past", "download",
                            "assignment", "sample", "study", "material", "notes",
                            "subject", "course", "resource", "pdf", "document",
                            "view", "open", "get", "solution", "answer"
                        ]
                        
                        # Check URL for patterns
                        if any(pattern in full_url.lower() for pattern in url_patterns):
                            links.add(full_url)
                            continue
                            
                        # Check link text for patterns
                        if any(pattern in link_text for pattern in text_patterns):
                            links.add(full_url)
                            continue
                            
                        # Check for common file extensions
                        if any(full_url.lower().endswith(ext) for ext in 
                              ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                            links.add(full_url)
                    
                    # Check for download script parameters
                    if "Action=downloadfile" in url or "fname=" in url:
                        links.add(url)  # Add the URL itself as it's a download link
            except Exception as e:
                logger.warning(f"Request-based extraction failed: {e}")
            
            # Browser-based approach for more thorough extraction or if initial approach was inadequate
            try:
                # Check if we need to proceed with browser-based extraction
                if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url:
                    logger.info("Using browser for enhanced link extraction")
                    
                    # Rotate proxy if needed
                    await self.rotate_proxy_if_needed()
                    
                    # Navigate to the page with more natural timing
                    await self.page.goto(url, timeout=45000, wait_until='networkidle')
                    await self.page.wait_for_timeout(random.randint(1000, 2000))
                    
                    # Handle captchas if present
                    if not await self.handle_captcha(self.page):
                        logger.warning("Captcha detected, extraction may be limited")
                    
                    # Get base URL for resolving relative links
                    parsed_base = urlparse(url)
                    base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
                    
                    # Perform natural scrolling to trigger lazy-loaded content
                    page_height = await self.page.evaluate("document.body.scrollHeight")
                    viewport_height = await self.page.evaluate("window.innerHeight")
                    
                    for scroll_pos in range(0, page_height, viewport_height // 2):
                        await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})")
                        await self.page.wait_for_timeout(random.randint(300, 800))
                    
                    # Scroll back to top
                    await self.page.evaluate("window.scrollTo(0, 0)")
                    await self.page.wait_for_timeout(500)
                    
                    # Extract all links with Playwright (better than just anchor tags)
                    all_links = await self.page.evaluate("""
                        () => {
                            const results = [];
                            
                            // Get all anchor tags
                            const anchors = document.querySelectorAll('a[href]');
                            for (const a of anchors) {
                                if (a.href) {
                                    results.push({
                                        href: a.href,
                                        text: a.innerText || a.textContent || '',
                                        isButton: a.classList.contains('btn') || a.role === 'button'
                                    });
                                }
                            }
                            
                            // Get buttons that might contain links
                            const buttons = document.querySelectorAll('button');
                            for (const btn of buttons) {
                                const onclick = btn.getAttribute('onclick') || '';
                                if (onclick.includes('window.location') || onclick.includes('download')) {
                                    results.push({
                                        href: '#button',
                                        text: btn.innerText || btn.textContent || '',
                                        isButton: true,
                                        onclick: onclick
                                    });
                                }
                            }
                            
                            return results;
                        }
                    """)
                    
                    # Process the extracted links
                    for link_info in all_links:
                        href = link_info.get('href', '')
                        text = link_info.get('text', '').lower()
                        
                        if href and href != '#button':
                            # Check URL patterns
                            url_patterns = [
                                "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", 
                                "/test/", "/download/", "/files/", "/assignments/",
                                "paper_", "question_", "exam_", "test_", "past_",
                                "assignment_", "sample_", "study_material", "notes_"
                            ]
                            
                            # Check text patterns
                            text_patterns = [
                                "exam", "paper", "test", "question", "past", "download",
                                "assignment", "sample", "study", "material", "notes",
                                "pdf", "document", "view", "open", "solution"
                            ]
                            
                            if any(pattern in href.lower() for pattern in url_patterns) or \
                               any(pattern in text for pattern in text_patterns) or \
                               any(href.lower().endswith(ext) for ext in 
                                  ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                                links.add(href)
                    
                    # Check for download links in the page
                    download_links = await self.page.evaluate("""
                        () => {
                            // Find all links that might be download links
                            const links = Array.from(document.querySelectorAll('a[href]'));
                            return links
                                .filter(a => {
                                    const href = a.href.toLowerCase();
                                    return href.includes('download') || 
                                           href.includes('getfile') || 
                                           href.includes('view.php') ||
                                           href.includes('action=downloadfile') ||
                                           href.includes('fname=');
                                })
                                .map(a => a.href);
                        }
                    """)
                    
                    for dl_link in download_links:
                        links.add(dl_link)
                    
                    # Check for ASP.NET specific elements that might contain exam links
                    grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
                    for grid in grid_elements:
                        grid_links = await grid.query_selector_all('a[href]')
                        for a in grid_links:
                            href = await a.get_attribute('href')
                            text = await a.text_content()
                            
                            if href:
                                full_url = href if href.startswith('http') else urljoin(url, href)
                                links.add(full_url)
                    
                    # Try clicking pagination controls to reveal more content
                    pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a')
                    for i, button in enumerate(pagination_buttons[:5]):  # Limit to first 5 pagination buttons
                        try:
                            # Check if this is a numeric pagination button (more likely to be useful)
                            button_text = await button.text_content()
                            if button_text and button_text.strip().isdigit():
                                logger.info(f"Clicking pagination button: {button_text}")
                                await button.click()
                                await self.page.wait_for_timeout(2000)
                                await self.page.wait_for_load_state('networkidle', timeout=10000)
                                
                                # Extract links from this page
                                new_page_links = await self.page.evaluate("""
                                    () => {
                                        return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
                                    }
                                """)
                                
                                for href in new_page_links:
                                    if href and not href.startswith('javascript:'):
                                        if any(pattern in href.lower() for pattern in url_patterns) or \
                                           any(href.lower().endswith(ext) for ext in 
                                              ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                                            links.add(href)
                        except Exception as e:
                            logger.warning(f"Error clicking pagination button: {e}")
                    
                    # Try clicking any controls that might reveal more exam links (more focused approach)
                    show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn')
                    for button in show_buttons:
                        button_text = (await button.text_content() or "").lower()
                        button_value = (await button.get_attribute("value") or "").lower()
                        button_id = (await button.get_attribute("id") or "").lower()
                        
                        # Look for buttons that seem likely to reveal file lists
                        promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", 
                                         "download", "resource", "material", "browse", "file"]
                        
                        if any(term in button_text or term in button_value or term in button_id 
                               for term in promising_terms):
                            try:
                                logger.info(f"Clicking button: {button_text or button_value}")
                                await button.click()
                                await self.page.wait_for_timeout(2000)
                                await self.page.wait_for_load_state('networkidle', timeout=10000)
                                
                                # Get any new links that appeared
                                new_links = await self.page.query_selector_all('a[href]')
                                for a in new_links:
                                    href = await a.get_attribute('href')
                                    if href:
                                        full_url = href if href.startswith('http') else urljoin(url, href)
                                        
                                        # Focus on file extensions and patterns
                                        if any(full_url.lower().endswith(ext) for ext in 
                                               ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \
                                           any(pattern in full_url.lower() for pattern in url_patterns):
                                            links.add(full_url)
                            except Exception as e:
                                logger.warning(f"Error clicking button: {e}")
                
                # Special handling for ASP.NET PostBack links
                try:
                    # Find and interact with ASP.NET __doPostBack elements
                    postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]')
                    for i, element in enumerate(postback_elements[:10]):  # Limit to avoid too many clicks
                        try:
                            onclick = await element.get_attribute('onclick')
                            if onclick and '__doPostBack' in onclick:
                                element_text = await element.text_content()
                                
                                # Only interact with elements that seem likely to contain exam links
                                promising_terms = ["show", "view", "list", "exam", "paper", "test", 
                                                "download", "resource", "material"]
                                                
                                if any(term in element_text.lower() for term in promising_terms):
                                    logger.info(f"Clicking ASP.NET postback element: {element_text}")
                                    
                                    # Click the element
                                    await element.click()
                                    await self.page.wait_for_timeout(2000)
                                    await self.page.wait_for_load_state('networkidle', timeout=10000)
                                    
                                    # Extract any new links
                                    new_links = await self.page.query_selector_all('a[href]')
                                    for a in new_links:
                                        href = await a.get_attribute('href')
                                        if href:
                                            full_url = href if href.startswith('http') else urljoin(url, href)
                                            if any(full_url.lower().endswith(ext) for ext in 
                                                ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                                                links.add(full_url)
                        except Exception as e:
                            logger.warning(f"Error interacting with postback element: {e}")
                except Exception as e:
                    logger.warning(f"Error during postback handling: {e}")

            except Exception as e:
                logger.error(f"Browser-based extraction failed: {e}")
            
            # Filter links to likely contain exam documents
            filtered_links = []
            for link in links:
                # Common file extensions for exam documents
                if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                    filtered_links.append(link)
                    continue
                    
                # Common paths for exam documents
                if any(pattern in link.lower() for pattern in [
                    "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", 
                    "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
                    "/resource/", "/material/", "/notes/", "/subjectmaterial/"
                ]):
                    filtered_links.append(link)
                    continue
                
                # Check for download links (these may not have obvious extensions)
                if is_download_link(link):
                    filtered_links.append(link)
            
            logger.info(f"Found {len(filtered_links)} potential exam document links")
            return filtered_links
            
        except Exception as e:
            logger.error(f"Error getting exam links: {e}")
            return []

    async def discover_hidden_links(self, page):
        """Discover hidden links that might be in JavaScript, iframes, or dynamic content"""
        hidden_links = set()
        
        # Execute JavaScript to find links in script tags and data attributes
        js_links = await page.evaluate("""
        () => {
            const links = new Set();
            
            // Extract URLs from script tags
            const scripts = document.querySelectorAll('script');
            for (const script of scripts) {
                const content = script.textContent || '';
                const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || [];
                for (let match of urlMatches) {
                    links.add(match.replace(/["']/g, ''));
                }
            }
            
            // Look for download-related variables in scripts
            for (const script of scripts) {
                const content = script.textContent || '';
                // Look for common patterns for file URLs in JavaScript
                if (content.includes('downloadURL') || content.includes('fileURL') || 
                    content.includes('pdfURL') || content.includes('documentURL')) {
                    
                    // Extract potential URLs
                    const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || [];
                    for (let match of potentialUrls) {
                        const url = match.replace(/["']/g, '');
                        // Try to resolve relative URLs
                        if (url.startsWith('/') || !url.includes('://')) {
                            if (url.startsWith('/')) {
                                links.add(window.location.origin + url);
                            } else {
                                // Handle relative paths more carefully
                                const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
                                links.add(base + url);
                            }
                        } else if (url.startsWith('http')) {
                            links.add(url);
                        }
                    }
                }
            }
            
            // Check for links in data attributes
            const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]');
            for (const el of elements) {
                for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) {
                    const val = el.getAttribute(attr);
                    if (val) {
                        // Try to resolve relative URLs
                        if (val.startsWith('/')) {
                            links.add(window.location.origin + val);
                        } else if (val.startsWith('http')) {
                            links.add(val);
                        } else if (!val.startsWith('javascript:') && !val.startsWith('#')) {
                            // Handle relative paths
                            const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
                            links.add(base + val);
                        }
                    }
                }
            }
            
            // Look for URLs in inline event handlers
            const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]');
            for (const el of clickableElements) {
                for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) {
                    const val = el.getAttribute(attr);
                    if (val) {
                        // Check for JavaScript URLs with window.location
                        if (val.includes('window.location') || val.includes('document.location')) {
                            const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/);
                            if (urlMatch && urlMatch[1]) {
                                const url = urlMatch[1];
                                if (url.startsWith('/')) {
                                    links.add(window.location.origin + url);
                                } else if (url.startsWith('http')) {
                                    links.add(url);
                                } else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
                                    const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
                                    links.add(base + url);
                                }
                            }
                        }
                        
                        // Check for direct URLs in attributes
                        const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
                        for (let match of urlMatches) {
                            links.add(match.replace(/["']/g, ''));
                        }
                        
                        // Check for download.php and similar patterns
                        if (val.includes('download.php') || val.includes('getfile.php') || 
                            val.includes('Action=downloadfile') || val.includes('viewfile.php')) {
                            
                            // Handle both onclick handlers and direct hrefs
                            let url = '';
                            if (attr === 'href') {
                                url = val;
                            } else {
                                // Extract URL from JavaScript
                                const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i);
                                if (jsUrlMatch) {
                                    url = jsUrlMatch[1];
                                }
                            }
                            
                            // Resolve URL if needed
                            if (url) {
                                if (url.startsWith('/')) {
                                    links.add(window.location.origin + url);
                                } else if (url.startsWith('http')) {
                                    links.add(url);
                                } else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
                                    const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
                                    links.add(base + url);
                                }
                            }
                        }
                    }
                }
            }
            
            // Find PHP/ASP file download links
            const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]');
            for (const link of fileLinks) {
                links.add(link.href);
            }
            
            return Array.from(links);
        }
        """)
        
        for link in js_links:
            hidden_links.add(link)
        
        # Extract links from iframes
        iframes = await page.query_selector_all('iframe')
        for iframe in iframes:
            try:
                frame = await iframe.content_frame()
                if frame:
                    iframe_links = await frame.evaluate("""
                    () => {
                        return Array.from(document.querySelectorAll('a[href]'))
                            .map(a => a.href)
                            .filter(href => href.startsWith('http'));
                    }
                    """)
                    for link in iframe_links:
                        hidden_links.add(link)
            except Exception as e:
                logger.warning(f"Could not extract links from iframe: {e}")
        
        # Look for links in shadow DOM (used in modern web components)
        shadow_links = await page.evaluate("""
        () => {
            const links = new Set();
            
            // Helper function to recursively process shadow roots
            function processShadowRoot(root) {
                if (!root) return;
                
                // Get links in this shadow root
                const shadowLinks = root.querySelectorAll('a[href]');
                for (const link of shadowLinks) {
                    if (link.href && link.href.startsWith('http')) {
                        links.add(link.href);
                    }
                }
                
                // Process nested shadow roots
                const elements = root.querySelectorAll('*');
                for (const el of elements) {
                    if (el.shadowRoot) {
                        processShadowRoot(el.shadowRoot);
                    }
                }
            }
            
            // Find all shadow roots in the document
            const elements = document.querySelectorAll('*');
            for (const el of elements) {
                if (el.shadowRoot) {
                    processShadowRoot(el.shadowRoot);
                }
            }
            
            return Array.from(links);
        }
        """)
        
        for link in shadow_links:
            hidden_links.add(link)
        
        # Look for download links in forms
        form_links = await page.evaluate("""
        () => {
            const links = new Set();
            
            // Check for form actions that might be download endpoints
            const forms = document.querySelectorAll('form');
            for (const form of forms) {
                const action = form.action || '';
                if (action && (
                    action.includes('download') || 
                    action.includes('getfile') || 
                    action.includes('viewfile') || 
                    action.includes('Action=downloadfile')
                )) {
                    // Collect input values that might be needed for the download
                    const inputs = {};
                    const formInputs = form.querySelectorAll('input[name]');
                    for (const input of formInputs) {
                        inputs[input.name] = input.value;
                    }
                    
                    // Store both the form action and any important inputs
                    links.add(action);
                }
            }
            
            return Array.from(links);
        }
        """)
        
        for link in form_links:
            hidden_links.add(link)
        
        return hidden_links

    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            # Normalize the URL to handle special cases
            normalized_url = normalize_download_url(url)
            
            # Skip if we've already visited this URL
            if normalized_url in self.visited_urls:
                logger.info(f"Skipping already visited URL: {normalized_url}")
                return []
            
            # Mark this URL as visited
            self.visited_urls.add(normalized_url)
            
            # Rotate proxy if needed
            await self.rotate_proxy_if_needed()
            
            # First check if this is a direct download link (Action=downloadfile or fname parameter)
            if is_download_link(normalized_url):
                logger.info(f"Processing potential direct download link: {normalized_url}")
                
                # Try to extract the real download URL if needed
                real_url = await self.extract_real_download_url(normalized_url)
                
                # Determine filename - for complex URLs this can be tricky
                filename = os.path.basename(urlparse(real_url).path)
                
                # Handle URL-encoded filenames
                if '%' in filename:
                    try:
                        filename = unquote(filename)
                    except Exception:
                        pass
                
                # For URLs with download parameters, try to extract filename from query
                if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
                    # Look for file parameter
                    params = parse_qs(urlparse(normalized_url).query)
                    
                    # Check common filename parameters
                    for param in ['file', 'filename', 'name', 'fname', 'f']:
                        if param in params and params[param]:
                            potential_filename = params[param][0]
                            if potential_filename and '/' not in potential_filename and '\\' not in potential_filename:
                                filename = os.path.basename(potential_filename)
                                break
                
                # If still no valid filename, use domain-based fallback
                if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
                    domain = get_domain(real_url)
                    # Try to determine file type from content-type or extension hints in URL
                    ext = '.pdf'  # Default
                    for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
                        if common_ext in normalized_url.lower():
                            ext = common_ext
                            break
                    filename = f"file_from_{domain}{ext}"
                
                # Get file size
                size_str = await self.get_file_size(real_url)
                
                # Add to found files
                found_files.append({
                    'url': real_url,
                    'filename': filename,
                    'size': size_str,
                    'metadata': {},
                    'download_url': normalized_url  # Keep original URL for downloading
                })
                
                # For direct download links, we can return early
                if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)):
                    return found_files
            
            # Special handling for educational exam sites
            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in 
                                                      ["exam", "test", "pastpaper", "eduexp"]):
                logger.info("Using specialized handler for educational exam site")
                
                # Get direct links to exam files
                exam_links = await self.get_edu_exam_links(url)
                
                for link in exam_links:
                    # Try to resolve any redirection
                    real_url = await self.extract_real_download_url(link)
                    filename = os.path.basename(urlparse(real_url).path)
                    
                    # If filename is URL encoded (common with Chinese/international sites)
                    if '%' in filename:
                        try:
                            filename = unquote(filename)
                        except Exception:
                            pass
                    
                    # If filename is empty or invalid, create a sensible one
                    if not filename or filename == '/':
                        domain = get_domain(real_url)
                        ext = '.pdf'  # Default
                        for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
                            if common_ext in link.lower():
                                ext = common_ext
                                break
                        filename = f"file_from_{domain}{ext}"
                    
                    # Get file size
                    size_str = await self.get_file_size(real_url)
                    
                    # Get metadata for PDFs
                    meta = {}
                    if real_url.lower().endswith('.pdf'):
                        try:
                            meta = await self.get_pdf_metadata(real_url)
                        except Exception:
                            pass
                    
                    found_files.append({
                        'url': real_url,
                        'filename': filename,
                        'size': size_str,
                        'metadata': meta,
                        'download_url': link  # Store original link for downloading
                    })
                
                # If we found exam files with the specialized method, return them
                if found_files:
                    return found_files
            
            # Standard extraction method if specialized method didn't find files
            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
            if not response:
                return []
            
            # Check for captchas
            if not await self.handle_captcha(self.page):
                logger.warning("Captcha detected, file extraction may be limited")

            # Scroll through the page naturally to trigger lazy loading
            await self.page.evaluate("""
                (async () => {
                    const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
                    const height = document.body.scrollHeight;
                    const scrollStep = Math.floor(window.innerHeight / 2);
                    
                    for (let i = 0; i < height; i += scrollStep) {
                        window.scrollTo(0, i);
                        await delay(100);
                    }
                    
                    window.scrollTo(0, 0);
                })()
            """)
            await self.page.wait_for_timeout(1000)

            final_url = self.page.url
            if '.php' in final_url or 'download' in final_url:
                real_url = await self.extract_real_download_url(final_url)
                if real_url != final_url:
                    # Try to detect the filename from headers or URL
                    response = await self.page.request.head(real_url, timeout=15000)
                    filename = None
                    
                    # Try to get from Content-Disposition header
                    content_disposition = response.headers.get('Content-Disposition', '')
                    if 'filename=' in content_disposition:
                        filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition)
                        if filename_match:
                            filename = filename_match.group(1)
                    
                    # If not found in headers, use URL basename
                    if not filename:
                        filename = os.path.basename(urlparse(real_url).path)
                        if not filename or filename == '/':
                            # Generate a name based on domain
                            domain = get_domain(real_url)
                            ext = '.pdf'  # Default
                            for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
                                if common_ext in real_url.lower():
                                    ext = common_ext
                                    break
                            filename = f"file_from_{domain}{ext}"
                    
                    found_files.append({
                        'url': real_url,
                        'filename': filename,
                        'size': await self.get_file_size(real_url),
                        'metadata': {},
                        'download_url': final_url  # Keep original URL for downloading
                    })
                    return found_files

            await self.page.wait_for_load_state('networkidle', timeout=30000)
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')
            
            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', 
                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', 
                            '.pptx', '.odt', '.txt']
            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
            
            parsed_base = urlparse(final_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
            path_base = os.path.dirname(parsed_base.path)
            
            # Process all anchor tags
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                
                if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower():
                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                    real_url = await self.extract_real_download_url(full_url)
                    if real_url and real_url != full_url:
                        found_files.append({
                            'url': real_url,
                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                            'size': await self.get_file_size(real_url),
                            'metadata': {},
                            'download_url': full_url  # Original URL for download
                        })
                        continue

                if any(href.lower().endswith(ext) for ext in all_exts):
                    file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                    size_str = await self.get_file_size(file_url)
                    meta = {}
                    if file_url.lower().endswith('.pdf'):
                        meta = await self.get_pdf_metadata(file_url)
                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        'size': size_str,
                        'metadata': meta,
                        'download_url': file_url  # Same as URL for direct links
                    })

                # Handle Google Drive links
                elif ("drive.google.com" in href) or ("docs.google.com" in href):
                    file_id = None
                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
                        match = re.search(pattern, href)
                        if match:
                            file_id = match.group(1)
                            break
                    if file_id:
                        # Get file info to determine type and view-only status
                        file_type, is_view_only = await self.get_google_drive_file_info(file_id)
                        
                        # Create a more informative filename based on info
                        filename = f"gdrive_{file_id}"
                        if file_type:
                            filename = f"{filename}.{file_type}"
                        
                        size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
                        
                        found_files.append({
                            'url': href,  # Use original URL
                            'filename': filename,
                            'size': size_str,
                            'metadata': {
                                'view_only': is_view_only,
                                'file_type': file_type,
                                'file_id': file_id
                            },
                            'download_url': href  # Same as URL for Google Drive
                        })
            
            # Also check for files in other elements (iframe, embed, object, etc.)
            other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
            for elem in other_elements:
                src = elem.get('src') or elem.get('data')
                if src and any(src.lower().endswith(ext) for ext in all_exts):
                    file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
                    size_str = await self.get_file_size(file_url)
                    meta = {}
                    if file_url.lower().endswith('.pdf'):
                        meta = await self.get_pdf_metadata(file_url)
                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        'size': size_str,
                        'metadata': meta,
                        'download_url': file_url
                    })
            
            # Check for file links in onclick attributes
            onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
            for elem in onclick_elements:
                onclick = await elem.get_attribute('onclick')
                urls = re.findall(r'(https?://[^\'"]+)', onclick)
                for url_match in urls:
                    if any(url_match.lower().endswith(ext) for ext in all_exts):
                        size_str = await self.get_file_size(url_match)
                        meta = {}
                        if url_match.lower().endswith('.pdf'):
                            meta = await self.get_pdf_metadata(url_match)
                        found_files.append({
                            'url': url_match,
                            'filename': os.path.basename(url_match.split('?')[0]),
                            'size': size_str,
                            'metadata': meta,
                            'download_url': url_match
                        })
            
            # Also check for data-src and data-url attributes (common in lazy-loaded sites)
            data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]')
            for elem in data_elements:
                for attr in ['data-src', 'data-url', 'data-href', 'data-download']:
                    try:
                        value = await elem.get_attribute(attr)
                        if value and any(value.lower().endswith(ext) for ext in all_exts):
                            file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
                            found_files.append({
                                'url': file_url,
                                'filename': os.path.basename(file_url.split('?')[0]),
                                'size': await self.get_file_size(file_url),
                                'metadata': {},
                                'download_url': file_url
                            })
                    except:
                        pass
            
            # Check script tags for JSON data that might contain file URLs
            script_elements = soup.find_all('script', type='application/json')
            for script in script_elements:
                try:
                    json_data = json.loads(script.string)
                    # Look for URL patterns in the JSON data
                    def extract_urls_from_json(obj, urls_found=None):
                        if urls_found is None:
                            urls_found = []
                        if isinstance(obj, dict):
                            for k, v in obj.items():
                                # Check if any key contains url-like terms
                                url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download']
                                if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'):
                                    urls_found.append(v)
                                else:
                                    extract_urls_from_json(v, urls_found)
                        elif isinstance(obj, list):
                            for item in obj:
                                extract_urls_from_json(item, urls_found)
                        return urls_found
                    
                    json_urls = extract_urls_from_json(json_data)
                    for json_url in json_urls:
                        if any(json_url.lower().endswith(ext) for ext in all_exts):
                            found_files.append({
                                'url': json_url,
                                'filename': os.path.basename(json_url.split('?')[0]),
                                'size': await self.get_file_size(json_url),
                                'metadata': {},
                                'download_url': json_url
                            })
                except:
                    pass
            
            # Check for hidden download buttons or forms
            hidden_elements = await self.page.evaluate("""
                () => {
                    const results = [];
                    
                    // Check for hidden forms with download actions
                    const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]');
                    for (const form of forms) {
                        const action = form.getAttribute('action') || '';
                        results.push({
                            type: 'form',
                            action: action,
                            inputs: Array.from(form.querySelectorAll('input[name]')).map(input => {
                                return {name: input.name, value: input.value};
                            })
                        });
                    }
                    
                    // Check for hidden download links/buttons
                    const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => {
                        const style = window.getComputedStyle(a);
                        return (style.display === 'none' || style.visibility === 'hidden') && 
                               (a.href.includes('download') || a.href.includes('file'));
                    });
                    
                    for (const link of hiddenLinks) {
                        results.push({
                            type: 'link',
                            href: link.href,
                            text: link.innerText || link.textContent
                        });
                    }
                    
                    return results;
                }
            """)
            
            # Process hidden elements
            for elem in hidden_elements:
                if elem['type'] == 'link' and 'href' in elem:
                    href = elem['href']
                    if any(href.lower().endswith(ext) for ext in all_exts):
                        found_files.append({
                            'url': href,
                            'filename': os.path.basename(href.split('?')[0]),
                            'size': await self.get_file_size(href),
                            'metadata': {},
                            'download_url': href
                        })
            
            # Check for hidden links that might be in JavaScript, iframes, or dynamic content
            hidden_links = await self.discover_hidden_links(self.page)
            for link in hidden_links:
                if any(link.lower().endswith(ext) for ext in all_exts):
                    found_files.append({
                        'url': link,
                        'filename': os.path.basename(link.split('?')[0]),
                        'size': await self.get_file_size(link),
                        'metadata': {},
                        'download_url': link
                    })
            
            # Deduplicate files by URL
            seen_urls = set()
            unique_files = []
            for f in found_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)
            
            return unique_files
        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            traceback.print_exc()
            return []

    async def download_file(self, file_info, save_dir, referer):
        file_url = file_info.get('download_url', file_info['url'])  # Use download_url if available
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        counter = 1
        while os.path.exists(path):
            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
            counter += 1
        os.makedirs(save_dir, exist_ok=True)
        
        # Check if we've already downloaded this file
        if file_url in self.downloaded_files:
            logger.info(f"File already downloaded: {file_url}")
            return None
            
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
                # Check if it's marked as view-only in metadata
                is_view_only = file_info.get('metadata', {}).get('view_only', False)
                
                # For view-only files, try our most robust approach first
                if is_view_only:
                    logger.info(f"Attempting to download view-only file: {file_url}")
                    result_path = await self._force_download_viewonly(file_info, path)
                    if result_path:
                        self.downloaded_files.add(file_url)
                        return result_path
                    
                    # If that failed, try the regular download approach
                    logger.info("Primary method failed, trying fallback methods")
                
                # Try regular download methods
                success = await self._download_from_google_drive(file_url, path)
                if success:
                    self.downloaded_files.add(file_url)
                    return path
                
                # If all methods failed for Google Drive, try one last approach
                logger.warning("All standard methods failed, attempting force download")
                result_path = await self._force_download_viewonly(file_info, path)
                if result_path:
                    self.downloaded_files.add(file_url)
                return result_path if result_path else None
            
            # Special handling for complex download URLs
            if 'Action=downloadfile' in file_url or 'fname=' in file_url:
                logger.info(f"Using browser download approach for complex URL: {file_url}")
                
                # For these URLs, we'll need to navigate to the page and handle the download
                await self.rotate_proxy_if_needed()
                
                async with self.context.new_page() as page:
                    # Set up download event listener
                    download_promise = page.wait_for_event("download")
                    
                    # Navigate to the URL
                    await page.goto(file_url, timeout=60000)
                    
                    # Wait for the download to start
                    try:
                        download = await download_promise
                        await download.save_as(path)
                        
                        if os.path.exists(path) and os.path.getsize(path) > 0:
                            self.downloaded_files.add(file_url)
                            return path
                    except Exception as e:
                        logger.error(f"Browser download failed: {e}")
                        
                        # If download didn't start automatically, try to find and click download buttons
                        download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]')
                        for button in download_buttons:
                            try:
                                await button.click()
                                try:
                                    download = await download_promise
                                    await download.save_as(path)
                                    if os.path.exists(path) and os.path.getsize(path) > 0:
                                        self.downloaded_files.add(file_url)
                                        return path
                                except:
                                    pass
                            except:
                                continue
                
                # If browser approach failed, try direct request as last resort
                logger.info("Browser approach failed, trying direct request")
            
            # Rotate proxy if needed
            await self.rotate_proxy_if_needed()
            
            # Try with direct requests first (faster)
            try:
                headers = {
                    'User-Agent': get_random_user_agent(),
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Referer': referer,
                    'DNT': '1'
                }
                
                with requests.get(file_url, headers=headers, stream=True, timeout=30) as response:
                    if response.status_code == 200:
                        # Check content type to verify it's not HTML/error page
                        content_type = response.headers.get('Content-Type', '')
                        if 'text/html' in content_type and not file_url.endswith('.html'):
                            logger.warning(f"Received HTML instead of expected file: {file_url}")
                        else:
                            with open(path, 'wb') as f:
                                for chunk in response.iter_content(chunk_size=8192):
                                    if chunk:
                                        f.write(chunk)
                            
                            # Verify file was downloaded correctly
                            if os.path.exists(path) and os.path.getsize(path) > 0:
                                self.downloaded_files.add(file_url)
                                return path
            except Exception as e:
                logger.warning(f"Direct download failed: {e}, trying browser approach")
                    
            # Original code for non-Google Drive downloads using Playwright
            async with self.context.new_page() as page:
                headers = {
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Referer': referer
                }
                
                # Try to download with timeout protection
                try:
                    response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000)
                    if response.status == 200:
                        content = await response.body()
                        with open(path, 'wb') as f:
                            f.write(content)
                        if os.path.exists(path) and os.path.getsize(path) > 0:
                            self.downloaded_files.add(file_url)
                            return path
                    else:
                        logger.error(f"Download failed with status {response.status}: {file_url}")
                        
                        # Try to extract error information
                        error_info = await response.text()
                        logger.debug(f"Error response: {error_info[:200]}...")
                        
                        # Check if this might be a captcha or login issue
                        if detect_captcha(error_info):
                            logger.warning("Captcha detected during download")
                            # For HF Spaces, we can't implement browser-based captcha solving here
                            # Just log the issue for now
                except PlaywrightTimeoutError:
                    logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}")
                    
                # Try an alternative approach - using the browser's download manager
                try:
                    logger.info("Trying browser download manager approach")
                    download_promise = page.wait_for_event("download")
                    await page.goto(file_url, timeout=60000)
                    
                    # Wait for download to start (with timeout)
                    download = await download_promise
                    await download.save_as(path)
                    
                    if os.path.exists(path) and os.path.getsize(path) > 0:
                        self.downloaded_files.add(file_url)
                        return path
                except Exception as e:
                    logger.error(f"Browser download manager approach failed: {e}")
                
                return None
        except Exception as e:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

    # IMPROVED: Split force_download_viewonly into smaller methods
    async def _force_download_viewonly(self, file_info, save_path):
        """Main method to handle view-only files, now simplified"""
        # Extract the file ID
        file_id = self._extract_drive_file_id(file_info)
        if not file_id:
            logger.error("Could not extract file ID")
            return None
        
        # Get file type information
        file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
        base, ext = os.path.splitext(save_path)
        if not ext:
            save_path = f"{base}.{file_type}"
        
        logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
        
        # Create a stealth browser for handling the download
        browser = await self._create_stealth_browser()
        
        try:
            # Set up the browser page
            page = await browser.new_page()
            
            # Go to the file view page
            logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
            await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
            await page.wait_for_load_state('networkidle')
            
            # Check for permission issues
            content = await page.content()
            if "the owner has not granted you permission to" in content:
                logger.warning("Permission denied error detected")
                return None
            
            # Wait for the page to stabilize
            await page.wait_for_timeout(random.randint(3000, 7000))
            
            # Create temp directory for working files
            temp_dir = tempfile.mkdtemp()
            
            # Handle different file types
            if file_type.lower() == 'pdf':
                return await self._download_viewonly_pdf(page, file_id, save_path, temp_dir)
            else:
                return await self._download_viewonly_other(page, file_id, file_type, save_path, temp_dir)
                
        except Exception as e:
            logger.error(f"Error during force download: {e}")
            return None
        finally:
            await browser.close()

    def _extract_drive_file_id(self, file_info):
        """Extract Google Drive file ID from file info"""
        # Try to get file ID from metadata
        file_id = file_info.get('metadata', {}).get('file_id')
        if file_id:
            return file_id
            
        # If not in metadata, try to extract from URL
        url = file_info.get('url', '')
        for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
                
        return None

    async def _create_stealth_browser(self):
        """Create a stealth browser instance for handling sensitive downloads"""
        browser_args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-web-security',
            '--disable-features=IsolateOrigins,site-per-process',
            '--disable-site-isolation-trials',
            '--disable-blink-features=AutomationControlled'  # Anti-detection
        ]
        
        browser = await self.playwright.chromium.launch(
            headless=True,
            args=browser_args
        )
        
        # Use higher resolution for better quality
        context = await browser.new_context(
            viewport={'width': 1600, 'height': 1200},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            device_scale_factor=2.0,
            accept_downloads=True  # Critical for the download workflow
        )
        
        # Add anti-detection script
        await context.add_init_script("""
            () => {
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => false,
                });
                
                // Change plugins
                Object.defineProperty(navigator, 'plugins', {
                    get: () => [1, 2, 3, 4, 5].map(() => ({
                        lengthComputable: true,
                        loaded: 100,
                        total: 100
                    }))
                });
                
                // Handle languages
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en', 'es']
                });
                
                // Modify hardware concurrency
                Object.defineProperty(navigator, 'hardwareConcurrency', {
                    get: () => 4
                });
            }
        """)
        
        return browser

    async def _download_viewonly_pdf(self, page, file_id, save_path, temp_dir):
        """Handle downloading view-only PDF files"""
        try:
            # Estimate number of pages
            estimated_pages = await page.evaluate("""
            () => {
                // Method 1: Check page counter text
                const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
                    const text = el.textContent || '';
                    return /\\d+\\s*\\/\\s*\\d+/.test(text);
                });
                
                if (pageCounters.length > 0) {
                    const text = pageCounters[0].textContent || '';
                    const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
                    if (match && match[2]) return parseInt(match[2]);
                }
                
                // Method 2: Check actual page elements
                const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
                if (pageElements.length > 0) return pageElements.length;
                
                // Method 3: Look for page thumbnails
                const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
                if (thumbnails.length > 0) return thumbnails.length;
                
                // Fallback: conservative guess 
                return 50;
            }
            """)
            
            logger.info(f"Estimated {estimated_pages} pages in PDF")
            
            # Initial scroll to trigger lazy loading
            logger.info("Initial scroll to bottom to trigger lazy loading...")
            await page.keyboard.press("End")
            await page.wait_for_timeout(3000)
            
            # Scroll page by page to ensure all pages are loaded
            logger.info("Scrolling page by page...")
            max_attempts = min(estimated_pages * 3, 300)
            attempt = 0
            prev_blob_count = 0
            
            while attempt < max_attempts:
                blob_count = await page.evaluate("""
                    Array.from(document.getElementsByTagName('img'))
                        .filter(img => img.src.startsWith('blob:') && img.width > 100)
                        .length
                """)
                
                logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
                
                if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
                    logger.info("All pages appear to be loaded.")
                    break
                
                # Alternate between PageDown and End keys for more natural scrolling
                if attempt % 3 == 0:
                    await page.keyboard.press("End")
                else:
                    await page.keyboard.press("PageDown")
                    
                # Randomized wait times
                await page.wait_for_timeout(random.randint(1500, 3000))
                
                # Move mouse randomly to appear more human-like
                if attempt % 4 == 0:
                    await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800))
                
                prev_blob_count = blob_count
                attempt += 1
            
            # Extra wait to ensure everything is loaded
            await page.wait_for_timeout(5000)
            
            # Set up download event listener for the PDF
            download_promise = page.wait_for_event("download")
            
            # Use jsPDF to generate PDF from loaded pages
            logger.info("Generating PDF from loaded pages...")
            result = await page.evaluate(r'''
                (function() {
                    return new Promise((resolve, reject) => {
                        let script = document.createElement("script");
                        script.onload = function () {
                            try {
                                let pdf = new jsPDF();
                                let imgs = Array.from(document.getElementsByTagName("img"))
                                    .filter(img => img.src.startsWith('blob:') && img.width > 100)
                                    .sort((a, b) => {
                                        const rectA = a.getBoundingClientRect();
                                        const rectB = b.getBoundingClientRect();
                                        return rectA.top - rectB.top;
                                    });
                                
                                console.log(`Found ${imgs.length} valid page images to add to PDF`);
                                
                                let added = 0;
                                for (let i = 0; i < imgs.length; i++) {
                                    let img = imgs[i];
                                    let canvas = document.createElement("canvas");
                                    let ctx = canvas.getContext("2d");
                                    canvas.width = img.width;
                                    canvas.height = img.height;
                                    ctx.drawImage(img, 0, 0, img.width, img.height);
                                    let imgData = canvas.toDataURL("image/jpeg", 1.0);
                                    
                                    if (added > 0) {
                                        pdf.addPage();
                                    }
                                    
                                    pdf.addImage(imgData, 'JPEG', 0, 0);
                                    added++;
                                }
                                
                                pdf.save("download.pdf");
                                resolve({success: true, pageCount: added});
                            } catch (error) {
                                reject({success: false, error: error.toString()});
                            }
                        };
                        
                        script.onerror = function() {
                            reject({success: false, error: "Failed to load jsPDF library"});
                        };
                        
                        script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
                        document.body.appendChild(script);
                    });
                })();
            ''')
            
            if not result.get('success', False):
                logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
                
                # Try fallback approach - screenshot method
                logger.info("Trying fallback screenshot method...")
                return await self._pdf_screenshot_fallback(page, estimated_pages, save_path, temp_dir)
            
            logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
            
            # Wait for the download and save it
            download = await download_promise
            await download.save_as(save_path)
            
            # Clean up temp directory
            try:
                os.rmdir(temp_dir)
            except:
                pass
            
            # Verify file exists and has content
            if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
                logger.info(f"Successfully downloaded PDF to {save_path}")
                return save_path
            else:
                logger.error(f"Generated file is too small or missing: {save_path}")
                return None
                
        except Exception as e:
            logger.error(f"Error in PDF download: {e}")
            return None

    async def _pdf_screenshot_fallback(self, page, estimated_pages, save_path, temp_dir):
        """Fallback method using screenshots for PDF creation"""
        try:
            # Navigate back to the first page
            await page.evaluate("""
                () => {
                    // Find and click the "first page" button if available
                    const buttons = Array.from(document.querySelectorAll('button'));
                    const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page'));
                    if (firstPageBtn) firstPageBtn.click();
                }
            """)
            await page.wait_for_timeout(1000);
            
            # Create a PDF by taking screenshots of each page
            screenshots = []
            current_page = 1
            max_pages = estimated_pages
            
            # Create a PDF using the reportlab package
            while current_page <= max_pages:
                screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png")
                
                # Try to find the current page element
                page_elem = await page.query_selector('.drive-viewer-paginated-page')
                if page_elem:
                    await page_elem.screenshot(path=screenshot_path)
                else:
                    # Fallback to full page screenshot
                    await page.screenshot(path=screenshot_path)
                
                screenshots.append(screenshot_path)
                
                # Try to navigate to next page
                next_btn = await page.query_selector('button[aria-label="Next page"]')
                if next_btn:
                    is_disabled = await next_btn.get_attribute('disabled')
                    if is_disabled:
                        logger.info(f"Reached end of document at page {current_page}")
                        break
                    
                    await next_btn.click()
                    await page.wait_for_timeout(1000)
                    current_page += 1
                else:
                    break
            
            # Create PDF from screenshots
            if screenshots:
                first_img = Image.open(screenshots[0])
                width, height = first_img.size
                
                c = canvas.Canvas(save_path, pagesize=(width, height))
                for screenshot in screenshots:
                    img = Image.open(screenshot)
                    c.drawImage(screenshot, 0, 0, width, height)
                    c.showPage()
                c.save()
                
                # Clean up screenshots
                for screenshot in screenshots:
                    os.remove(screenshot)
                
                return save_path
                
            return None
        except Exception as e:
            logger.error(f"Error in screenshot fallback: {e}")
            return None

    async def _download_viewonly_other(self, page, file_id, file_type, save_path, temp_dir):
        """Handle downloading non-PDF view-only files"""
        try:
            # Take a screenshot of the file
            screenshot_path = os.path.join(temp_dir, "file.png")
            await page.screenshot(path=screenshot_path)
            
            if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
                # For document types, try to export directly
                success = await self._export_google_doc(file_id, file_type, save_path)
                if success:
                    os.remove(screenshot_path)
                    return save_path
                    
                # If export fails, fall back to screenshot
                logger.warning(f"Export failed, falling back to screenshot for {file_type}")
            
            # For other types or if export failed, save the screenshot with appropriate extension
            shutil.copy(screenshot_path, save_path)
            os.remove(screenshot_path)
            
            return save_path if os.path.exists(save_path) else None
            
        except Exception as e:
            logger.error(f"Error in non-PDF download: {e}")
            return None

    async def _download_from_google_drive(self, url, save_path):
        """Enhanced method to download from Google Drive with multiple fallback approaches"""
        # Extract the file ID from different URL formats
        file_id = self._extract_drive_file_id({"url": url})
        if not file_id:
            logger.error(f"Could not extract file ID from URL: {url}")
            return False
        
        # Determine file type first (important for handling different file types)
        file_type, is_view_only = await self._get_google_drive_file_info(file_id)
        logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
        
        base, ext = os.path.splitext(save_path)
        if not ext and file_type:
            # Add the correct extension if missing
            save_path = f"{base}.{file_type}"
        
        # For view-only files, use specialized approaches
        if is_view_only:
            # Approach 1: For PDFs, use the JS method
            if file_type == 'pdf':
                success = await self._download_viewonly_pdf_with_js(file_id, save_path)
                if success:
                    return True
                    
            # Approach 2: For Google Docs, Sheets, etc., use export API
            if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
                success = await self._export_google_doc(file_id, file_type, save_path)
                if success:
                    return True
                    
            # Fallback to the main view-only method
            result_path = await self._force_download_viewonly({
                'url': url,
                'metadata': {'file_id': file_id, 'file_type': file_type, 'view_only': True}
            }, save_path)
            
            return bool(result_path)
        
        # Try standard approaches for non-view-only files
        try:
            # Try direct download link first (fastest)
            direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
            
            # Add anti-bot headers
            headers = {
                'User-Agent': get_random_user_agent(),
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9',
                'Referer': 'https://drive.google.com/',
                'DNT': '1'
            }
            
            # Try with streaming to handle larger files
            with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r:
                if r.status_code == 200:
                    # Check if we got HTML instead of the file
                    content_type = r.headers.get('Content-Type', '')
                    if 'text/html' in content_type and not file_id.endswith('.html'):
                        logger.warning("Received HTML instead of file, trying with session cookies")
                    else:
                        # Looks like we got the actual file
                        with open(save_path, 'wb') as f:
                            for chunk in r.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                        
                        # Verify file exists and has content
                        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
                            logger.info("Direct download successful")
                            return True
            
            # Try browser-based approach as last resort
            try:
                async with self.context.new_page() as page:
                    # Visit the file view page first to get cookies
                    await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
                    await page.wait_for_timeout(3000)
                    
                    # Set up download event listener
                    download_promise = page.wait_for_event("download")
                    
                    # Try to trigger the download button click
                    download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]')
                    if download_button:
                        await download_button.click()
                        
                        # Wait for download to start
                        try:
                            download = await download_promise
                            await download.save_as(save_path)
                            return os.path.exists(save_path) and os.path.getsize(save_path) > 0
                        except Exception as e:
                            logger.error(f"Error during browser download: {e}")
                            return False
                    else:
                        # Try the export download URL
                        await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000)
                        
                        # Look for and click any download buttons or links
                        download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")')
                        for elem in download_elements:
                            try:
                                await elem.click()
                                # Wait a bit to see if download starts
                                try:
                                    download = await download_promise
                                    await download.save_as(save_path)
                                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
                                except:
                                    pass
                            except:
                                continue
            except Exception as e:
                logger.error(f"Browser-based download attempt failed: {e}")
            
            logger.warning("All standard download methods failed")
            return False
        except Exception as e:
            logger.error(f"Error in Google Drive download: {e}")
            return False

    async def _download_viewonly_pdf_with_js(self, file_id, save_path):
        """Download view-only PDF using blob images and JS"""
        try:
            # Create a dedicated browser instance
            browser = await self._create_stealth_browser()
            page = await browser.new_page()
            
            try:
                # Navigate to the file with human-like behavior
                logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
                await page.wait_for_load_state('networkidle')
                
                # Perform human-like interactions
                await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300))
                await page.wait_for_timeout(random.randint(2000, 5000))
                
                # Estimate the number of pages
                estimated_pages = await page.evaluate("""
                    () => {
                        // Look for page counter in the interface
                        const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
                            const text = el.textContent || '';
                            return /\\d+\\s*\\/\\s*\\d+/.test(text);
                        });
                        
                        if (pageCounters.length > 0) {
                            const text = pageCounters[0].textContent || '';
                            const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
                            if (match && match[2]) return parseInt(match[2]);
                        }
                        
                        // If we can't find a counter, check actual pages
                        const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                        if (pages.length > 0) return pages.length;
                        
                        // Default to a reasonable number if we can't determine
                        return 50;
                    }
                """)
                
                logger.info(f"Estimated number of pages: {estimated_pages}")
                
                # Initial scroll to trigger loading
                logger.info("Initial scroll to bottom to trigger lazy loading...")
                await page.keyboard.press("End")
                await page.wait_for_timeout(3000)
                
                # Scroll through document with variety to appear natural
                await self._natural_scroll_through_document(page, estimated_pages)
                
                # Set up download event listener
                download_promise = page.wait_for_event("download")
                
                # Use jsPDF to generate PDF from loaded pages
                logger.info("Generating PDF from loaded pages...")
                result = await page.evaluate(r'''
                    (function() {
                        return new Promise((resolve, reject) => {
                            let script = document.createElement("script");
                            script.onload = function () {
                                try {
                                    let pdf = new jsPDF();
                                    let imgs = Array.from(document.getElementsByTagName("img"))
                                        .filter(img => img.src.startsWith('blob:') && img.width > 100)
                                        .sort((a, b) => {
                                            const rectA = a.getBoundingClientRect();
                                            const rectB = b.getBoundingClientRect();
                                            return rectA.top - rectB.top;
                                        });
                                    
                                    console.log(`Found ${imgs.length} valid page images to add to PDF`);
                                    
                                    let added = 0;
                                    for (let i = 0; i < imgs.length; i++) {
                                        let img = imgs[i];
                                        let canvas = document.createElement("canvas");
                                        let ctx = canvas.getContext("2d");
                                        canvas.width = img.width;
                                        canvas.height = img.height;
                                        ctx.drawImage(img, 0, 0, img.width, img.height);
                                        let imgData = canvas.toDataURL("image/jpeg", 1.0);
                                        
                                        if (added > 0) {
                                            pdf.addPage();
                                        }
                                        
                                        pdf.addImage(imgData, 'JPEG', 0, 0);
                                        added++;
                                    }
                                    
                                    pdf.save("download.pdf");
                                    resolve({success: true, pageCount: added});
                                } catch (error) {
                                    reject({success: false, error: error.toString()});
                                }
                            };
                            
                            script.onerror = function() {
                                reject({success: false, error: "Failed to load jsPDF library"});
                            };
                            
                            script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
                            document.body.appendChild(script);
                        });
                    })();
                ''')
                
                if not result.get('success'):
                    logger.error(f"Error in PDF generation: {result.get('error')}")
                    return False
                
                logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
                
                # Wait for the download to complete and save the file
                download = await download_promise
                
                # Save the downloaded file to the specified path
                await download.save_as(save_path)
                logger.info(f"Successfully saved PDF to {save_path}")
                
                return os.path.exists(save_path) and os.path.getsize(save_path) > 1000
                
            finally:
                await browser.close()
                
        except Exception as e:
            logger.error(f"Error in viewonly PDF download process: {e}")
            return False

    async def _natural_scroll_through_document(self, page, estimated_pages):
        """Scroll through document in a natural way to load all pages"""
        logger.info("Scrolling through document to load all pages...")
        max_attempts = min(estimated_pages * 3, 300)
        attempt = 0
        prev_blob_count = 0
        consecutive_same_count = 0
        
        while attempt < max_attempts:
            # Count blob images (which are the PDF pages)
            blob_count = await page.evaluate("""
                Array.from(document.getElementsByTagName('img'))
                    .filter(img => img.src.startsWith('blob:') && img.width > 100)
                    .length
            """)
            
            logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
            
            # Check if we've loaded all pages or if we're stuck
            if blob_count >= estimated_pages:
                logger.info(f"All {estimated_pages} pages appear to be loaded.")
                break
            
            if blob_count == prev_blob_count:
                consecutive_same_count += 1
                if consecutive_same_count >= 5 and blob_count > 0:
                    logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.")
                    break
            else:
                consecutive_same_count = 0
            
            # Mix up the scrolling approach for more human-like behavior
            scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"])
            
            if scroll_action == "PageDown":
                await page.keyboard.press("PageDown")
            elif scroll_action == "End":
                await page.keyboard.press("End")
            elif scroll_action == "ArrowDown":
                # Press arrow down multiple times
                for _ in range(random.randint(5, 15)):
                    await page.keyboard.press("ArrowDown")
                    await page.wait_for_timeout(random.randint(50, 150))
            else:  # mouse
                # Scroll using mouse wheel
                current_y = random.randint(300, 700)
                await page.mouse.move(x=random.randint(300, 800), y=current_y)
                await page.mouse.wheel(0, random.randint(300, 800))
            
            # Random wait between scrolls
            await page.wait_for_timeout(random.randint(1000, 3000))
            
            prev_blob_count = blob_count
            attempt += 1
        
        # Extra wait to ensure everything is fully loaded
        await page.wait_for_timeout(5000)

    async def _export_google_doc(self, file_id, file_type, save_path):
        """Export Google Docs/Sheets/Slides to downloadable formats"""
        try:
            # Map file types to export formats
            export_urls = {
                'doc': f"https://docs.google.com/document/d/{file_id}/export?format=doc",
                'docx': f"https://docs.google.com/document/d/{file_id}/export?format=docx",
                'sheet': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx",
                'xlsx': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx",
                'ppt': f"https://docs.google.com/presentation/d/{file_id}/export/pptx",
                'pptx': f"https://docs.google.com/presentation/d/{file_id}/export/pptx",
                'pdf': f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
            }
            
            export_url = export_urls.get(file_type, f"https://docs.google.com/document/d/{file_id}/export?format=pdf")
            
            async with self.context.new_page() as page:
                # Get cookies from the main view page first
                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
                
                # Now try the export
                response = await page.goto(export_url, wait_until='networkidle')
                
                if response.status == 200:
                    content = await response.body()
                    with open(save_path, 'wb') as f:
                        f.write(content)
                    return os.path.exists(save_path) and os.path.getsize(save_path) > 0
                else:
                    logger.warning(f"Export failed with status {response.status}")
                    return False
                    
        except Exception as e:
            logger.error(f"Error exporting Google Doc: {e}")
            return False

    async def _get_google_drive_file_info(self, file_id):
        """Get file type and view-only status from Google Drive"""
        file_type = None
        is_view_only = False
        
        try:
            async with self.context.new_page() as page:
                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
                
                # Check if view-only
                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
                is_view_only = view_only_text is not None
                
                # Check for Google Docs viewer
                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
                
                if gdocs_viewer:
                    file_type = 'docx'
                elif gsheets_viewer:
                    file_type = 'xlsx'
                elif gslides_viewer:
                    file_type = 'pptx'
                else:
                    # Check for PDF viewer
                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
                    if pdf_viewer:
                        file_type = 'pdf'
                    else:
                        # Check for image viewer
                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
                        if img_viewer:
                            # Get image type from src
                            img_src = await img_viewer.get_attribute('src')
                            if 'jpg' in img_src or 'jpeg' in img_src:
                                file_type = 'jpg'
                            elif 'png' in img_src:
                                file_type = 'png'
                            else:
                                file_type = 'jpg'  # Default to jpg
                        else:
                            # Generic file type fallback
                            file_type = 'pdf'  # Default to PDF
                
                # If still no type, check filename
                if not file_type:
                    title_element = await page.query_selector('div[role="heading"]')
                    if title_element:
                        title = await title_element.text_content()
                        if title:
                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
                            if ext_match:
                                file_type = ext_match.group(1).lower()
        
        except Exception as e:
            logger.error(f"Error getting Google Drive file info: {e}")
            file_type = 'pdf'  # Default to PDF if we can't determine
        
        return file_type, is_view_only

    # IMPROVED: Enhanced sublink extraction method
    async def get_sublinks(self, url, limit=10000):
        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
        links = set()
        try:
            logger.info(f"Fetching sublinks from: {url}")
            
            # Check if this is a direct download link
            if is_download_link(url):
                logger.info(f"URL appears to be a direct download link: {url}")
                links.add(url)
                return list(links)[:limit]
            
            # Skip if we've already visited this URL
            normalized_url = normalize_download_url(url)
            if normalized_url in self.visited_urls:
                logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}")
                return list(links)[:limit]
            
            # Add to visited URLs
            self.visited_urls.add(normalized_url)
            
            # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in 
                                                      ["exam", "test", "pastpaper", "eduexp"]):
                logger.info("Using specialized exam site sublink extraction")
                edu_links = await self.get_edu_exam_links(url)
                for link in edu_links:
                    links.add(link)
                
                # If we found a good number of links with the specialized method, return them
                if len(links) > 5:
                    logger.info(f"Found {len(links)} sublinks with specialized method")
                    return list(links)[:limit]
            
            # Rotate proxy if needed
            await self.rotate_proxy_if_needed()
            
            # Standard sublink extraction for all sites
            try:
                await self.page.goto(url, timeout=30000, wait_until='networkidle')
            except Exception as e:
                logger.warning(f"Error navigating to URL for sublink extraction: {e}")
                # Continue with what we have, we'll try to extract links anyway
                
            # Get base URL for resolving relative links
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
            path_base = os.path.dirname(parsed_base.path)
            
            # Perform initial scrolling to load lazy content
            await self.page.evaluate("""
                async () => {
                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                    const height = document.body.scrollHeight;
                    const step = Math.floor(window.innerHeight / 2);
                    
                    for (let i = 0; i < height; i += step) {
                        window.scrollTo(0, i);
                        await delay(150);
                    }
                    
                    window.scrollTo(0, 0);
                }
            """)
            await self.page.wait_for_timeout(1000)
            
            # Check if page has ASP.NET elements which might need special handling
            is_aspnet = await self.page.evaluate('''
                () => {
                    return document.querySelector('form#aspnetForm') !== null || 
                           document.querySelector('input[name="__VIEWSTATE"]') !== null;
                }
            ''')
            
            if is_aspnet:
                logger.info("Detected ASP.NET page, using enhanced extraction method")
                
                # Try to interact with ASP.NET controls that might reveal more links
                # Look for dropdowns, buttons, and grid elements
                dropdowns = await self.page.query_selector_all('select')
                buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
                
                # Try interacting with dropdowns first
                for dropdown in dropdowns:
                    try:
                        # Get all options
                        options = await self.page.evaluate('''
                            (dropdown) => {
                                return Array.from(dropdown.options).map(o => o.value);
                            }
                        ''', dropdown)
                        
                        # Try selecting each option
                        for option in options:
                            if option:
                                await dropdown.select_option(value=option)
                                await self.page.wait_for_timeout(1000)
                                await self.page.wait_for_load_state('networkidle', timeout=5000)
                                
                                # Extract any new links that appeared
                                await self.extract_all_link_types(links, base_url, path_base)
                    except Exception as e:
                        logger.warning(f"Error interacting with dropdown: {e}")
                
                # Try clicking buttons (but avoid dangerous ones like "delete")
                safe_buttons = []
                for button in buttons:
                    button_text = await button.text_content() or ""
                    button_value = await button.get_attribute("value") or ""
                    button_id = await button.get_attribute("id") or ""
                    combined_text = (button_text + button_value + button_id).lower()
                    
                    # Skip potentially destructive buttons
                    if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
                        continue
                    
                    # Prioritize buttons that might show more content
                    if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
                        safe_buttons.append(button)
                
                # Click the safe buttons
                for button in safe_buttons[:5]:  # Limit to first 5 to avoid too many clicks
                    try:
                        await button.click()
                        await self.page.wait_for_timeout(1000)
                        await self.page.wait_for_load_state('networkidle', timeout=5000)
                        
                        # Extract any new links that appeared
                        await self.extract_all_link_types(links, base_url, path_base)
                    except Exception as e:
                        logger.warning(f"Error clicking button: {e}")
            
            # Extract links from the initial page state
            await self.extract_all_link_types(links, base_url, path_base)
            
            # Look specifically for links inside grid/table views which are common in ASP.NET applications
            grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
            for cell in grid_cells:
                try:
                    href = await cell.get_attribute('href')
                    if href:
                        full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                        links.add(full_url)
                except Exception as e:
                    logger.warning(f"Error extracting grid link: {e}")
            
            # Extract links from onclick attributes and javascript:__doPostBack calls
            postback_links = await self.page.evaluate('''
                () => {
                    const results = [];
                    // Find elements with onclick containing __doPostBack
                    const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
                    for (const el of elements) {
                        // Extract the postback target
                        const onclick = el.getAttribute('onclick') || '';
                        const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
                        if (match && match[1]) {
                            // Get the visible text to use as description
                            const text = el.innerText || el.textContent || 'Link';
                            results.push({
                                id: match[1],
                                text: text.trim()
                            });
                        }
                    }
                    return results;
                }
            ''')
            
            # Try interacting with some of the postback links
            for postback in postback_links[:10]:  # Limit to first 10 to avoid too many interactions
                try:
                    logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
                    await self.page.evaluate(f'''
                        () => {{
                            if (typeof __doPostBack === 'function') {{
                                __doPostBack('{postback["id"]}', '');
                            }}
                        }}
                    ''')
                    await self.page.wait_for_timeout(1500)
                    await self.page.wait_for_load_state('networkidle', timeout=5000)
                    
                    # Extract any new links that appeared
                    await self.extract_all_link_types(links, base_url, path_base)
                except Exception as e:
                    logger.warning(f"Error with postback: {e}")
            
            # Look for pagination controls and try to navigate through them
            pagination_elements = await self.page.query_selector_all(
                'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]'
            )
            
            # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops)
            for i in range(min(5, len(pagination_elements))):
                try:
                    # Focus on elements that look like "next page" buttons
                    el = pagination_elements[i]
                    el_text = await el.text_content() or ""
                    
                    # Only click if this looks like a pagination control
                    if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip():
                        logger.info(f"Clicking pagination control: {el_text}")
                        await el.click()
                        await self.page.wait_for_timeout(2000)
                        await self.page.wait_for_load_state('networkidle', timeout=5000)
                        
                        # Get new links from this page
                        await self.extract_all_link_types(links, base_url, path_base)
                except Exception as e:
                    logger.warning(f"Error clicking pagination: {e}")
            
            # Check for hidden links that might be revealed by JavaScript
            hidden_links = await self.page.evaluate("""
                () => {
                    // Try to execute common JavaScript patterns that reveal hidden content
                    try {
                        // Common patterns used in websites to initially hide content
                        const hiddenContainers = document.querySelectorAll(
                            '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]'
                        );
                        
                        // Attempt to make them visible
                        hiddenContainers.forEach(el => {
                            el.style.display = 'block';
                            el.style.visibility = 'visible';
                            el.classList.remove('hidden', 'hide');
                        });
                        
                        // Return any newly visible links
                        return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
                    } catch (e) {
                        return [];
                    }
                }
            """)
            
            # Add any newly discovered links
            for href in hidden_links:
                if href and not href.startswith('javascript:'):
                    links.add(href)
            
            # Find all download links
            download_links = await self.page.evaluate("""
                () => {
                    return Array.from(document.querySelectorAll('a[href]'))
                        .filter(a => {
                            const href = a.href.toLowerCase();
                            return href.includes('download') || 
                                   href.includes('file') || 
                                   href.includes('get') ||
                                   href.includes('view.php') ||
                                   href.includes('action=') ||
                                   href.includes('fname=');
                        })
                        .map(a => a.href);
                }
            """)
            
            for download_link in download_links:
                links.add(download_link)
                
            # Also check for hidden links in JavaScript, iframes, or dynamic content
            js_links = await self.discover_hidden_links(self.page)
            for link in js_links:
                links.add(link)
                
            logger.info(f"Found {len(links)} sublinks")
            
            # Prioritize download links
            prioritized_links = []
            normal_links = []
            
            for link in links:
                if is_download_link(link):
                    prioritized_links.append(link)
                else:
                    normal_links.append(link)
                    
            # Return prioritized links first, then normal links, up to the limit
            result = prioritized_links + normal_links
            return result[:limit]
        
        except Exception as e:
            logger.error(f"Error getting sublinks from {url}: {e}")
            return list(links)[:limit]  # Return what we have so far

    async def extract_all_link_types(self, links_set, base_url, path_base):
        """Extract all types of links from the current page"""
        # Get all <a> tag links
        a_links = await self.page.query_selector_all('a[href]')
        for a in a_links:
            try:
                href = await a.get_attribute('href')
                if href and not href.startswith('javascript:') and not href.startswith('#'):
                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                    links_set.add(full_url)
            except Exception:
                pass
        
        # Get iframe sources
        iframes = await self.page.query_selector_all('iframe[src]')
        for iframe in iframes:
            try:
                src = await iframe.get_attribute('src')
                if src and not src.startswith('javascript:') and not src.startswith('about:'):
                    full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
                    links_set.add(full_url)
            except Exception:
                pass
        
        # Get links from onclick attributes that reference URLs
        onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
        for el in onclick_elements:
            try:
                onclick = await el.get_attribute('onclick')
                urls = re.findall(r'(https?://[^\'"]+)', onclick)
                for url in urls:
                    links_set.add(url)
            except Exception:
                pass
        
        # Look for URLs in data-* attributes
        data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
        for el in data_elements:
            for attr in ['data-url', 'data-href', 'data-src']:
                try:
                    value = await el.get_attribute(attr)
                    if value and not value.startswith('javascript:'):
                        full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
                        links_set.add(full_url)
                except Exception:
                    pass
        
        # Look for special anchor links that might not have href attributes
        special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
        for anchor in special_anchors:
            try:
                href = await anchor.get_attribute('href')
                if href and not href.startswith('javascript:') and not href.startswith('#'):
                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                    links_set.add(full_url)
            except Exception:
                pass
        
        # Extract links from JSON data embedded in the page
        script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]')
        for script in script_elements:
            try:
                script_content = await script.text_content()
                if script_content:
                    # Look for URLs in the JSON content
                    urls = re.findall(r'(https?://[^\'"]+)', script_content)
                    for url in urls:
                        links_set.add(url)
            except Exception:
                pass

    def resolve_relative_url(self, relative_url, base_url, path_base):
        """Properly resolve relative URLs considering multiple formats"""
        if relative_url.startswith('/'):
            # Absolute path relative to domain
            return f"{base_url}{relative_url}"
        elif relative_url.startswith('./'):
            # Explicit relative path
            return f"{base_url}{path_base}/{relative_url[2:]}"
        elif relative_url.startswith('../'):
            # Parent directory
            parent_path = '/'.join(path_base.split('/')[:-1])
            return f"{base_url}{parent_path}/{relative_url[3:]}"
        else:
            # Regular relative path
            return f"{base_url}{path_base}/{relative_url}"

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        """Perform a deep search for files at the URL and its sublinks"""
        import streamlit as st
        
        if not custom_ext_list:
            custom_ext_list = []
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
        
        try:
            # Reset the visited URLs for a fresh deep search
            self.visited_urls = set()
            
            progress_text.text("🔍 Analyzing main page...")
            # Special handling for ASP.NET pages
            is_aspnet = False
            try:
                await self.page.goto(url, timeout=30000, wait_until='networkidle')
                is_aspnet = await self.page.evaluate('''
                    () => {
                        return document.querySelector('form#aspnetForm') !== null || 
                               document.querySelector('input[name="__VIEWSTATE"]') !== null;
                    }
                ''')
            except Exception:
                pass
            
            # Check if this URL is a direct download
            if is_download_link(url):
                progress_text.text("📥 URL appears to be a direct download. Processing...")
                
                # Try to extract file directly
                normalized_url = normalize_download_url(url)
                file_info = {
                    'url': normalized_url,
                    'download_url': normalized_url,
                    'filename': os.path.basename(urlparse(normalized_url).path) or 'download',
                    'size': 'Unknown Size',
                    'metadata': {}
                }
                
                # Add to visited URLs
                self.visited_urls.add(normalized_url)
                progress_bar.progress(1.0)
                return [file_info]
            
            # Extract files from main page
            progress_text.text("📄 Extracting files from main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")
            
            # Get sublinks with enhanced method
            progress_text.text("🔗 Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")
            
            # Always include files from the main page, regardless of sublinks
            all_files = main_files
            
            if not sublinks:
                progress_bar.progress(1.0)
                return all_files
            
            # Process each sublink
            for i, sublink in enumerate(sublinks, 1):
                progress = i / total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)
                
                try:
                    # Check if this is a direct download link
                    if is_download_link(sublink):
                        # For download links, just add the link directly
                        normalized_url = normalize_download_url(sublink)
                        
                        # Skip if already visited
                        if normalized_url in self.visited_urls:
                            continue
                            
                        # Mark as visited
                        self.visited_urls.add(normalized_url)
                        
                        # Get file size if possible
                        size_str = await self.get_file_size(normalized_url)
                        
                        # Get filename, with fallback to domain-based name
                        filename = os.path.basename(urlparse(normalized_url).path)
                        if not filename or filename == '/' or '?' in filename:
                            domain = get_domain(normalized_url)
                            ext = '.pdf'  # Default extension
                            for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']:
                                if common_ext in normalized_url.lower():
                                    ext = common_ext
                                    break
                            filename = f"file_from_{domain}{ext}"
                        
                        # Add file to results
                        all_files.append({
                            'url': normalized_url,
                            'download_url': normalized_url,
                            'filename': filename,
                            'size': size_str,
                            'metadata': {}
                        })
                        file_count_text.text(f"Found {len(all_files)} total files")
                        continue
                        
                    # For regular links, use a longer timeout for ASP.NET pages which can be slower
                    sub_timeout = timeout * 2 if is_aspnet else timeout
                    
                    # Skip already visited URLs
                    if sublink in self.visited_urls:
                        continue
                    
                    # Extract files from sublink
                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                    all_files.extend(sub_files)
                    file_count_text.text(f"Found {len(all_files)} total files")
                except Exception as e:
                    logger.warning(f"Error processing sublink {sublink}: {e}")
            
            # Deduplicate files
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)
            
            final_count = len(unique_files)
            progress_text.text(f"✅ Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)
            return unique_files
        
        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"⚠️ Error during deep search: {str(e)}")
            return []
        
        finally:
            await asyncio.sleep(2)
            if not st.session_state.get('keep_progress', False):
                progress_text.empty()
                progress_bar.empty()