euler314 committed
Commit 942484e · verified · 1 Parent(s): 82c1030

Update app.py

Files changed (1)
  1. app.py +1451 -1
app.py CHANGED
@@ -1701,8 +1701,1458 @@ class DownloadManager:
1701
  await asyncio.sleep(2)
1702
  if not st.session_state.get('keep_progress', False):
1703
  progress_text.empty()
1704
- progress_bar.empty()
1705
1706
  # Utility Functions for New Features
1707
  def extract_keywords(text, n=5):
1708
  doc = nlp_model(text)
1701
  await asyncio.sleep(2)
1702
  if not st.session_state.get('keep_progress', False):
1703
  progress_text.empty()
1704
+ progress_bar.empty()class DownloadManager:
1705
+ def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
1706
+ self.use_proxy = use_proxy
1707
+ self.proxy = proxy
1708
+ self.query = query
1709
+ self.num_results = num_results
1710
+ self.playwright = None
1711
+ self.browser = None
1712
+ self.context = None
1713
+ self.page = None
1714
+
1715
+ async def __aenter__(self):
1716
+ self.playwright = await async_playwright().start()
1717
+ opts = {
1718
+ "headless": True,
1719
+ "args": [
1720
+ '--no-sandbox',
1721
+ '--disable-setuid-sandbox',
1722
+ '--disable-dev-shm-usage',
1723
+ '--disable-gpu',
1724
+ '--no-zygote',
1725
+ '--single-process'
1726
+ ]
1727
+ }
1728
+ if self.use_proxy and self.proxy:
1729
+ opts["proxy"] = {"server": self.proxy}
1730
+ self.browser = await self.playwright.chromium.launch(**opts)
1731
+ self.context = await self.browser.new_context(user_agent=get_random_user_agent())
1732
+ self.page = await self.context.new_page()
1733
+ await self.page.set_extra_http_headers({
1734
+ 'Accept-Language': 'en-US,en;q=0.9',
1735
+ 'Accept-Encoding': 'gzip, deflate, br',
1736
+ 'Referer': 'https://www.bing.com/'
1737
+ })
1738
+ return self
1739
+
1740
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
1741
+ if self.browser:
1742
+ await self.browser.close()
1743
+ if self.playwright:
1744
+ await self.playwright.stop()
1745
+
1746
+ async def search_bing(self):
1747
+ urls = []
1748
+ try:
1749
+ search_url = f"https://www.bing.com/search?q={self.query}"
1750
+ await self.page.goto(search_url, timeout=30000)
1751
+ await self.page.wait_for_load_state('networkidle')
1752
+ links = await self.page.query_selector_all("li.b_algo h2 a")
1753
+ for link in links[:self.num_results]:
1754
+ href = await link.get_attribute('href')
1755
+ if href:
1756
+ urls.append(href)
1757
+ return urls
1758
+ except Exception as e:
1759
+ logger.error(f"Error searching Bing: {e}")
1760
+ return []
1761
+
1762
+ async def get_file_size(self, url):
1763
+ try:
1764
+ async with self.context.new_page() as page:
1765
+ response = await page.request.head(url, timeout=15000)
1766
+ length = response.headers.get('Content-Length', None)
1767
+ if length:
1768
+ return sizeof_fmt(int(length))
1769
+ else:
1770
+ return "Unknown Size"
1771
+ except Exception:
1772
+ return "Unknown Size"
1773
+
1774
+ async def get_pdf_metadata(self, url):
1775
+ try:
1776
+ async with self.context.new_page() as page:
1777
+ resp = await page.request.get(url, timeout=15000)
1778
+ if resp.ok:
1779
+ content = await resp.body()
1780
+ pdf = BytesIO(content)
1781
+ reader = PdfReader(pdf)
1782
+ return {
1783
+ 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
1784
+ 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
1785
+ 'Pages': len(reader.pages),
1786
+ }
1787
+ else:
1788
+ return {}
1789
+ except Exception:
1790
+ return {}
1791
+
1792
+ async def extract_real_download_url(self, url):
1793
+ try:
1794
+ async with self.context.new_page() as page:
1795
+ response = await page.goto(url, wait_until='networkidle', timeout=30000)
1796
+ if response and response.headers.get('location'):
1797
+ return response.headers['location']
1798
+ return page.url
1799
+ except Exception as e:
1800
+ logger.error(f"Error extracting real download URL: {e}")
1801
+ return url
1802
+
1803
+ async def extract_downloadable_files(self, url, custom_ext_list):
1804
+ found_files = []
1805
+ try:
1806
+ response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
1807
+ if not response:
1808
+ return []
1809
+
1810
+ final_url = self.page.url
1811
+ if '.php' in final_url or 'download' in final_url:
1812
+ real_url = await self.extract_real_download_url(final_url)
1813
+ if real_url != final_url:
1814
+ found_files.append({
1815
+ 'url': real_url,
1816
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
1817
+ 'size': await self.get_file_size(real_url),
1818
+ 'metadata': {}
1819
+ })
1820
+ return found_files
1821
+
1822
+ await self.page.wait_for_load_state('networkidle', timeout=30000)
1823
+ content = await self.page.content()
1824
+ soup = BeautifulSoup(content, 'html.parser')
1825
+
1826
+ default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
1827
+ '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
1828
+ '.pptx', '.odt', '.txt']
1829
+ all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
1830
+
1831
+ parsed_base = urlparse(final_url)
1832
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1833
+ path_base = os.path.dirname(parsed_base.path)
1834
+
1835
+ # Process all anchor tags
1836
+ for a in soup.find_all('a', href=True):
1837
+ href = a['href'].strip()
1838
+
1839
+ if '.php' in href.lower() or 'download' in href.lower():
1840
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1841
+ real_url = await self.extract_real_download_url(full_url)
1842
+ if real_url and real_url != full_url:
1843
+ found_files.append({
1844
+ 'url': real_url,
1845
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
1846
+ 'size': await self.get_file_size(real_url),
1847
+ 'metadata': {}
1848
+ })
1849
+ continue
1850
+
1851
+ if any(href.lower().endswith(ext) for ext in all_exts):
1852
+ file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1853
+ size_str = await self.get_file_size(file_url)
1854
+ meta = {}
1855
+ if file_url.lower().endswith('.pdf'):
1856
+ meta = await self.get_pdf_metadata(file_url)
1857
+ found_files.append({
1858
+ 'url': file_url,
1859
+ 'filename': os.path.basename(file_url.split('?')[0]),
1860
+ 'size': size_str,
1861
+ 'metadata': meta
1862
+ })
1863
+
1864
+ # Handle Google Drive links
1865
+ elif ("drive.google.com" in href) or ("docs.google.com" in href):
1866
+ file_id = None
1867
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
1868
+ match = re.search(pattern, href)
1869
+ if match:
1870
+ file_id = match.group(1)
1871
+ break
1872
+ if file_id:
1873
+ # Get file info to determine type and view-only status
1874
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
1875
+
1876
+ # Create a more informative filename based on info
1877
+ filename = f"gdrive_{file_id}"
1878
+ if file_type:
1879
+ filename = f"{filename}.{file_type}"
1880
+
1881
+ size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
1882
+
1883
+ found_files.append({
1884
+ 'url': href, # Use original URL
1885
+ 'filename': filename,
1886
+ 'size': size_str,
1887
+ 'metadata': {
1888
+ 'view_only': is_view_only,
1889
+ 'file_type': file_type,
1890
+ 'file_id': file_id
1891
+ }
1892
+ })
1893
+
1894
+ # Also check for files in other elements (iframe, embed, object, etc.)
1895
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
1896
+ for elem in other_elements:
1897
+ src = elem.get('src') or elem.get('data')
1898
+ if src and any(src.lower().endswith(ext) for ext in all_exts):
1899
+ file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
1900
+ size_str = await self.get_file_size(file_url)
1901
+ meta = {}
1902
+ if file_url.lower().endswith('.pdf'):
1903
+ meta = await self.get_pdf_metadata(file_url)
1904
+ found_files.append({
1905
+ 'url': file_url,
1906
+ 'filename': os.path.basename(file_url.split('?')[0]),
1907
+ 'size': size_str,
1908
+ 'metadata': meta
1909
+ })
1910
+
1911
+ # Check for file links in onclick attributes
1912
+ onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
1913
+ for elem in onclick_elements:
1914
+ onclick = await elem.get_attribute('onclick')
1915
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
1916
+ for url_match in urls:
1917
+ if any(url_match.lower().endswith(ext) for ext in all_exts):
1918
+ size_str = await self.get_file_size(url_match)
1919
+ meta = {}
1920
+ if url_match.lower().endswith('.pdf'):
1921
+ meta = await self.get_pdf_metadata(url_match)
1922
+ found_files.append({
1923
+ 'url': url_match,
1924
+ 'filename': os.path.basename(url_match.split('?')[0]),
1925
+ 'size': size_str,
1926
+ 'metadata': meta
1927
+ })
1928
+
1929
+ seen_urls = set()
1930
+ unique_files = []
1931
+ for f in found_files:
1932
+ if f['url'] not in seen_urls:
1933
+ seen_urls.add(f['url'])
1934
+ unique_files.append(f)
1935
+ return unique_files
1936
+ except Exception as e:
1937
+ logger.error(f"Error extracting files from {url}: {e}")
1938
+ return []
1939
+
1940
+ async def download_file(self, file_info, save_dir, referer):
1941
+ file_url = file_info['url']
1942
+ fname = file_info['filename']
1943
+ path = os.path.join(save_dir, fname)
1944
+ base, ext = os.path.splitext(fname)
1945
+ counter = 1
1946
+ while os.path.exists(path):
1947
+ path = os.path.join(save_dir, f"{base}_{counter}{ext}")
1948
+ counter += 1
1949
+ os.makedirs(save_dir, exist_ok=True)
1950
+
1951
+ try:
1952
+ # Special handling for Google Drive files
1953
+ if "drive.google.com" in file_url or "docs.google.com" in file_url:
1954
+ # Check if it's marked as view-only in metadata
1955
+ is_view_only = file_info.get('metadata', {}).get('view_only', False)
1956
+
1957
+ # For view-only files, try our most robust approach first
1958
+ if is_view_only:
1959
+ logger.info(f"Attempting to download view-only file: {file_url}")
1960
+ result_path = await self.force_download_viewonly(file_info, path)
1961
+ if result_path:
1962
+ return result_path
1963
+
1964
+ # If that failed, try the regular download approach
1965
+ logger.info("Primary method failed, trying fallback methods")
1966
+
1967
+ # Try regular download methods
1968
+ success = await self.download_from_google_drive(file_url, path)
1969
+ if success:
1970
+ return path
1971
+
1972
+ # If all methods failed for Google Drive, try one last approach
1973
+ logger.warning("All standard methods failed, attempting force download")
1974
+ result_path = await self.force_download_viewonly(file_info, path)
1975
+ return result_path if result_path else None
1976
+
1977
+ # Original code for non-Google Drive downloads
1978
+ async with self.context.new_page() as page:
1979
+ headers = {
1980
+ 'Accept': '*/*',
1981
+ 'Accept-Encoding': 'gzip, deflate, br',
1982
+ 'Referer': referer
1983
+ }
1984
+ response = await page.request.get(file_url, headers=headers, timeout=30000)
1985
+ if response.status == 200:
1986
+ content = await response.body()
1987
+ with open(path, 'wb') as f:
1988
+ f.write(content)
1989
+ return path
1990
+ else:
1991
+ logger.error(f"Download failed with status {response.status}: {file_url}")
1992
+ return None
1993
+ except Exception as e:
1994
+ logger.error(f"Error downloading {file_url}: {e}")
1995
+ return None
1996
+
1997
+ async def force_download_viewonly(self, file_info, save_path):
1998
+ """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
1999
+ try:
2000
+ # Extract file ID
2001
+ file_id = file_info.get('metadata', {}).get('file_id')
2002
+ if not file_id:
2003
+ url = file_info['url']
2004
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
2005
+ match = re.search(pattern, url)
2006
+ if match:
2007
+ file_id = match.group(1)
2008
+ break
2009
+
2010
+ if not file_id:
2011
+ logger.error("Could not extract file ID")
2012
+ return None
2013
+
2014
+ file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
2015
+ base, ext = os.path.splitext(save_path)
2016
+ if not ext:
2017
+ save_path = f"{base}.{file_type}"
2018
+
2019
+ logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
2020
+
2021
+ # Create a dedicated browser instance with better resolution
2022
+ browser = await self.playwright.chromium.launch(
2023
+ headless=True,
2024
+ args=[
2025
+ '--no-sandbox',
2026
+ '--disable-setuid-sandbox',
2027
+ '--disable-dev-shm-usage',
2028
+ '--disable-web-security',
2029
+ '--disable-features=IsolateOrigins,site-per-process',
2030
+ '--disable-site-isolation-trials'
2031
+ ]
2032
+ )
2033
+
2034
+ # Use higher resolution for better quality
2035
+ context = await browser.new_context(
2036
+ viewport={'width': 1600, 'height': 1200},
2037
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
2038
+ device_scale_factor=2.0
2039
+ )
2040
+
2041
+ page = await context.new_page()
2042
+
2043
+ try:
2044
+ # Go to the file view page
2045
+ logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
2046
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
2047
+ await page.wait_for_load_state('networkidle')
2048
+ await page.wait_for_timeout(5000) # Wait longer for everything to load
2049
+
2050
+ # Create temp directory
2051
+ temp_dir = tempfile.mkdtemp()
2052
+
2053
+ # Special handling for PDFs
2054
+ if file_type.lower() == 'pdf':
2055
+ # Check if there's a pagination control
2056
+ pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
2057
+
2058
+ # Try multiple methods to extract total pages
2059
+ total_pages = await page.evaluate("""
2060
+ () => {
2061
+ // Method 1: Check page counter text
2062
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
2063
+ const text = el.textContent || '';
2064
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
2065
+ });
2066
+
2067
+ if (pageCounters.length > 0) {
2068
+ const text = pageCounters[0].textContent || '';
2069
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
2070
+ if (match && match[2]) return parseInt(match[2]);
2071
+ }
2072
+
2073
+ // Method 2: Check actual page elements
2074
+ const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
2075
+ if (pageElements.length > 0) return pageElements.length;
2076
+
2077
+ // Method 3: Look for page thumbnails
2078
+ const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
2079
+ if (thumbnails.length > 0) return thumbnails.length;
2080
+
2081
+ // Fallback: conservative guess based on UI
2082
+ return 50; // Safe default when we can't determine
2083
+ }
2084
+ """)
2085
+
2086
+ logger.info(f"Detected {total_pages} pages in PDF")
2087
+
2088
+ if total_pages <= 1:
2089
+ # Additional check - sometimes the page count detection fails
2090
+ # Let's double-check by looking for next/previous buttons
2091
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2092
+ if next_button:
2093
+ disabled = await next_button.get_attribute('disabled')
2094
+ if not disabled:
2095
+ logger.info("Found next button that's not disabled, document has multiple pages")
2096
+ total_pages = 100 # Set a high number, we'll stop when we can't go further
2097
+
2098
+ # If we still think it's a single page, use a more direct approach
2099
+ if total_pages <= 1:
2100
+ # Single page approach
2101
+ logger.info("Using single-page capture approach")
2102
+
2103
+ # Take a screenshot of the current view (should be the full document or first page)
2104
+ screenshot_path = os.path.join(temp_dir, "page.png")
2105
+
2106
+ # Try to screenshot just the document area if we can find it
2107
+ document_area = await page.query_selector('.drive-viewer-paginated-page')
2108
+ if document_area:
2109
+ await document_area.screenshot(path=screenshot_path)
2110
+ else:
2111
+ # Otherwise take a full screenshot
2112
+ await page.screenshot(path=screenshot_path)
2113
+
2114
+ # Convert to PDF
2115
+ from PIL import Image
2116
+ from reportlab.pdfgen import canvas as pdf_canvas
2117
+
2118
+ img = Image.open(screenshot_path)
2119
+ width, height = img.size
2120
+ c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
2121
+ c.drawImage(screenshot_path, 0, 0, width, height)
2122
+ c.save()
2123
+
2124
+ os.remove(screenshot_path)
2125
+ os.rmdir(temp_dir)
2126
+
2127
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
2128
+ return save_path
2129
+ return None
2130
+
2131
+ # Multi-page approach
2132
+ logger.info(f"Using multi-page capture approach for {total_pages} pages")
2133
+
2134
+ # CRITICAL: We need to go to the first page first
2135
+ # Check if we need to reset to first page
2136
+ current_page_text = await page.evaluate("""
2137
+ () => {
2138
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
2139
+ const text = el.textContent || '';
2140
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
2141
+ });
2142
+
2143
+ if (pageCounters.length > 0) {
2144
+ return pageCounters[0].textContent || '';
2145
+ }
2146
+ return '';
2147
+ }
2148
+ """)
2149
+
2150
+ current_page = 1
2151
+ if current_page_text:
2152
+ match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
2153
+ if match:
2154
+ current_page = int(match.group(1))
2155
+
2156
+ # If we're not on page 1, go back to first page
2157
+ if current_page > 1:
2158
+ logger.info(f"Currently on page {current_page}, navigating back to page 1")
2159
+
2160
+ # Look for an input field where we can directly set the page number
2161
+ page_input = await page.query_selector('input[aria-label="Page"]')
2162
+ if page_input:
2163
+ await page_input.fill("1")
2164
+ await page_input.press("Enter")
2165
+ await page.wait_for_timeout(1000)
2166
+ else:
2167
+ # Use prev button to go back to first page
2168
+ prev_button = await page.query_selector('button[aria-label="Previous page"]')
2169
+ if prev_button:
2170
+ # Keep clicking until we can't anymore
2171
+ for _ in range(current_page - 1):
2172
+ try:
2173
+ await prev_button.click()
2174
+ await page.wait_for_timeout(500)
2175
+ except Exception as e:
2176
+ logger.warning(f"Error clicking prev button: {e}")
2177
+ break
2178
+
2179
+ # Capture each page
2180
+ screenshots = []
2181
+ page_num = 1
2182
+ max_tries = min(total_pages + 10, 200) # Set a reasonable limit
2183
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2184
+
2185
+ # Maximize the PDF view if possible
2186
+ await page.evaluate("""
2187
+ () => {
2188
+ // Try to find and click any "full page" or "maximize" buttons
2189
+ const fullViewButtons = Array.from(document.querySelectorAll('button'))
2190
+ .filter(b => b.textContent?.includes('Full') ||
2191
+ b.getAttribute('aria-label')?.includes('Full') ||
2192
+ b.getAttribute('aria-label')?.includes('fit page'));
2193
+ if (fullViewButtons.length > 0) {
2194
+ fullViewButtons[0].click();
2195
+ }
2196
+ }
2197
+ """)
2198
+
2199
+ await page.wait_for_timeout(1000) # Wait for view to adjust
2200
+
2201
+ while page_num <= max_tries:
2202
+ # Wait for the page to be fully loaded
2203
+ await page.wait_for_timeout(800)
2204
+
2205
+ # Take a screenshot of the current page
2206
+ screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
2207
+
2208
+ # Try different methods to identify and capture just the page content
2209
+ page_content = await page.query_selector('.drive-viewer-paginated-page')
2210
+ if page_content:
2211
+ # Found the specific page element
2212
+ await page_content.screenshot(path=screenshot_path)
2213
+ else:
2214
+ # Fall back to screenshot of visible viewport
2215
+ await page.screenshot(path=screenshot_path)
2216
+
2217
+ screenshots.append(screenshot_path)
2218
+ logger.info(f"Captured page {page_num}")
2219
+
2220
+ # Check if we have a disabled next button (reached the end)
2221
+ if next_button:
2222
+ is_disabled = await next_button.get_attribute('disabled')
2223
+ if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
2224
+ logger.info(f"Reached end of document after {page_num} pages")
2225
+ break
2226
+
2227
+ # Click the next button
2228
+ try:
2229
+ await next_button.click()
2230
+ await page.wait_for_timeout(800) # Wait for page transition
2231
+ page_num += 1
2232
+ except Exception as e:
2233
+ logger.error(f"Error clicking next button: {e}")
2234
+ # Try to get a fresh reference to the button
2235
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2236
+ if not next_button:
2237
+ logger.warning("Next button disappeared, assuming end of document")
2238
+ break
2239
+ else:
2240
+ # Try to find the next button again
2241
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2242
+ if not next_button:
2243
+ logger.warning("Could not find next button, stopping navigation")
2244
+ break
2245
+
2246
+ # Double-check if we've reached the expected total
2247
+ if page_num >= total_pages:
2248
+ logger.info(f"Reached expected total of {total_pages} pages")
2249
+ break
2250
+
2251
+ # Combine screenshots into PDF
2252
+ logger.info(f"Creating PDF from {len(screenshots)} captured pages")
2253
+
2254
+ from PIL import Image
2255
+ from reportlab.lib.pagesizes import letter
2256
+ from reportlab.pdfgen import canvas as pdf_canvas
2257
+
2258
+ # Use the size of the first screenshot to set PDF dimensions
2259
+ if screenshots:
2260
+ try:
2261
+ img = Image.open(screenshots[0])
2262
+ width, height = img.size
2263
+
2264
+ c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
2265
+
2266
+ for screenshot in screenshots:
2267
+ try:
2268
+ if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
2269
+ img = Image.open(screenshot)
2270
+ c.drawImage(screenshot, 0, 0, width, height)
2271
+ c.showPage()
2272
+ except Exception as e:
2273
+ logger.error(f"Error adding page to PDF: {e}")
2274
+
2275
+ c.save()
2276
+
2277
+ # Clean up screenshots
2278
+ for screenshot in screenshots:
2279
+ if os.path.exists(screenshot):
2280
+ os.remove(screenshot)
2281
+
2282
+ logger.info(f"Successfully created PDF with {len(screenshots)} pages")
2283
+ except Exception as e:
2284
+ logger.error(f"Error creating PDF: {e}")
2285
+ else:
2286
+ logger.error("No screenshots captured to create PDF")
2287
+ else:
2288
+ # Non-PDF file handling
2289
+ screenshot_path = os.path.join(temp_dir, "file.png")
2290
+ await page.screenshot(path=screenshot_path)
2291
+
2292
+ if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
2293
+ # For document types, try to export directly
2294
+ await self.export_google_doc(file_id, file_type, save_path)
2295
+ else:
2296
+ # For other types, save the screenshot with appropriate extension
2297
+ shutil.copy(screenshot_path, save_path)
2298
+
2299
+ os.remove(screenshot_path)
2300
+
2301
+ # Clean up temp directory
2302
+ try:
2303
+ os.rmdir(temp_dir)
2304
+ except:
2305
+ pass
2306
+
2307
+ # Close browser
2308
+ await browser.close()
2309
+
2310
+ # Verify file exists and has content
2311
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
2312
+ logger.info(f"Successfully downloaded file to {save_path}")
2313
+ return save_path
2314
+ else:
2315
+ logger.error(f"Generated file is too small or missing: {save_path}")
2316
+ return None
2317
+
2318
+ except Exception as e:
2319
+ logger.error(f"Error during force download: {e}")
2320
+ if browser:
2321
+ await browser.close()
2322
+ return None
2323
+
2324
+ except Exception as e:
2325
+ logger.error(f"Force download preparation failed: {e}")
2326
+ return None
2327
+
2328
+ async def download_from_google_drive(self, url, save_path):
2329
+ """Enhanced method to download from Google Drive with multiple fallback approaches"""
2330
+ # Extract the file ID from different URL formats
2331
+ file_id = None
2332
+ url_patterns = [
2333
+ r'drive\.google\.com/file/d/([^/]+)',
2334
+ r'drive\.google\.com/open\?id=([^&]+)',
2335
+ r'docs\.google\.com/\w+/d/([^/]+)',
2336
+ r'id=([^&]+)',
2337
+ r'drive\.google\.com/uc\?id=([^&]+)',
2338
+ ]
2339
+
2340
+ for pattern in url_patterns:
2341
+ match = re.search(pattern, url)
2342
+ if match:
2343
+ file_id = match.group(1)
2344
+ break
2345
+
2346
+ if not file_id:
2347
+ logger.error(f"Could not extract file ID from URL: {url}")
2348
+ return False
2349
+
2350
+ # Determine file type first (important for handling different file types)
2351
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
2352
+ logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
2353
+
2354
+ base, ext = os.path.splitext(save_path)
2355
+ if not ext and file_type:
2356
+ # Add the correct extension if missing
2357
+ save_path = f"{base}.{file_type}"
2358
+
2359
+ # For view-only files, use specialized approaches
2360
+ if is_view_only:
2361
+ # Approach 1: For PDFs, use the JS method
2362
+ if file_type == 'pdf':
2363
+ success = await self.download_viewonly_pdf_with_js(file_id, save_path)
2364
+ if success:
2365
+ return True
2366
+
2367
+ # Approach 2: For Google Docs, Sheets, etc., use export API
2368
+ if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
2369
+ success = await self.export_google_doc(file_id, file_type, save_path)
2370
+ if success:
2371
+ return True
2372
+
2373
+ # Approach 3: Try the direct screenshot method for any view-only file
2374
+ success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
2375
+ if success:
2376
+ return True
2377
+
2378
+ # Try standard approaches for non-view-only files
2379
+ try:
2380
+ # Try with gdown first
2381
+ import gdown
2382
+ output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
2383
+ if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
2384
+ with open(save_path, 'rb') as f:
2385
+ content = f.read(100) # Read first 100 bytes
2386
+ if b'<!DOCTYPE html>' not in content: # Check not HTML error page
2387
+ logger.info(f"Successfully downloaded with gdown: {url}")
2388
+ return True
2389
+ except Exception as e:
2390
+ logger.warning(f"gdown download failed: {e}")
2391
+
2392
+ # Try with requests and session cookies
2393
+ try:
2394
+ session = requests.Session()
2395
+ session.headers.update({'User-Agent': get_random_user_agent()})
2396
+
2397
+ # Visit the page first to get cookies
2398
+ session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
2399
+
2400
+ # Try download
2401
+ url = f"https://drive.google.com/uc?id={file_id}&export=download"
2402
+ response = session.get(url, stream=True, timeout=30)
2403
+
2404
+ # Check for confirmation token
2405
+ confirmation_token = None
2406
+ for k, v in response.cookies.items():
2407
+ if k.startswith('download_warning'):
2408
+ confirmation_token = v
2409
+ break
2410
+
2411
+ # Use confirmation token if found
2412
+ if confirmation_token:
2413
+ url = f"{url}&confirm={confirmation_token}"
2414
+ response = session.get(url, stream=True, timeout=60)
2415
+
2416
+ # Check if we're getting HTML instead of the file
2417
+ content_type = response.headers.get('Content-Type', '')
2418
+ if 'text/html' in content_type:
2419
+ logger.warning("Received HTML instead of file - likely download restriction")
2420
+ else:
2421
+ with open(save_path, 'wb') as f:
2422
+ for chunk in response.iter_content(chunk_size=1024*1024):
2423
+ if chunk:
2424
+ f.write(chunk)
2425
+
2426
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
2427
+ with open(save_path, 'rb') as f:
2428
+ content = f.read(100)
2429
+ if b'<!DOCTYPE html>' not in content:
2430
+ logger.info("Successfully downloaded with requests session")
2431
+ return True
2432
+ except Exception as e:
2433
+ logger.warning(f"Requests session download failed: {e}")
2434
+
2435
+ logger.warning("Standard download methods failed")
2436
+ return False
2437
+
2438
+ async def download_viewonly_pdf_with_js(self, file_id, save_path):
2439
+ """Download view-only PDF using the enhanced blob image caching technique"""
2440
+ try:
2441
+ # Create a dedicated browser instance
2442
+ browser = await self.playwright.chromium.launch(
2443
+ headless=True,
2444
+ args=[
2445
+ '--no-sandbox',
2446
+ '--disable-setuid-sandbox',
2447
+ '--disable-dev-shm-usage',
2448
+ '--disable-web-security'
2449
+ ]
2450
+ )
2451
+
2452
+ context = await browser.new_context(
2453
+ viewport={'width': 1600, 'height': 1200},
2454
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
2455
+ accept_downloads=True # Critical for handling the download event
2456
+ )
2457
+
2458
+ page = await context.new_page()
2459
+
2460
+ try:
2461
+ # Step 1: Navigate to the file
2462
+ logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
2463
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
2464
+ await page.wait_for_load_state('networkidle')
2465
+ await page.wait_for_timeout(5000) # Initial wait for content to load
2466
+
2467
+ # Step 2: Estimate the number of pages
2468
+ estimated_pages = await page.evaluate("""
2469
+ () => {
2470
+ // Look for page counter in the interface
2471
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
2472
+ const text = el.textContent || '';
2473
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
2474
+ });
2475
+
2476
+ if (pageCounters.length > 0) {
2477
+ const text = pageCounters[0].textContent || '';
2478
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
2479
+ if (match && match[2]) return parseInt(match[2]);
2480
+ }
2481
+
2482
+ // If we can't find a counter, check actual pages
2483
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2484
+ if (pages.length > 0) return pages.length;
2485
+
2486
+ // Default to a reasonable number if we can't determine
2487
+ return 50;
2488
+ }
2489
+ """)
2490
+
2491
+ logger.info(f"Estimated number of pages: {estimated_pages}")
2492
+
2493
+ # Step 3: Initial scroll to trigger loading
2494
+ logger.info("Initial scroll to bottom to trigger lazy loading...")
2495
+ await page.keyboard.press("End")
2496
+ await page.wait_for_timeout(3000)
2497
+
2498
+ # Step 4: Wait for all pages to load by pressing PageDown and checking blob images
2499
+ logger.info("Waiting for all pages to load...")
2500
+ max_attempts = min(estimated_pages * 3, 300) # Adjust based on document size
2501
+ attempt = 0
2502
+
2503
+ while attempt < max_attempts:
2504
+ # Count blob images (which are the PDF pages)
2505
+ blob_count = await page.evaluate("""
2506
+ Array.from(document.getElementsByTagName('img'))
2507
+ .filter(img => img.src.startsWith('blob:') && img.width > 100)
2508
+ .length
2509
+ """)
2510
+
2511
+ logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
2512
+
2513
+ # If we've loaded enough pages or reached estimated count
2514
+ if blob_count >= estimated_pages:
2515
+ logger.info("All pages appear to be loaded.")
2516
+ break
2517
+
2518
+ # Press PageDown to scroll further and trigger more loading
2519
+ await page.keyboard.press("PageDown")
2520
+ await page.wait_for_timeout(2000) # Wait for content to load
2521
+ attempt += 1
2522
+
2523
+ # Extra wait to ensure everything is fully loaded
2524
+ await page.wait_for_timeout(5000)
2525
+
2526
+ # Step 5: Set up a download event listener
2527
+ download_promise = page.wait_for_event("download")
2528
+
2529
+ # Step 6: Inject the jsPDF script to generate PDF
2530
+ logger.info("Generating PDF from loaded pages...")
2531
+ result = await page.evaluate(r'''
2532
+ (function() {
2533
+ return new Promise((resolve, reject) => {
2534
+ let script = document.createElement("script");
2535
+ script.onload = function () {
2536
+ try {
2537
+ let pdf = new jsPDF();
2538
+ let imgs = document.getElementsByTagName("img");
2539
+ let added = 0;
2540
+
2541
+ // First collect and sort all valid blob images
2542
+ let validImages = [];
2543
+ for (let i = 0; i < imgs.length; i++) {
2544
+ let img = imgs[i];
2545
+ if (!/^blob:/.test(img.src)) continue;
2546
+ if (img.width < 100 || img.height < 100) continue;
2547
+ validImages.push(img);
2548
+ }
2549
+
2550
+ // Sort by vertical position
2551
+ validImages.sort((a, b) => {
2552
+ const rectA = a.getBoundingClientRect();
2553
+ const rectB = b.getBoundingClientRect();
2554
+ return rectA.top - rectB.top;
2555
+ });
2556
+
2557
+ console.log(`Found ${validImages.length} valid page images to add to PDF`);
2558
+
2559
+ // Process each image as a page
2560
+ for (let i = 0; i < validImages.length; i++) {
2561
+ let img = validImages[i];
2562
+ let canvas = document.createElement("canvas");
2563
+ let ctx = canvas.getContext("2d");
2564
+ canvas.width = img.width;
2565
+ canvas.height = img.height;
2566
+ ctx.drawImage(img, 0, 0, img.width, img.height);
2567
+ let imgData = canvas.toDataURL("image/jpeg", 1.0);
2568
+
2569
+ if (added > 0) {
2570
+ pdf.addPage();
2571
+ }
2572
+
2573
+ pdf.addImage(imgData, 'JPEG', 0, 0);
2574
+ added++;
2575
+ }
2576
+
2577
+ pdf.save("download.pdf");
2578
+ resolve({success: true, pageCount: added});
2579
+ } catch (error) {
2580
+ reject({success: false, error: error.toString()});
2581
+ }
2582
+ };
2583
+
2584
+ script.onerror = function() {
2585
+ reject({success: false, error: "Failed to load jsPDF library"});
2586
+ };
2587
+
2588
+ // Use a reliable CDN
2589
+ script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
2590
+ document.body.appendChild(script);
2591
+ });
2592
+ })();
2593
+ ''')
2594
+
2595
+ if not result.get('success'):
2596
+ logger.error(f"Error in PDF generation: {result.get('error')}")
2597
+ return False
2598
+
2599
+ logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
2600
+
2601
+ # Step 7: Wait for the download to complete and save the file
2602
+ download = await download_promise
2603
+
2604
+ # Step 8: Save the downloaded file to the specified path
2605
+ await download.save_as(save_path)
2606
+ logger.info(f"Successfully saved PDF to {save_path}")
2607
+
2608
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 1000
2609
+
2610
+ finally:
2611
+ await browser.close()
2612
+
2613
+ except Exception as e:
2614
+ logger.error(f"Error in viewonly PDF download process: {e}")
2615
+ return False
2616
+
2617
+ async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
2618
+ """Download any view-only file by taking screenshots"""
2619
+ try:
2620
+ async with self.context.new_page() as page:
2621
+ # Set high-resolution viewport
2622
+ await page.set_viewport_size({"width": 1600, "height": 1200})
2623
+
2624
+ # Navigate to the file
2625
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
2626
+
2627
+ # Make sure the file is loaded
2628
+ await page.wait_for_load_state('networkidle')
2629
+ await page.wait_for_timeout(3000) # Extra time for rendering
2630
+
2631
+ # Create directory for screenshots if multiple pages
2632
+ base_dir = os.path.dirname(save_path)
2633
+ base_name = os.path.splitext(os.path.basename(save_path))[0]
2634
+ screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
2635
+ os.makedirs(screenshots_dir, exist_ok=True)
2636
+
2637
+ # Check if it's a multi-page document
2638
+ is_multi_page = await page.evaluate("""
2639
+ () => {
2640
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2641
+ return pages.length > 1;
2642
+ }
2643
+ """)
2644
+
2645
+ if is_multi_page and file_type == 'pdf':
2646
+ # For multi-page PDFs, take screenshots of each page
2647
+ page_count = await page.evaluate("""
2648
+ async () => {
2649
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
2650
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2651
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
2652
+
2653
+ if (!container || pages.length === 0) return 0;
2654
+
2655
+ // Scroll through to make sure all pages are loaded
2656
+ const scrollHeight = container.scrollHeight;
2657
+ const viewportHeight = container.clientHeight;
2658
+ const scrollStep = viewportHeight;
2659
+
2660
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
2661
+ container.scrollTo(0, scrollPos);
2662
+ await delay(300);
2663
+ }
2664
+
2665
+ // Scroll back to top
2666
+ container.scrollTo(0, 0);
2667
+ await delay(300);
2668
+
2669
+ return pages.length;
2670
+ }
2671
+ """)
2672
+
2673
+ logger.info(f"Found {page_count} pages in document")
2674
+
2675
+ # Take screenshots of each page
2676
+ screenshots = []
2677
+ for i in range(page_count):
2678
+ # Scroll to page
2679
+ await page.evaluate(f"""
2680
+ async () => {{
2681
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
2682
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2683
+ if (pages.length <= {i}) return false;
2684
+
2685
+ pages[{i}].scrollIntoView();
2686
+ await delay(500);
2687
+ return true;
2688
+ }}
2689
+ """)
2690
+
2691
+ # Take screenshot
2692
+ screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
2693
+ await page.screenshot(path=screenshot_path, clip={
2694
+ 'x': 0,
2695
+ 'y': 0,
2696
+ 'width': 1600,
2697
+ 'height': 1200
2698
+ })
2699
+ screenshots.append(screenshot_path)
2700
+
2701
+ # Combine screenshots into PDF
2702
+ from PIL import Image
2703
+ from reportlab.pdfgen import canvas
2704
+
2705
+ c = canvas.Canvas(save_path)
2706
+ for screenshot in screenshots:
2707
+ img = Image.open(screenshot)
2708
+ width, height = img.size
2709
+
2710
+ # Add page to PDF
2711
+ c.setPageSize((width, height))
2712
+ c.drawImage(screenshot, 0, 0, width, height)
2713
+ c.showPage()
2714
+
2715
+ c.save()
2716
+
2717
+ # Clean up screenshots
2718
+ for screenshot in screenshots:
2719
+ os.remove(screenshot)
2720
+ os.rmdir(screenshots_dir)
2721
+
2722
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
2723
+ else:
2724
+ # For single-page or non-PDF files, just take one screenshot
2725
+ screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
2726
+ await page.screenshot(path=screenshot_path, fullPage=True)
2727
+
2728
+ # Convert to requested format if needed
2729
+ if file_type == 'pdf':
2730
+ from PIL import Image
2731
+ from reportlab.pdfgen import canvas
2732
+
2733
+ # Create PDF from screenshot
2734
+ img = Image.open(screenshot_path)
2735
+ width, height = img.size
2736
+
2737
+ c = canvas.Canvas(save_path, pagesize=(width, height))
2738
+ c.drawImage(screenshot_path, 0, 0, width, height)
2739
+ c.save()
2740
+ else:
2741
+ # Just copy the screenshot to the destination with proper extension
2742
+ shutil.copy(screenshot_path, save_path)
2743
+
2744
+ # Clean up
2745
+ os.remove(screenshot_path)
2746
+ os.rmdir(screenshots_dir)
2747
+
2748
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
2749
+
2750
+ except Exception as e:
2751
+ logger.error(f"Error taking screenshots: {e}")
2752
+ return False
2753
+
2754
+ async def export_google_doc(self, file_id, file_type, save_path):
2755
+ """Export Google Docs/Sheets/Slides to downloadable formats"""
2756
+ try:
2757
+ # Map file types to export formats
2758
+ export_formats = {
2759
+ 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
2760
+ 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
2761
+ 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
2762
+ 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
2763
+ 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
2764
+ 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
2765
+ 'pdf': 'application/pdf',
2766
+ }
2767
+
2768
+ export_format = export_formats.get(file_type, 'application/pdf')
2769
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
2770
+
2771
+ if 'sheet' in file_type or 'xlsx' in file_type:
2772
+ export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
2773
+ elif 'ppt' in file_type or 'presentation' in file_type:
2774
+ export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
2775
+ elif file_type == 'pdf':
2776
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
2777
+
2778
+ async with self.context.new_page() as page:
2779
+ # Get cookies from the main view page first
2780
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
2781
+
2782
+ # Now try the export
2783
+ response = await page.goto(export_url, wait_until='networkidle')
2784
+
2785
+ if response.status == 200:
2786
+ content = await response.body()
2787
+ with open(save_path, 'wb') as f:
2788
+ f.write(content)
2789
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
2790
+ else:
2791
+ logger.warning(f"Export failed with status {response.status}")
2792
+ return False
2793
+
2794
+ except Exception as e:
2795
+ logger.error(f"Error exporting Google Doc: {e}")
2796
+ return False
2797
+
2798
+ async def get_google_drive_file_info(self, file_id):
2799
+ """Get file type and view-only status from Google Drive"""
2800
+ file_type = None
2801
+ is_view_only = False
2802
+
2803
+ try:
2804
+ async with self.context.new_page() as page:
2805
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
2806
+
2807
+ # Check if view-only
2808
+ view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
2809
+ is_view_only = view_only_text is not None
2810
+
2811
+ # Check for Google Docs viewer
2812
+ gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
2813
+ gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
2814
+ gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
2815
+
2816
+ if gdocs_viewer:
2817
+ file_type = 'docx'
2818
+ elif gsheets_viewer:
2819
+ file_type = 'xlsx'
2820
+ elif gslides_viewer:
2821
+ file_type = 'pptx'
2822
+ else:
2823
+ # Check for PDF viewer
2824
+ pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
2825
+ if pdf_viewer:
2826
+ file_type = 'pdf'
2827
+ else:
2828
+ # Check for image viewer
2829
+ img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
2830
+ if img_viewer:
2831
+ # Get image type from src
2832
+ img_src = await img_viewer.get_attribute('src')
2833
+ if 'jpg' in img_src or 'jpeg' in img_src:
2834
+ file_type = 'jpg'
2835
+ elif 'png' in img_src:
2836
+ file_type = 'png'
2837
+ else:
2838
+ file_type = 'jpg' # Default to jpg
2839
+ else:
2840
+ # Generic file type fallback
2841
+ file_type = 'pdf' # Default to PDF
2842
+
2843
+ # If still no type, check filename
2844
+ if not file_type:
2845
+ title_element = await page.query_selector('div[role="heading"]')
2846
+ if title_element:
2847
+ title = await title_element.text_content()
2848
+ if title:
2849
+ ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
2850
+ if ext_match:
2851
+ file_type = ext_match.group(1).lower()
2852
+
2853
+ except Exception as e:
2854
+ logger.error(f"Error getting Google Drive file info: {e}")
2855
+ file_type = 'pdf' # Default to PDF if we can't determine
2856
+
2857
+ return file_type, is_view_only
2858
+
2859
+ async def get_sublinks(self, url, limit=10000):
2860
+ """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
2861
+ links = set()
2862
+ try:
2863
+ logger.info(f"Fetching sublinks from: {url}")
2864
+
2865
+ # Go to page and wait for full load
2866
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
2867
+
2868
+ # Get base URL for resolving relative links
2869
+ parsed_base = urlparse(url)
2870
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
2871
+ path_base = os.path.dirname(parsed_base.path)
2872
+
2873
+ # Check if page has ASP.NET elements which might need special handling
2874
+ is_aspnet = await self.page.evaluate('''
2875
+ () => {
2876
+ return document.querySelector('form#aspnetForm') !== null ||
2877
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
2878
+ }
2879
+ ''')
2880
+
2881
+ if is_aspnet:
2882
+ logger.info("Detected ASP.NET page, using enhanced extraction method")
2883
+
2884
+ # Try to interact with ASP.NET controls that might reveal more links
2885
+ # Look for dropdowns, buttons, and grid elements
2886
+ dropdowns = await self.page.query_selector_all('select')
2887
+ buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
2888
+
2889
+ # Try interacting with dropdowns first
2890
+ for dropdown in dropdowns:
2891
+ try:
2892
+ # Get all options
2893
+ options = await self.page.evaluate('''
2894
+ (dropdown) => {
2895
+ return Array.from(dropdown.options).map(o => o.value);
2896
+ }
2897
+ ''', dropdown)
2898
+
2899
+ # Try selecting each option
2900
+ for option in options:
2901
+ if option:
2902
+ await dropdown.select_option(value=option)
2903
+ await self.page.wait_for_timeout(1000)
2904
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
2905
+
2906
+ # Extract any new links that appeared
2907
+ await self.extract_all_link_types(links, base_url, path_base)
2908
+ except Exception as e:
2909
+ logger.warning(f"Error interacting with dropdown: {e}")
2910
+
2911
+ # Try clicking buttons (but avoid dangerous ones like "delete")
2912
+ safe_buttons = []
2913
+ for button in buttons:
2914
+ button_text = await button.text_content() or ""
2915
+ button_value = await button.get_attribute("value") or ""
2916
+ button_id = await button.get_attribute("id") or ""
2917
+ combined_text = (button_text + button_value + button_id).lower()
2918
+
2919
+ # Skip potentially destructive buttons
2920
+ if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
2921
+ continue
2922
+
2923
+ # Prioritize buttons that might show more content
2924
+ if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
2925
+ safe_buttons.append(button)
2926
+
2927
+ # Click the safe buttons
2928
+ for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks
2929
+ try:
2930
+ await button.click()
2931
+ await self.page.wait_for_timeout(1000)
2932
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
2933
+
2934
+ # Extract any new links that appeared
2935
+ await self.extract_all_link_types(links, base_url, path_base)
2936
+ except Exception as e:
2937
+ logger.warning(f"Error clicking button: {e}")
2938
+
2939
+ # Extract links from the initial page state
2940
+ await self.extract_all_link_types(links, base_url, path_base)
2941
+
2942
+ # Look specifically for links inside grid/table views which are common in ASP.NET applications
2943
+ grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
2944
+ for cell in grid_cells:
2945
+ try:
2946
+ href = await cell.get_attribute('href')
2947
+ if href:
2948
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
2949
+ links.add(full_url)
2950
+ except Exception as e:
2951
+ logger.warning(f"Error extracting grid link: {e}")
2952
+
2953
+ # Extract links from onclick attributes and javascript:__doPostBack calls
2954
+ postback_links = await self.page.evaluate('''
2955
+ () => {
2956
+ const results = [];
2957
+ // Find elements with onclick containing __doPostBack
2958
+ const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
2959
+ for (const el of elements) {
2960
+ // Extract the postback target
2961
+ const onclick = el.getAttribute('onclick') || '';
2962
+ const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
2963
+ if (match && match[1]) {
2964
+ // Get the visible text to use as description
2965
+ const text = el.innerText || el.textContent || 'Link';
2966
+ results.push({
2967
+ id: match[1],
2968
+ text: text.trim()
2969
+ });
2970
+ }
2971
+ }
2972
+ return results;
2973
+ }
2974
+ ''')
2975
+
2976
+ # Try interacting with some of the postback links
2977
+ for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions
2978
+ try:
2979
+ logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
2980
+ await self.page.evaluate(f'''
2981
+ () => {{
2982
+ if (typeof __doPostBack === 'function') {{
2983
+ __doPostBack('{postback["id"]}', '');
2984
+ }}
2985
+ }}
2986
+ ''')
2987
+ await self.page.wait_for_timeout(1500)
2988
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
2989
+
2990
+ # Extract any new links that appeared
2991
+ await self.extract_all_link_types(links, base_url, path_base)
2992
+ except Exception as e:
2993
+ logger.warning(f"Error with postback: {e}")
2994
+
2995
+ logger.info(f"Found {len(links)} sublinks")
2996
+ return list(links)[:limit]
2997
+
2998
+ except Exception as e:
2999
+ logger.error(f"Error getting sublinks from {url}: {e}")
3000
+ return list(links)[:limit] # Return what we have so far
3001
+
3002
+ async def extract_all_link_types(self, links_set, base_url, path_base):
3003
+ """Extract all types of links from the current page"""
3004
+ # Get all <a> tag links
3005
+ a_links = await self.page.query_selector_all('a[href]')
3006
+ for a in a_links:
3007
+ try:
3008
+ href = await a.get_attribute('href')
3009
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
3010
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
3011
+ links_set.add(full_url)
3012
+ except Exception:
3013
+ pass
3014
+
3015
+ # Get iframe sources
3016
+ iframes = await self.page.query_selector_all('iframe[src]')
3017
+ for iframe in iframes:
3018
+ try:
3019
+ src = await iframe.get_attribute('src')
3020
+ if src and not src.startswith('javascript:') and not src.startswith('about:'):
3021
+ full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
3022
+ links_set.add(full_url)
3023
+ except Exception:
3024
+ pass
3025
+
3026
+ # Get links from onclick attributes that reference URLs
3027
+ onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
3028
+ for el in onclick_elements:
3029
+ try:
3030
+ onclick = await el.get_attribute('onclick')
3031
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
3032
+ for url in urls:
3033
+ links_set.add(url)
3034
+ except Exception:
3035
+ pass
3036
+
3037
+ # Look for URLs in data-* attributes
3038
+ data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
3039
+ for el in data_elements:
3040
+ for attr in ['data-url', 'data-href', 'data-src']:
3041
+ try:
3042
+ value = await el.get_attribute(attr)
3043
+ if value and not value.startswith('javascript:'):
3044
+ full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
3045
+ links_set.add(full_url)
3046
+ except Exception:
3047
+ pass
3048
+
3049
+ # Look for special anchor links that might not have href attributes
3050
+ special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
3051
+ for anchor in special_anchors:
3052
+ try:
3053
+ href = await anchor.get_attribute('href')
3054
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
3055
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
3056
+ links_set.add(full_url)
3057
+ except Exception:
3058
+ pass
3059
+
3060
+ def resolve_relative_url(self, relative_url, base_url, path_base):
3061
+ """Properly resolve relative URLs considering multiple formats"""
3062
+ if relative_url.startswith('/'):
3063
+ # Absolute path relative to domain
3064
+ return f"{base_url}{relative_url}"
3065
+ elif relative_url.startswith('./'):
3066
+ # Explicit relative path
3067
+ return f"{base_url}{path_base}/{relative_url[2:]}"
3068
+ elif relative_url.startswith('../'):
3069
+ # Parent directory
3070
+ parent_path = '/'.join(path_base.split('/')[:-1])
3071
+ return f"{base_url}{parent_path}/{relative_url[3:]}"
3072
+ else:
3073
+ # Regular relative path
3074
+ return f"{base_url}{path_base}/{relative_url}"
3075
+
3076
+ async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
3077
+ if not custom_ext_list:
3078
+ custom_ext_list = []
3079
+ progress_text = st.empty()
3080
+ progress_bar = st.progress(0)
3081
+ file_count_text = st.empty()
3082
+
3083
+ try:
3084
+ progress_text.text("Analyzing main page...")
3085
+ # Special handling for ASP.NET pages
3086
+ is_aspnet = False
3087
+ try:
3088
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
3089
+ is_aspnet = await self.page.evaluate('''
3090
+ () => {
3091
+ return document.querySelector('form#aspnetForm') !== null ||
3092
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
3093
+ }
3094
+ ''')
3095
+ except Exception:
3096
+ pass
3097
+
3098
+ # Extract files from main page
3099
+ main_files = await self.extract_downloadable_files(url, custom_ext_list)
3100
+ initial_count = len(main_files)
3101
+ file_count_text.text(f"Found {initial_count} files on main page")
3102
+
3103
+ # Get sublinks with enhanced method
3104
+ progress_text.text("Getting sublinks...")
3105
+ sublinks = await self.get_sublinks(url, sublink_limit)
3106
+ total_links = len(sublinks)
3107
+ progress_text.text(f"Found {total_links} sublinks to process")
3108
+
3109
+ if not sublinks:
3110
+ progress_bar.progress(1.0)
3111
+ return main_files
3112
+
3113
+ # Process each sublink
3114
+ all_files = main_files
3115
+ for i, sublink in enumerate(sublinks, 1):
3116
+ progress = i / total_links
3117
+ progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
3118
+ progress_bar.progress(progress)
3119
+
3120
+ try:
3121
+ # Use a longer timeout for ASP.NET pages which can be slower
3122
+ sub_timeout = timeout * 2 if is_aspnet else timeout
3123
+
3124
+ # Extract files from sublink with appropriate timeout
3125
+ async with async_timeout(sub_timeout):
3126
+ sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
3127
+ all_files.extend(sub_files)
3128
+ file_count_text.text(f"Found {len(all_files)} total files")
3129
+ except Exception as e:
3130
+ logger.warning(f"Error processing sublink {sublink}: {e}")
3131
+
3132
+ # Deduplicate files
3133
+ seen_urls = set()
3134
+ unique_files = []
3135
+ for f in all_files:
3136
+ if f['url'] not in seen_urls:
3137
+ seen_urls.add(f['url'])
3138
+ unique_files.append(f)
3139
 
3140
+ final_count = len(unique_files)
3141
+ progress_text.text(f"Deep search complete!")
3142
+ file_count_text.text(f"Found {final_count} unique files")
3143
+ progress_bar.progress(1.0)
3144
+ return unique_files
3145
+
3146
+ except Exception as e:
3147
+ logger.error(f"Deep search error: {e}")
3148
+ progress_text.text(f"Error during deep search: {str(e)}")
3149
+ return []
3150
+
3151
+ finally:
3152
+ await asyncio.sleep(2)
3153
+ if not st.session_state.get('keep_progress', False):
3154
+ progress_text.empty()
3155
+ progress_bar.empty()
3156
  # Utility Functions for New Features
3157
  def extract_keywords(text, n=5):
3158
  doc = nlp_model(text)
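
For orientation, the following is a minimal usage sketch of the DownloadManager class introduced by this commit, written as plain async Python. It is not part of the commit; the query string, extension list, and ./downloads directory are illustrative placeholders, and it assumes the module-level helpers the class relies on (get_random_user_agent, sizeof_fmt, logger) are already defined earlier in app.py.

import asyncio

async def demo():
    # Hypothetical driver; adjust the query and save_dir as needed.
    async with DownloadManager(query="annual report filetype:pdf", num_results=3) as dm:
        urls = await dm.search_bing()  # Bing result URLs for the query
        for url in urls:
            # Collect candidate files (default extensions plus .csv) from each result page
            files = await dm.extract_downloadable_files(url, custom_ext_list=['.csv'])
            for info in files:
                saved = await dm.download_file(info, save_dir="./downloads", referer=url)
                if saved:
                    print(f"Saved {info['filename']} ({info['size']}) -> {saved}")

# asyncio.run(demo())

deep_search() is not exercised here because it drives Streamlit progress widgets (st.progress, st.empty) and is meant to be called from the app's UI flow rather than from a bare script.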