Update app.py
app.py (changed)
@@ -22,6 +22,8 @@ import datetime
 import traceback
 import base64
 import shutil
 from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
@@ -50,14 +52,44 @@ GOOGLE_OAUTH_CONFIG = {
     }
 }

 # -------------------- Utility Functions --------------------
 def get_random_user_agent():
-    USER_AGENTS = [
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
-        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
-    ]
     return random.choice(USER_AGENTS)

 def sizeof_fmt(num, suffix='B'):
@@ -75,6 +107,42 @@ def create_zip_file(file_paths, output_dir):
         zipf.write(file_path, os.path.basename(file_path))
     return zip_path

 # -------------------- Google Drive Functions --------------------
 def get_google_auth_url():
     client_config = GOOGLE_OAUTH_CONFIG["web"]
@@ -153,7 +221,7 @@ def install_playwright_dependencies():

 # -------------------- Download Manager Class --------------------
 class DownloadManager:
-    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
         self.proxy = proxy
         self.query = query
@@ -162,30 +230,140 @@ class DownloadManager:
         self.browser = None
         self.context = None
         self.page = None

     async def __aenter__(self):
         self.playwright = await async_playwright().start()
         opts = {
             "headless": True,
-            "args":
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-dev-shm-usage',
-                '--disable-gpu',
-                '--no-zygote',
-                '--single-process'
-            ]
         }
         if self.use_proxy and self.proxy:
             opts["proxy"] = {"server": self.proxy}
         self.browser = await self.playwright.chromium.launch(**opts)
         self.page = await self.context.new_page()
         await self.page.set_extra_http_headers({
-            'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',
-            '
         })
         return self

     async def __aexit__(self, exc_type, exc_val, exc_tb):
@@ -194,17 +372,140 @@ class DownloadManager:
         if self.playwright:
             await self.playwright.stop()

     async def search_bing(self):
         urls = []
         try:
             search_url = f"https://www.bing.com/search?q={self.query}"
             await self.page.goto(search_url, timeout=30000)
             await self.page.wait_for_load_state('networkidle')
             links = await self.page.query_selector_all("li.b_algo h2 a")
             for link in links[:self.num_results]:
                 href = await link.get_attribute('href')
                 if href:
                     urls.append(href)
             return urls
         except Exception as e:
             logger.error(f"Error searching Bing: {e}")
@@ -212,6 +513,8 @@ class DownloadManager:

     async def get_file_size(self, url):
         try:
             async with self.context.new_page() as page:
                 response = await page.request.head(url, timeout=15000)
                 length = response.headers.get('Content-Length', None)
@@ -219,11 +522,14 @@ class DownloadManager:
                     return sizeof_fmt(int(length))
                 else:
                     return "Unknown Size"
-        except Exception:
             return "Unknown Size"

     async def get_pdf_metadata(self, url):
         try:
             async with self.context.new_page() as page:
                 resp = await page.request.get(url, timeout=15000)
                 if resp.ok:
@@ -237,11 +543,14 @@ class DownloadManager:
                     }
                 else:
                     return {}
-        except Exception:
             return {}

     async def extract_real_download_url(self, url):
         try:
             async with self.context.new_page() as page:
                 response = await page.goto(url, wait_until='networkidle', timeout=30000)
                 if response and response.headers.get('location'):
@@ -258,8 +567,15 @@ class DownloadManager:
         logger.info(f"Fetching exam links from {url}")
         links = set()

-        headers = {
         try:
             response = requests.get(url, headers=headers, timeout=30)

@@ -274,77 +590,195 @@ class DownloadManager:
                     href = a["href"]
                     full_url = urljoin(url, href)

         except Exception as e:
             logger.warning(f"Request-based extraction failed: {e}")

                 () => {
-                    const
                     const anchors = document.querySelectorAll('a[href]');
                     for (const a of anchors) {
                         if (a.href) {
                                 href: a.href,
-                                text: a.innerText || a.textContent || ''
                             });
                         }
                     }
                 }
             """)

-            # Process extracted links
-            for link_info in
                 href = link_info.get('href', '')
                 text = link_info.get('text', '').lower()

-                if href:
-                    # Check
-                    url_patterns = [

                 if any(pattern in href.lower() for pattern in url_patterns) or \
-                   any(pattern in text for pattern in text_patterns)
                     links.add(href)

             # Check for ASP.NET specific elements that might contain exam links
-            grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
             for grid in grid_elements:
                 grid_links = await grid.query_selector_all('a[href]')
                 for a in grid_links:
                     href = await a.get_attribute('href')
                     if href:
                         full_url = href if href.startswith('http') else urljoin(url, href)
                         links.add(full_url)

-            # Try clicking
-            for button in buttons
                 try:
                     await button.click()
-                    await self.page.wait_for_timeout(
-                    await self.page.wait_for_load_state('networkidle', timeout=

                     # Get any new links that appeared
                     new_links = await self.page.query_selector_all('a[href]')
@@ -352,24 +786,67 @@ class DownloadManager:
                         href = await a.get_attribute('href')
                         if href:
                             full_url = href if href.startswith('http') else urljoin(url, href)
                 except Exception as e:
                     logger.warning(f"Error clicking button: {e}")
         except Exception as e:
-            logger.

         # Filter links to likely contain exam documents
         filtered_links = []
         for link in links:
             # Common file extensions for exam documents
-            if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']):
                 filtered_links.append(link)
                 continue

             # Common paths for exam documents
             if any(pattern in link.lower() for pattern in [
                 "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
-                "/pastpapers/", "/questionpapers/", "/tests/"
             ]):
                 filtered_links.append(link)

@@ -383,6 +860,9 @@ class DownloadManager:
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             # Special handling for educational exam sites
             if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                        ["exam", "test", "pastpaper", "eduexp"]):
@@ -403,6 +883,12 @@ class DownloadManager:
                     except Exception:
                         pass

                     # Get file size
                     size_str = await self.get_file_size(real_url)

@@ -429,14 +915,55 @@ class DownloadManager:
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []

             final_url = self.page.url
             if '.php' in final_url or 'download' in final_url:
                 real_url = await self.extract_real_download_url(final_url)
                 if real_url != final_url:
                     found_files.append({
                         'url': real_url,
-                        'filename':
                         'size': await self.get_file_size(real_url),
                         'metadata': {}
                     })
@@ -549,15 +1076,118 @@ class DownloadManager:
                         'metadata': meta
                     })

             seen_urls = set()
             unique_files = []
             for f in found_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
             return unique_files
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
             return []

     async def download_file(self, file_info, save_dir, referer):
@@ -596,23 +1226,85 @@ class DownloadManager:
             logger.warning("All standard methods failed, attempting force download")
             result_path = await self.force_download_viewonly(file_info, path)
             return result_path if result_path else None

-            # Original code for non-Google Drive downloads
         async with self.context.new_page() as page:
             headers = {
                 'Accept': '*/*',
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Referer': referer
             }
         except Exception as e:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
@@ -642,17 +1334,20 @@ class DownloadManager:

         logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")

-        # Create a dedicated browser instance with better resolution
         browser = await self.playwright.chromium.launch(
             headless=True,
-            args=
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-dev-shm-usage',
-                '--disable-web-security',
-                '--disable-features=IsolateOrigins,site-per-process',
-                '--disable-site-isolation-trials'
-            ]
         )

         # Use higher resolution for better quality
@@ -663,6 +1358,34 @@ class DownloadManager:
             accept_downloads=True  # Critical for the download workflow
         )

         page = await context.new_page()

         try:
@@ -670,7 +1393,14 @@ class DownloadManager:
             logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
             await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
             await page.wait_for_load_state('networkidle')

             # Create temp directory
             temp_dir = tempfile.mkdtemp()
@@ -679,7 +1409,11 @@ class DownloadManager:
             if file_type.lower() == 'pdf':
                 # Use the improved scrolling and detection approach

                 estimated_pages = await page.evaluate("""
                     () => {
                         // Method 1: Check page counter text
@@ -709,14 +1443,13 @@ class DownloadManager:

                 logger.info(f"Estimated {estimated_pages} pages in PDF")

-                logger.info("
-                # Initial scroll to bottom to trigger lazy loading
                 await page.keyboard.press("End")
                 await page.wait_for_timeout(3000)

                 # Scroll page by page to ensure all pages are loaded
                 max_attempts = min(estimated_pages * 3, 300)
                 attempt = 0
                 prev_blob_count = 0
@@ -734,8 +1467,19 @@ class DownloadManager:
                         logger.info("All pages appear to be loaded.")
                         break

                     prev_blob_count = blob_count
                     attempt += 1

@@ -801,6 +1545,72 @@ class DownloadManager:

                 if not result.get('success', False):
                     logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
                     return None

                 logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
@@ -902,6 +1712,37 @@ class DownloadManager:

         # Try standard approaches for non-view-only files
         try:
             # Try with requests and session cookies
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
@@ -944,37 +1785,111 @@ class DownloadManager:
         except Exception as e:
             logger.warning(f"Requests session download failed: {e}")

         return False

     async def download_viewonly_pdf_with_js(self, file_id, save_path):
         """Download view-only PDF using the enhanced blob image caching technique"""
         try:
-            # Create a dedicated browser instance
             browser = await self.playwright.chromium.launch(
                 headless=True,
-                args=
-                    '--no-sandbox',
-                    '--disable-setuid-sandbox',
-                    '--disable-dev-shm-usage',
-                    '--disable-web-security'
-                ]
             )

             context = await browser.new_context(
                 viewport={'width': 1600, 'height': 1200},
                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                accept_downloads=True  # Critical for handling the download event
             )

             page = await context.new_page()

             try:
-                # Step 1: Navigate to the file
                 logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
                 await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
                 await page.wait_for_load_state('networkidle')

                 # Step 2: Estimate the number of pages
                 estimated_pages = await page.evaluate("""
@@ -1007,11 +1922,12 @@ class DownloadManager:
                 await page.keyboard.press("End")
                 await page.wait_for_timeout(3000)

-                # Step 4: Wait for all pages to load
-                logger.info("
-                max_attempts = min(estimated_pages * 3, 300)
                 attempt = 0
                 prev_blob_count = 0

                 while attempt < max_attempts:
                     # Count blob images (which are the PDF pages)
@@ -1023,14 +1939,40 @@ class DownloadManager:

                     logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")

-                    if blob_count >= estimated_pages
-                        logger.info("All pages appear to be loaded.")
                         break

                     prev_blob_count = blob_count
                     attempt += 1

@@ -1050,10 +1992,9 @@ class DownloadManager:
                         try {
                             let pdf = new jsPDF();
                             let imgs = document.getElementsByTagName("img");
-                            let added = 0;
-                            // First collect and sort all valid blob images
                             let validImages = [];
                             for (let i = 0; i < imgs.length; i++) {
                                 let img = imgs[i];
                                 if (!/^blob:/.test(img.src)) continue;
@@ -1061,7 +2002,7 @@ class DownloadManager:
                                 validImages.push(img);
                             }

-                            // Sort by
                             validImages.sort((a, b) => {
                                 const rectA = a.getBoundingClientRect();
                                 const rectB = b.getBoundingClientRect();
@@ -1070,6 +2011,7 @@ class DownloadManager:

                             console.log(`Found ${validImages.length} valid page images to add to PDF`);

                             // Process each image as a page
                             for (let i = 0; i < validImages.length; i++) {
                                 let img = validImages[i];
@@ -1384,6 +2326,9 @@ class DownloadManager:
                 logger.info(f"Found {len(links)} sublinks with specialized method")
                 return list(links)[:limit]

             # Standard sublink extraction for all sites
             await self.page.goto(url, timeout=30000, wait_until='networkidle')

@@ -1392,6 +2337,23 @@ class DownloadManager:
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
             path_base = os.path.dirname(parsed_base.path)

             # Check if page has ASP.NET elements which might need special handling
             is_aspnet = await self.page.evaluate('''
                 () => {
@@ -1514,6 +2476,60 @@ class DownloadManager:
             except Exception as e:
                 logger.warning(f"Error with postback: {e}")

         logger.info(f"Found {len(links)} sublinks")
         return list(links)[:limit]

@@ -1578,6 +2594,19 @@ class DownloadManager:
                         links_set.add(full_url)
                 except Exception:
                     pass

     def resolve_relative_url(self, relative_url, base_url, path_base):
         """Properly resolve relative URLs considering multiple formats"""
@@ -1628,12 +2657,14 @@ class DownloadManager:
             total_links = len(sublinks)
             progress_text.text(f"Found {total_links} sublinks to process")

             if not sublinks:
                 progress_bar.progress(1.0)
-                return

             # Process each sublink
-            all_files = main_files
             for i, sublink in enumerate(sublinks, 1):
                 progress = i / total_links
                 progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
@@ -1703,6 +2734,7 @@ def main():
         sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
         use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")

     with st.expander("Google Drive Integration", expanded=False):
         if st.button("Start Google Sign-In", key="google_signin_btn"):
@@ -1713,6 +2745,37 @@ def main():
             creds, msg = exchange_code_for_credentials(auth_code)
             st.session_state.google_creds = creds
             st.write(msg)

     if mode == "Manual URL":
         st.header("Manual URL Mode")
@@ -1727,16 +2790,20 @@ def main():
             st.warning("Invalid extensions ignored. Use format like '.csv'.")

         @st.cache_resource
-        def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
            async def _run():
-                async with DownloadManager(
                     files = await dm.deep_search(url, ext_list, max_links, timeout_val)
                     return files
             return asyncio.run(_run())

         with st.spinner("Searching for files..."):
             files = run_deep_search(url, valid_ext_list, max_sublinks,

         if files:
             st.session_state.discovered_files = files
@@ -1799,7 +2866,11 @@ def main():
                     progress_bar = st.progress(0)
                     status_text = st.empty()

-                    async with DownloadManager(
                         for i, idx in enumerate(selected_indices):
                             progress = (i + 1) / len(selected_indices)
                             file_info = files[idx]
@@ -1880,7 +2951,13 @@ def main():
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
-                    async with DownloadManager(
                         with st.spinner("Searching..."):
                             urls = await dm.search_bing()
                             if urls:
@@ -1911,16 +2988,20 @@ def main():
                 valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]

                 @st.cache_resource
-                def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
                     async def _run():
-                        async with DownloadManager(
                             files = await dm.deep_search(url, ext_list, max_links, timeout_val)
                             return files
                     return asyncio.run(_run())

                 with st.spinner("Searching for files..."):
                     files = run_deep_search(url, valid_ext_list, max_sublinks,
-                                            sublink_timeout, use_proxy, proxy)

                 if files:
                     st.session_state.discovered_files = files
@@ -1944,7 +3025,7 @@ def main():

             with st.spinner("Downloading view-only document... (this may take a minute)"):
                 async def download_viewonly():
-                    async with DownloadManager() as dm:
                         file_info = {
                             'url': f"https://drive.google.com/file/d/{file_id}/view",
                             'filename': f"gdrive_{file_id}.pdf",
@@ -1957,13 +3038,15 @@ def main():

                 if result:
                     st.success("Document downloaded successfully!")
                     with open(result, "rb") as f:
                         file_bytes = f.read()
                     st.download_button(
                         label="Download PDF",
                         data=file_bytes,
-                        file_name=
                         mime="application/pdf"
                     )
                 else:
@@ -1971,7 +3054,7 @@ def main():

     # Add footer with attribution
     st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/

 if __name__ == "__main__":
     main()
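A minimal usage sketch of the reworked `DownloadManager` (the URL, query, and extension list below are illustrative placeholders; `use_stealth` and `proxy_rotation` are the keyword arguments this commit adds), following the same `async with ... as dm` pattern the Streamlit handlers in this file use:

```python
import asyncio

async def _demo():
    # The manager is an async context manager; options mirror the new __init__ signature.
    async with DownloadManager(use_proxy=False, proxy=None,
                               query="past exam papers", num_results=5,
                               use_stealth=True, proxy_rotation=False) as dm:
        urls = await dm.search_bing()
        files = await dm.deep_search("https://example.edu/exams", ['.pdf', '.docx'], 10, 30)
        return urls, files

asyncio.run(_demo())
```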
 import traceback
 import base64
 import shutil
+import json
+import time
 from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

     }
 }

+# -------------------- Stealth and UA Settings --------------------
+# Extended user agent list for better variety
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
+    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0'
+]
+
+# Stealth browser settings
+STEALTH_SETTINGS = {
+    # Hardware features to modify/disable
+    "hardware_concurrency": 4,
+    "device_memory": 8,
+    # Browser features to enable/disable
+    "webgl_vendor": "Google Inc. (Intel)",
+    "webgl_renderer": "Intel Iris OpenGL Engine",
+    "languages": ["en-US", "en"],
+    "disable_webrtc": True,
+    # Additional timing randomization
+    "navigator_platform": "Win32",
+    "touch_support": False
+}
+
+# Proxy rotation configuration (if using multiple proxies)
+PROXY_ROTATION_CONFIG = {
+    "enabled": False,  # Set to True to enable rotation
+    "rotation_interval": 10,  # Rotate every 10 requests
+    "proxies": []  # Will be populated from the UI if needed
+}
+
 # -------------------- Utility Functions --------------------
 def get_random_user_agent():
     return random.choice(USER_AGENTS)

 def sizeof_fmt(num, suffix='B'):

         zipf.write(file_path, os.path.basename(file_path))
     return zip_path

+def get_file_extension(url, default='.pdf'):
+    """Extract file extension from URL or filename"""
+    path = urlparse(url).path
+    ext = os.path.splitext(path)[1].lower()
+    if not ext:
+        return default
+    return ext
+
+def humanize_file_size(size_bytes):
+    """Format file size in human-readable format"""
+    if size_bytes < 1024:
+        return f"{size_bytes} bytes"
+    for unit in ['KB', 'MB', 'GB', 'TB']:
+        size_bytes /= 1024.0
+        if size_bytes < 1024.0:
+            return f"{size_bytes:.1f} {unit}"
+    return f"{size_bytes:.1f} PB"
+
+def get_domain(url):
+    """Extract domain from URL"""
+    parsed = urlparse(url)
+    return parsed.netloc
+
+def is_valid_file_url(url, extensions):
+    """Check if URL is a valid file URL based on extension"""
+    return any(url.lower().endswith(ext) for ext in extensions)
+
+def detect_captcha(html_content):
+    """Detect common captcha patterns in HTML content"""
+    captcha_patterns = [
+        'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
+        'challenge', 'solve the following', 'verify you are human'
+    ]
+    html_lower = html_content.lower()
+    return any(pattern in html_lower for pattern in captcha_patterns)
+
146 |
# -------------------- Google Drive Functions --------------------
|
147 |
def get_google_auth_url():
|
148 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
|
|
221 |
|
222 |
# -------------------- Download Manager Class --------------------
|
223 |
class DownloadManager:
|
224 |
+
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False):
|
225 |
self.use_proxy = use_proxy
|
226 |
self.proxy = proxy
|
227 |
self.query = query
|
|
|
230 |
self.browser = None
|
231 |
self.context = None
|
232 |
self.page = None
|
233 |
+
self.use_stealth = use_stealth
|
234 |
+
self.proxy_rotation = proxy_rotation
|
235 |
+
self.request_count = 0
|
236 |
+
self.captcha_detected = False
|
237 |
+
self.download_timeout = 300 # 5 minutes timeout for downloads
|
238 |
|
239 |
async def __aenter__(self):
|
240 |
self.playwright = await async_playwright().start()
|
241 |
+
|
242 |
+
# Prepare browser args with stealth settings
|
243 |
+
browser_args = [
|
244 |
+
'--no-sandbox',
|
245 |
+
'--disable-setuid-sandbox',
|
246 |
+
'--disable-dev-shm-usage',
|
247 |
+
'--disable-gpu',
|
248 |
+
'--no-zygote',
|
249 |
+
'--single-process',
|
250 |
+
'--disable-web-security',
|
251 |
+
'--disable-features=IsolateOrigins',
|
252 |
+
'--disable-site-isolation-trials'
|
253 |
+
]
|
254 |
+
|
255 |
+
# Add stealth-specific args
|
256 |
+
if self.use_stealth:
|
257 |
+
browser_args.extend([
|
258 |
+
'--disable-blink-features=AutomationControlled',
|
259 |
+
'--disable-features=IsolateOrigins,site-per-process',
|
260 |
+
'--disable-webgl',
|
261 |
+
'--disable-webrtc'
|
262 |
+
])
|
263 |
+
|
264 |
+
# Setup browser options
|
265 |
opts = {
|
266 |
"headless": True,
|
267 |
+
"args": browser_args
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
}
|
269 |
+
|
270 |
+
# Configure proxy if specified
|
271 |
if self.use_proxy and self.proxy:
|
272 |
opts["proxy"] = {"server": self.proxy}
|
273 |
+
|
274 |
+
# Launch browser with options
|
275 |
self.browser = await self.playwright.chromium.launch(**opts)
|
276 |
+
|
277 |
+
# Setup browser context with enhanced settings
|
278 |
+
context_opts = {
|
279 |
+
"user_agent": get_random_user_agent(),
|
280 |
+
"viewport": {"width": 1920, "height": 1080},
|
281 |
+
"device_scale_factor": 1,
|
282 |
+
"has_touch": False,
|
283 |
+
"is_mobile": False,
|
284 |
+
"ignore_https_errors": True,
|
285 |
+
"accept_downloads": True
|
286 |
+
}
|
287 |
+
|
288 |
+
# Apply stealth-specific settings to the context
|
289 |
+
if self.use_stealth:
|
290 |
+
# Apply JS-injection for enhanced stealth
|
291 |
+
context_opts["bypass_csp"] = True
|
292 |
+
self.context = await self.browser.new_context(**context_opts)
|
293 |
+
|
294 |
+
# Execute stealth JS to avoid detection
|
295 |
+
await self.context.add_init_script("""
|
296 |
+
() => {
|
297 |
+
Object.defineProperty(navigator, 'webdriver', {
|
298 |
+
get: () => false,
|
299 |
+
});
|
300 |
+
|
301 |
+
// Change navigator properties
|
302 |
+
const newProto = navigator.__proto__;
|
303 |
+
delete newProto.webdriver;
|
304 |
+
|
305 |
+
// Overwrite the plugins
|
306 |
+
Object.defineProperty(navigator, 'plugins', {
|
307 |
+
get: () => [1, 2, 3, 4, 5].map(() => ({
|
308 |
+
lengthComputable: true,
|
309 |
+
loaded: 100,
|
310 |
+
total: 100
|
311 |
+
}))
|
312 |
+
});
|
313 |
+
|
314 |
+
// Handle languages more naturally
|
315 |
+
Object.defineProperty(navigator, 'languages', {
|
316 |
+
get: () => ['en-US', 'en', 'es']
|
317 |
+
});
|
318 |
+
|
319 |
+
// Modify hardware concurrency
|
320 |
+
Object.defineProperty(navigator, 'hardwareConcurrency', {
|
321 |
+
get: () => 4
|
322 |
+
});
|
323 |
+
|
324 |
+
// Modify deviceMemory
|
325 |
+
Object.defineProperty(navigator, 'deviceMemory', {
|
326 |
+
get: () => 8
|
327 |
+
});
|
328 |
+
|
329 |
+
// WebGL modifications
|
330 |
+
const getParameter = WebGLRenderingContext.prototype.getParameter;
|
331 |
+
WebGLRenderingContext.prototype.getParameter = function(parameter) {
|
332 |
+
if (parameter === 37445) {
|
333 |
+
return 'Intel Inc.';
|
334 |
+
}
|
335 |
+
if (parameter === 37446) {
|
336 |
+
return 'Intel Iris OpenGL Engine';
|
337 |
+
}
|
338 |
+
return getParameter.apply(this, arguments);
|
339 |
+
};
|
340 |
+
}
|
341 |
+
""")
|
342 |
+
else:
|
343 |
+
# Regular context without stealth
|
344 |
+
self.context = await self.browser.new_context(**context_opts)
|
345 |
+
|
346 |
+
# Create page with enhanced headers
|
347 |
self.page = await self.context.new_page()
|
348 |
await self.page.set_extra_http_headers({
|
349 |
+
'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
|
350 |
'Accept-Encoding': 'gzip, deflate, br',
|
351 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
352 |
+
'Cache-Control': 'max-age=0',
|
353 |
+
'DNT': '1', # Do Not Track
|
354 |
+
'Referer': 'https://www.google.com/',
|
355 |
+
'Sec-Fetch-Dest': 'document',
|
356 |
+
'Sec-Fetch-Mode': 'navigate',
|
357 |
+
'Sec-Fetch-Site': 'cross-site',
|
358 |
+
'Sec-Fetch-User': '?1',
|
359 |
+
'Upgrade-Insecure-Requests': '1'
|
360 |
})
|
361 |
+
|
362 |
+
# Add delay for mouse movements to simulate human behavior
|
363 |
+
if self.use_stealth:
|
364 |
+
await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500))
|
365 |
+
await self.page.wait_for_timeout(random.randint(200, 500))
|
366 |
+
|
367 |
return self
|
368 |
|
369 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
|
372 |
if self.playwright:
|
373 |
await self.playwright.stop()
|
374 |
|
375 |
+
async def rotate_proxy_if_needed(self):
|
376 |
+
"""Rotate proxy if proxy rotation is enabled and threshold is reached"""
|
377 |
+
if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]:
|
378 |
+
self.request_count += 1
|
379 |
+
if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]:
|
380 |
+
# Get next proxy from the pool
|
381 |
+
next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0)
|
382 |
+
PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list
|
383 |
+
|
384 |
+
# Close existing context and create new one with the new proxy
|
385 |
+
if self.context:
|
386 |
+
await self.context.close()
|
387 |
+
|
388 |
+
# Create new context with the new proxy
|
389 |
+
context_opts = {
|
390 |
+
"user_agent": get_random_user_agent(),
|
391 |
+
"proxy": {"server": next_proxy},
|
392 |
+
"accept_downloads": True
|
393 |
+
}
|
394 |
+
self.context = await self.browser.new_context(**context_opts)
|
395 |
+
self.page = await self.context.new_page()
|
396 |
+
|
397 |
+
# Reset counter
|
398 |
+
self.request_count = 0
|
399 |
+
logger.info(f"Rotated to new proxy: {next_proxy}")
|
400 |
+
|
401 |
+
async def handle_captcha(self, page):
|
402 |
+
"""Detect and handle captchas if possible"""
|
403 |
+
# Check for common captcha patterns
|
404 |
+
content = await page.content()
|
405 |
+
if detect_captcha(content):
|
406 |
+
self.captcha_detected = True
|
407 |
+
logger.warning("Captcha detected on page")
|
408 |
+
|
409 |
+
# Strategies for handling captchas:
|
410 |
+
# 1. For simple captchas, try to extract the image and solve it
|
411 |
+
captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]')
|
412 |
+
if captcha_img:
|
413 |
+
logger.info("Found captcha image, attempting to capture")
|
414 |
+
|
415 |
+
# Take screenshot of the captcha
|
416 |
+
captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png")
|
417 |
+
await captcha_img.screenshot(path=captcha_path)
|
418 |
+
|
419 |
+
# In a real implementation, you would send this to a captcha solving service
|
420 |
+
# For now, just log the detection
|
421 |
+
logger.info(f"Captcha image saved to {captcha_path}")
|
422 |
+
|
423 |
+
# For demonstration, we'll notify the user but not actually solve it
|
424 |
+
return False
|
425 |
+
|
426 |
+
# 2. For reCAPTCHA, special handling would be required
|
427 |
+
recaptcha = await page.query_selector('iframe[src*="recaptcha"]')
|
428 |
+
if recaptcha:
|
429 |
+
logger.warning("reCAPTCHA detected, would require external solving service")
|
430 |
+
return False
|
431 |
+
|
432 |
+
# 3. Try to perform human-like actions that might bypass simple bot checks
|
433 |
+
await self.perform_human_actions(page)
|
434 |
+
|
435 |
+
# Check if captcha is still present
|
436 |
+
content = await page.content()
|
437 |
+
if detect_captcha(content):
|
438 |
+
logger.warning("Captcha still present after human-like actions")
|
439 |
+
return False
|
440 |
+
else:
|
441 |
+
logger.info("Captcha appears to be resolved")
|
442 |
+
return True
|
443 |
+
|
444 |
+
return True # No captcha detected
|
445 |
+
|
446 |
+
async def perform_human_actions(self, page):
|
447 |
+
"""Perform human-like actions on the page to possibly bypass simple bot checks"""
|
448 |
+
try:
|
449 |
+
# 1. Slowly scroll down the page
|
450 |
+
for i in range(3):
|
451 |
+
await page.evaluate(f"window.scrollTo(0, {i * 300})")
|
452 |
+
await page.wait_for_timeout(random.randint(300, 700))
|
453 |
+
|
454 |
+
# 2. Random mouse movements
|
455 |
+
for _ in range(3):
|
456 |
+
x = random.randint(100, 800)
|
457 |
+
y = random.randint(100, 600)
|
458 |
+
await page.mouse.move(x=x, y=y)
|
459 |
+
await page.wait_for_timeout(random.randint(200, 500))
|
460 |
+
|
461 |
+
# 3. Click on a non-essential part of the page
|
462 |
+
try:
|
463 |
+
await page.click("body", position={"x": 50, "y": 50})
|
464 |
+
except:
|
465 |
+
pass
|
466 |
+
|
467 |
+
# 4. Wait a bit before continuing
|
468 |
+
await page.wait_for_timeout(1000)
|
469 |
+
|
470 |
+
except Exception as e:
|
471 |
+
logger.warning(f"Error during human-like actions: {e}")
|
472 |
+
|
473 |
async def search_bing(self):
|
474 |
urls = []
|
475 |
try:
|
476 |
+
# Rotate proxy if needed
|
477 |
+
await self.rotate_proxy_if_needed()
|
478 |
+
|
479 |
search_url = f"https://www.bing.com/search?q={self.query}"
|
480 |
await self.page.goto(search_url, timeout=30000)
|
481 |
await self.page.wait_for_load_state('networkidle')
|
482 |
+
|
483 |
+
# Check for captchas
|
484 |
+
if not await self.handle_captcha(self.page):
|
485 |
+
logger.warning("Captcha detected during search, results may be limited")
|
486 |
+
|
487 |
+
# More natural scrolling behavior
|
488 |
+
for i in range(3):
|
489 |
+
await self.page.evaluate(f"window.scrollTo(0, {i * 400})")
|
490 |
+
await self.page.wait_for_timeout(random.randint(300, 800))
|
491 |
+
|
492 |
+
# Extract search results
|
493 |
links = await self.page.query_selector_all("li.b_algo h2 a")
|
494 |
for link in links[:self.num_results]:
|
495 |
href = await link.get_attribute('href')
|
496 |
if href:
|
497 |
urls.append(href)
|
498 |
+
|
499 |
+
# If we didn't find enough results, try an alternative selector
|
500 |
+
if len(urls) < self.num_results:
|
501 |
+
alt_links = await self.page.query_selector_all(".b_caption a")
|
502 |
+
for link in alt_links:
|
503 |
+
href = await link.get_attribute('href')
|
504 |
+
if href and href not in urls:
|
505 |
+
urls.append(href)
|
506 |
+
if len(urls) >= self.num_results:
|
507 |
+
break
|
508 |
+
|
509 |
return urls
|
510 |
except Exception as e:
|
511 |
logger.error(f"Error searching Bing: {e}")
|
|
|
513 |
|
514 |
async def get_file_size(self, url):
|
515 |
try:
|
516 |
+
await self.rotate_proxy_if_needed()
|
517 |
+
|
518 |
async with self.context.new_page() as page:
|
519 |
response = await page.request.head(url, timeout=15000)
|
520 |
length = response.headers.get('Content-Length', None)
|
|
|
522 |
return sizeof_fmt(int(length))
|
523 |
else:
|
524 |
return "Unknown Size"
|
525 |
+
except Exception as e:
|
526 |
+
logger.warning(f"Error getting file size: {e}")
|
527 |
return "Unknown Size"
|
528 |
|
529 |
async def get_pdf_metadata(self, url):
|
530 |
try:
|
531 |
+
await self.rotate_proxy_if_needed()
|
532 |
+
|
533 |
async with self.context.new_page() as page:
|
534 |
resp = await page.request.get(url, timeout=15000)
|
535 |
if resp.ok:
|
|
|
543 |
}
|
544 |
else:
|
545 |
return {}
|
546 |
+
except Exception as e:
|
547 |
+
logger.warning(f"Error reading PDF metadata: {e}")
|
548 |
return {}
|
549 |
|
550 |
async def extract_real_download_url(self, url):
|
551 |
try:
|
552 |
+
await self.rotate_proxy_if_needed()
|
553 |
+
|
554 |
async with self.context.new_page() as page:
|
555 |
response = await page.goto(url, wait_until='networkidle', timeout=30000)
|
556 |
if response and response.headers.get('location'):
|
|
|
567 |
logger.info(f"Fetching exam links from {url}")
|
568 |
links = set()
|
569 |
|
570 |
+
# First try with direct requests for speed (but with proper headers)
|
571 |
+
headers = {
|
572 |
+
"User-Agent": get_random_user_agent(),
|
573 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
574 |
+
"Accept-Language": "en-US,en;q=0.9",
|
575 |
+
"Referer": "https://www.google.com/",
|
576 |
+
"DNT": "1"
|
577 |
+
}
|
578 |
+
|
579 |
try:
|
580 |
response = requests.get(url, headers=headers, timeout=30)
|
581 |
|
|
|
590 |
href = a["href"]
|
591 |
full_url = urljoin(url, href)
|
592 |
|
593 |
+
# Look for text clues
|
594 |
+
link_text = a.get_text().lower()
|
595 |
+
|
596 |
+
# Special patterns for exam sites (expanded list)
|
597 |
+
url_patterns = [
|
598 |
+
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
599 |
+
"/test/", "/download/", "/files/", "/assignments/",
|
600 |
+
"paper_", "question_", "exam_", "test_", "past_",
|
601 |
+
"assignment_", "sample_", "study_material", "notes_",
|
602 |
+
"/resource/", "/subject/", "/course/", "/material/"
|
603 |
+
]
|
604 |
+
|
605 |
+
text_patterns = [
|
606 |
+
"exam", "paper", "test", "question", "past", "download",
|
607 |
+
"assignment", "sample", "study", "material", "notes",
|
608 |
+
"subject", "course", "resource", "pdf", "document",
|
609 |
+
"view", "open", "get", "solution", "answer"
|
610 |
+
]
|
611 |
+
|
612 |
+
# Check URL for patterns
|
613 |
+
if any(pattern in full_url.lower() for pattern in url_patterns):
|
614 |
+
links.add(full_url)
|
615 |
+
continue
|
616 |
+
|
617 |
+
# Check link text for patterns
|
618 |
+
if any(pattern in link_text for pattern in text_patterns):
|
619 |
+
links.add(full_url)
|
620 |
+
continue
|
621 |
+
|
622 |
+
# Check for common file extensions
|
623 |
+
if any(full_url.lower().endswith(ext) for ext in
|
624 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
625 |
+
links.add(full_url)
|
626 |
except Exception as e:
|
627 |
logger.warning(f"Request-based extraction failed: {e}")
|
628 |
|
629 |
+
# Browser-based approach for more thorough extraction or if initial approach was inadequate
|
630 |
+
try:
|
631 |
+
# Check if we need to proceed with browser-based extraction
|
632 |
+
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
|
633 |
+
logger.info("Using browser for enhanced link extraction")
|
634 |
+
|
635 |
+
# Rotate proxy if needed
|
636 |
+
await self.rotate_proxy_if_needed()
|
637 |
|
638 |
+
# Navigate to the page with more natural timing
|
639 |
+
await self.page.goto(url, timeout=45000, wait_until='networkidle')
|
640 |
+
await self.page.wait_for_timeout(random.randint(1000, 2000))
|
641 |
+
|
642 |
+
# Handle captchas if present
|
643 |
+
if not await self.handle_captcha(self.page):
|
644 |
+
logger.warning("Captcha detected, extraction may be limited")
|
645 |
+
|
646 |
+
# Get base URL for resolving relative links
|
647 |
+
parsed_base = urlparse(url)
|
648 |
+
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
649 |
+
|
650 |
+
# Perform natural scrolling to trigger lazy-loaded content
|
651 |
+
page_height = await self.page.evaluate("document.body.scrollHeight")
|
652 |
+
viewport_height = await self.page.evaluate("window.innerHeight")
|
653 |
+
|
654 |
+
for scroll_pos in range(0, page_height, viewport_height // 2):
|
655 |
+
await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})")
|
656 |
+
await self.page.wait_for_timeout(random.randint(300, 800))
|
657 |
+
|
658 |
+
# Scroll back to top
|
659 |
+
await self.page.evaluate("window.scrollTo(0, 0)")
|
660 |
+
await self.page.wait_for_timeout(500)
|
661 |
+
|
662 |
+
# Extract all links with Playwright (better than just anchor tags)
|
663 |
+
all_links = await self.page.evaluate("""
|
664 |
() => {
|
665 |
+
const results = [];
|
666 |
+
|
667 |
+
// Get all anchor tags
|
668 |
const anchors = document.querySelectorAll('a[href]');
|
669 |
for (const a of anchors) {
|
670 |
if (a.href) {
|
671 |
+
results.push({
|
672 |
href: a.href,
|
673 |
+
text: a.innerText || a.textContent || '',
|
674 |
+
isButton: a.classList.contains('btn') || a.role === 'button'
|
675 |
});
|
676 |
}
|
677 |
}
|
678 |
+
|
679 |
+
// Get buttons that might contain links
|
680 |
+
const buttons = document.querySelectorAll('button');
|
681 |
+
for (const btn of buttons) {
|
682 |
+
const onclick = btn.getAttribute('onclick') || '';
|
683 |
+
if (onclick.includes('window.location') || onclick.includes('download')) {
|
684 |
+
results.push({
|
685 |
+
href: '#button',
|
686 |
+
text: btn.innerText || btn.textContent || '',
|
687 |
+
isButton: true,
|
688 |
+
onclick: onclick
|
689 |
+
});
|
690 |
+
}
|
691 |
+
}
|
692 |
+
|
693 |
+
return results;
|
694 |
}
|
695 |
""")
|
696 |
|
697 |
+
# Process the extracted links
|
698 |
+
for link_info in all_links:
|
699 |
href = link_info.get('href', '')
|
700 |
text = link_info.get('text', '').lower()
|
701 |
|
702 |
+
if href and href != '#button':
|
703 |
+
# Check URL patterns
|
704 |
+
url_patterns = [
|
705 |
+
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
706 |
+
"/test/", "/download/", "/files/", "/assignments/",
|
707 |
+
"paper_", "question_", "exam_", "test_", "past_",
|
708 |
+
"assignment_", "sample_", "study_material", "notes_"
|
709 |
+
]
|
710 |
|
711 |
+
# Check text patterns
|
712 |
+
text_patterns = [
|
713 |
+
"exam", "paper", "test", "question", "past", "download",
|
714 |
+
"assignment", "sample", "study", "material", "notes",
|
715 |
+
"pdf", "document", "view", "open", "solution"
|
716 |
+
]
|
717 |
|
718 |
if any(pattern in href.lower() for pattern in url_patterns) or \
|
719 |
+
any(pattern in text for pattern in text_patterns) or \
|
720 |
+
any(href.lower().endswith(ext) for ext in
|
721 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
722 |
links.add(href)
|
723 |
|
724 |
# Check for ASP.NET specific elements that might contain exam links
|
725 |
+
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
|
726 |
for grid in grid_elements:
|
727 |
grid_links = await grid.query_selector_all('a[href]')
|
728 |
for a in grid_links:
|
729 |
href = await a.get_attribute('href')
|
730 |
+
text = await a.text_content()
|
731 |
+
|
732 |
if href:
|
733 |
full_url = href if href.startswith('http') else urljoin(url, href)
|
734 |
links.add(full_url)
|
735 |
|
736 |
+
# Try clicking pagination controls to reveal more content
|
737 |
+
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a')
|
738 |
+
for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons
|
739 |
+
try:
|
740 |
+
# Check if this is a numeric pagination button (more likely to be useful)
|
741 |
+
button_text = await button.text_content()
|
742 |
+
if button_text and button_text.strip().isdigit():
|
743 |
+
logger.info(f"Clicking pagination button: {button_text}")
|
744 |
+
await button.click()
|
745 |
+
await self.page.wait_for_timeout(2000)
|
746 |
+
await self.page.wait_for_load_state('networkidle', timeout=10000)
|
747 |
+
|
748 |
+
# Extract links from this page
|
749 |
+
new_page_links = await self.page.evaluate("""
|
750 |
+
() => {
|
751 |
+
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
|
752 |
+
}
|
753 |
+
""")
|
754 |
+
|
755 |
+
for href in new_page_links:
|
756 |
+
if href and not href.startswith('javascript:'):
|
757 |
+
if any(pattern in href.lower() for pattern in url_patterns) or \
|
758 |
+
any(href.lower().endswith(ext) for ext in
|
759 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
760 |
+
links.add(href)
|
761 |
+
except Exception as e:
|
762 |
+
logger.warning(f"Error clicking pagination button: {e}")
|
763 |
+
|
764 |
+
# Try clicking any controls that might reveal more exam links (more focused approach)
|
765 |
+
show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn')
|
766 |
+
for button in show_buttons:
|
767 |
+
button_text = (await button.text_content() or "").lower()
|
768 |
+
button_value = (await button.get_attribute("value") or "").lower()
|
769 |
+
button_id = (await button.get_attribute("id") or "").lower()
|
770 |
+
|
771 |
+
# Look for buttons that seem likely to reveal file lists
|
772 |
+
promising_terms = ["show", "view", "display", "list", "exam", "paper", "test",
|
773 |
+
"download", "resource", "material", "browse", "file"]
|
774 |
+
|
775 |
+
if any(term in button_text or term in button_value or term in button_id
|
776 |
+
for term in promising_terms):
|
777 |
try:
|
778 |
+
logger.info(f"Clicking button: {button_text or button_value}")
|
779 |
await button.click()
|
780 |
+
await self.page.wait_for_timeout(2000)
|
781 |
+
await self.page.wait_for_load_state('networkidle', timeout=10000)
|
782 |
|
783 |
# Get any new links that appeared
|
784 |
new_links = await self.page.query_selector_all('a[href]')
|
|
|
786 |
href = await a.get_attribute('href')
|
787 |
if href:
|
788 |
full_url = href if href.startswith('http') else urljoin(url, href)
|
789 |
+
|
790 |
+
# Focus on file extensions and patterns
|
791 |
+
if any(full_url.lower().endswith(ext) for ext in
|
792 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \
|
793 |
+
any(pattern in full_url.lower() for pattern in url_patterns):
|
794 |
+
links.add(full_url)
|
795 |
except Exception as e:
|
796 |
logger.warning(f"Error clicking button: {e}")
|
797 |
+
|
798 |
+
# Special handling for ASP.NET PostBack links
|
799 |
+
try:
|
800 |
+
# Find and interact with ASP.NET __doPostBack elements
|
801 |
+
postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]')
|
802 |
+
for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks
|
803 |
+
try:
|
804 |
+
onclick = await element.get_attribute('onclick')
|
805 |
+
if onclick and '__doPostBack' in onclick:
|
806 |
+
element_text = await element.text_content()
|
807 |
+
|
808 |
+
# Only interact with elements that seem likely to contain exam links
|
809 |
+
promising_terms = ["show", "view", "list", "exam", "paper", "test",
|
810 |
+
"download", "resource", "material"]
|
811 |
+
|
812 |
+
if any(term in element_text.lower() for term in promising_terms):
|
813 |
+
logger.info(f"Clicking ASP.NET postback element: {element_text}")
|
814 |
+
|
815 |
+
# Click the element
|
816 |
+
await element.click()
|
817 |
+
await self.page.wait_for_timeout(2000)
|
818 |
+
await self.page.wait_for_load_state('networkidle', timeout=10000)
|
819 |
+
|
820 |
+
# Extract any new links
|
821 |
+
new_links = await self.page.query_selector_all('a[href]')
|
822 |
+
for a in new_links:
|
823 |
+
href = await a.get_attribute('href')
|
824 |
+
if href:
|
825 |
+
full_url = href if href.startswith('http') else urljoin(url, href)
|
826 |
+
if any(full_url.lower().endswith(ext) for ext in
|
827 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
828 |
+
links.add(full_url)
|
829 |
+
except Exception as e:
|
830 |
+
logger.warning(f"Error interacting with postback element: {e}")
|
831 |
except Exception as e:
|
832 |
+
logger.warning(f"Error during postback handling: {e}")
|
833 |
+
|
834 |
+
except Exception as e:
|
835 |
+
logger.error(f"Browser-based extraction failed: {e}")
|
836 |
|
837 |
# Filter links to likely contain exam documents
|
838 |
filtered_links = []
|
839 |
for link in links:
|
840 |
# Common file extensions for exam documents
|
841 |
+
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
842 |
filtered_links.append(link)
|
843 |
continue
|
844 |
|
845 |
# Common paths for exam documents
|
846 |
if any(pattern in link.lower() for pattern in [
|
847 |
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
|
848 |
+
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
|
849 |
+
"/resource/", "/material/", "/notes/", "/subjectmaterial/"
|
850 |
]):
|
851 |
filtered_links.append(link)
|
852 |
|
|
|
860 |
async def extract_downloadable_files(self, url, custom_ext_list):
|
861 |
found_files = []
|
862 |
try:
|
863 |
+
# Rotate proxy if needed
|
864 |
+
await self.rotate_proxy_if_needed()
|
865 |
+
|
866 |
# Special handling for educational exam sites
|
867 |
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
|
868 |
["exam", "test", "pastpaper", "eduexp"]):
|
|
|
883 |
except Exception:
|
884 |
pass
|
885 |
|
886 |
+
# If filename is empty or invalid, create a sensible one
|
887 |
+
if not filename or filename == '/':
|
888 |
+
domain = get_domain(real_url)
|
889 |
+
ext = get_file_extension(real_url, '.pdf')
|
890 |
+
filename = f"file_from_{domain}{ext}"
|
891 |
+
|
892 |
# Get file size
|
893 |
size_str = await self.get_file_size(real_url)
|
894 |
|
|
|
915 |           response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
916 |           if not response:
917 |               return []
918 | +
919 | +         # Check for captchas
920 | +         if not await self.handle_captcha(self.page):
921 | +             logger.warning("Captcha detected, file extraction may be limited")
922 | +
923 | +         # Scroll through the page naturally to trigger lazy loading
924 | +         await self.page.evaluate("""
925 | +             (async () => {
926 | +                 const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
927 | +                 const height = document.body.scrollHeight;
928 | +                 const scrollStep = Math.floor(window.innerHeight / 2);
929 | +
930 | +                 for (let i = 0; i < height; i += scrollStep) {
931 | +                     window.scrollTo(0, i);
932 | +                     await delay(100);
933 | +                 }
934 | +
935 | +                 window.scrollTo(0, 0);
936 | +             })()
937 | +         """)
938 | +         await self.page.wait_for_timeout(1000)
939 |
940 |           final_url = self.page.url
941 |           if '.php' in final_url or 'download' in final_url:
942 |               real_url = await self.extract_real_download_url(final_url)
943 |               if real_url != final_url:
944 | +                 # Try to detect the filename from headers or URL
945 | +                 response = await self.page.request.head(real_url, timeout=15000)
946 | +                 filename = None
947 | +
948 | +                 # Try to get from Content-Disposition header
949 | +                 content_disposition = response.headers.get('Content-Disposition', '')
950 | +                 if 'filename=' in content_disposition:
951 | +                     filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition)
952 | +                     if filename_match:
953 | +                         filename = filename_match.group(1)
954 | +
955 | +                 # If not found in headers, use URL basename
956 | +                 if not filename:
957 | +                     filename = os.path.basename(urlparse(real_url).path)
958 | +                     if not filename or filename == '/':
959 | +                         # Generate a name based on domain
960 | +                         domain = get_domain(real_url)
961 | +                         ext = get_file_extension(real_url, '.pdf')
962 | +                         filename = f"file_from_{domain}{ext}"
963 | +
964 |                   found_files.append({
965 |                       'url': real_url,
966 | +                     'filename': filename,
967 |                       'size': await self.get_file_size(real_url),
968 |                       'metadata': {}
969 |                   })
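The regex above only matches quoted filename= values; a header like attachment; filename=exam.pdf slips through to the URL-basename fallback. If broader parsing is ever wanted, the standard library already handles both forms (a hedged alternative, not what the diff uses):

from email.message import Message

def filename_from_content_disposition(header_value):
    """Return the filename parameter from a Content-Disposition header, quoted or not."""
    msg = Message()
    msg['Content-Disposition'] = header_value
    return msg.get_filename()  # None when no filename parameter is present

print(filename_from_content_disposition('attachment; filename="past_exam.pdf"'))  # past_exam.pdf
print(filename_from_content_disposition('attachment; filename=past_exam.pdf'))    # past_exam.pdf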
1076 |                       'metadata': meta
1077 |                   })
1078 |
1079 | +             # Also check for data-src and data-url attributes (common in lazy-loaded sites)
1080 | +             data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]')
1081 | +             for elem in data_elements:
1082 | +                 for attr in ['data-src', 'data-url', 'data-href', 'data-download']:
1083 | +                     try:
1084 | +                         value = await elem.get_attribute(attr)
1085 | +                         if value and any(value.lower().endswith(ext) for ext in all_exts):
1086 | +                             file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
1087 | +                             found_files.append({
1088 | +                                 'url': file_url,
1089 | +                                 'filename': os.path.basename(file_url.split('?')[0]),
1090 | +                                 'size': await self.get_file_size(file_url),
1091 | +                                 'metadata': {}
1092 | +                             })
1093 | +                     except:
1094 | +                         pass
1095 | +
1096 | +             # Check script tags for JSON data that might contain file URLs
1097 | +             script_elements = soup.find_all('script', type='application/json')
1098 | +             for script in script_elements:
1099 | +                 try:
1100 | +                     json_data = json.loads(script.string)
1101 | +                     # Look for URL patterns in the JSON data
1102 | +                     def extract_urls_from_json(obj, urls_found=None):
1103 | +                         if urls_found is None:
1104 | +                             urls_found = []
1105 | +                         if isinstance(obj, dict):
1106 | +                             for k, v in obj.items():
1107 | +                                 # Check if any key contains url-like terms
1108 | +                                 url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download']
1109 | +                                 if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'):
1110 | +                                     urls_found.append(v)
1111 | +                                 else:
1112 | +                                     extract_urls_from_json(v, urls_found)
1113 | +                         elif isinstance(obj, list):
1114 | +                             for item in obj:
1115 | +                                 extract_urls_from_json(item, urls_found)
1116 | +                         return urls_found
1117 | +
1118 | +                     json_urls = extract_urls_from_json(json_data)
1119 | +                     for json_url in json_urls:
1120 | +                         if any(json_url.lower().endswith(ext) for ext in all_exts):
1121 | +                             found_files.append({
1122 | +                                 'url': json_url,
1123 | +                                 'filename': os.path.basename(json_url.split('?')[0]),
1124 | +                                 'size': await self.get_file_size(json_url),
1125 | +                                 'metadata': {}
1126 | +                             })
1127 | +                 except:
1128 | +                     pass
1129 | +
1130 | +             # Check for hidden download buttons or forms
1131 | +             hidden_elements = await self.page.evaluate("""
1132 | +                 () => {
1133 | +                     const results = [];
1134 | +
1135 | +                     // Check for hidden forms with download actions
1136 | +                     const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]');
1137 | +                     for (const form of forms) {
1138 | +                         const action = form.getAttribute('action') || '';
1139 | +                         results.push({
1140 | +                             type: 'form',
1141 | +                             action: action,
1142 | +                             inputs: Array.from(form.querySelectorAll('input[name]')).map(input => {
1143 | +                                 return {name: input.name, value: input.value};
1144 | +                             })
1145 | +                         });
1146 | +                     }
1147 | +
1148 | +                     // Check for hidden download links/buttons
1149 | +                     const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => {
1150 | +                         const style = window.getComputedStyle(a);
1151 | +                         return (style.display === 'none' || style.visibility === 'hidden') &&
1152 | +                                (a.href.includes('download') || a.href.includes('file'));
1153 | +                     });
1154 | +
1155 | +                     for (const link of hiddenLinks) {
1156 | +                         results.push({
1157 | +                             type: 'link',
1158 | +                             href: link.href,
1159 | +                             text: link.innerText || link.textContent
1160 | +                         });
1161 | +                     }
1162 | +
1163 | +                     return results;
1164 | +                 }
1165 | +             """)
1166 | +
1167 | +             # Process hidden elements
1168 | +             for elem in hidden_elements:
1169 | +                 if elem['type'] == 'link' and 'href' in elem:
1170 | +                     href = elem['href']
1171 | +                     if any(href.lower().endswith(ext) for ext in all_exts):
1172 | +                         found_files.append({
1173 | +                             'url': href,
1174 | +                             'filename': os.path.basename(href.split('?')[0]),
1175 | +                             'size': await self.get_file_size(href),
1176 | +                             'metadata': {}
1177 | +                         })
1178 | +
1179 | +             # Deduplicate files by URL
1180 |               seen_urls = set()
1181 |               unique_files = []
1182 |               for f in found_files:
1183 |                   if f['url'] not in seen_urls:
1184 |                       seen_urls.add(f['url'])
1185 |                       unique_files.append(f)
1186 | +
1187 |               return unique_files
1188 |           except Exception as e:
1189 |               logger.error(f"Error extracting files from {url}: {e}")
1190 | +             traceback.print_exc()
1191 |               return []
1192 |
1193 |       async def download_file(self, file_info, save_dir, referer):
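The recursive JSON walk above is easiest to see with a concrete payload; this self-contained rerun of the same idea (illustrative input, simplified function) shows what it collects:

def find_urls(obj, found=None):
    """Collect http(s) string values stored under url-like keys, recursively."""
    found = [] if found is None else found
    url_keys = ('url', 'href', 'src', 'link', 'file', 'path', 'download')
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(v, str) and v.startswith('http') and any(u in k.lower() for u in url_keys):
                found.append(v)
            else:
                find_urls(v, found)
    elif isinstance(obj, list):
        for item in obj:
            find_urls(item, found)
    return found

sample = {"page": {"downloadUrl": "https://example.com/past_exam.pdf",
                   "items": [{"fileHref": "https://example.com/answers.docx"}]}}
print(find_urls(sample))  # ['https://example.com/past_exam.pdf', 'https://example.com/answers.docx']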
1226 |                   logger.warning("All standard methods failed, attempting force download")
1227 |                   result_path = await self.force_download_viewonly(file_info, path)
1228 |                   return result_path if result_path else None
1229 | +
1230 | +             # Rotate proxy if needed
1231 | +             await self.rotate_proxy_if_needed()
1232 | +
1233 | +             # Try with direct requests first (faster)
1234 | +             try:
1235 | +                 headers = {
1236 | +                     'User-Agent': get_random_user_agent(),
1237 | +                     'Accept': '*/*',
1238 | +                     'Accept-Encoding': 'gzip, deflate, br',
1239 | +                     'Referer': referer,
1240 | +                     'DNT': '1'
1241 | +                 }
1242 | +
1243 | +                 with requests.get(file_url, headers=headers, stream=True, timeout=30) as response:
1244 | +                     if response.status_code == 200:
1245 | +                         # Check content type to verify it's not HTML/error page
1246 | +                         content_type = response.headers.get('Content-Type', '')
1247 | +                         if 'text/html' in content_type and not file_url.endswith('.html'):
1248 | +                             logger.warning(f"Received HTML instead of expected file: {file_url}")
1249 | +                         else:
1250 | +                             with open(path, 'wb') as f:
1251 | +                                 for chunk in response.iter_content(chunk_size=8192):
1252 | +                                     if chunk:
1253 | +                                         f.write(chunk)
1254 | +
1255 | +                             # Verify file was downloaded correctly
1256 | +                             if os.path.exists(path) and os.path.getsize(path) > 0:
1257 | +                                 return path
1258 | +             except Exception as e:
1259 | +                 logger.warning(f"Direct download failed: {e}, trying browser approach")
1260 |
1261 | +             # Original code for non-Google Drive downloads using Playwright
1262 |               async with self.context.new_page() as page:
1263 |                   headers = {
1264 |                       'Accept': '*/*',
1265 |                       'Accept-Encoding': 'gzip, deflate, br',
1266 |                       'Referer': referer
1267 |                   }
1268 | +
1269 | +                 # Try to download with timeout protection
1270 | +                 try:
1271 | +                     response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000)
1272 | +                     if response.status == 200:
1273 | +                         content = await response.body()
1274 | +                         with open(path, 'wb') as f:
1275 | +                             f.write(content)
1276 | +                         return path
1277 | +                     else:
1278 | +                         logger.error(f"Download failed with status {response.status}: {file_url}")
1279 | +
1280 | +                         # Try to extract error information
1281 | +                         error_info = await response.text()
1282 | +                         logger.debug(f"Error response: {error_info[:200]}...")
1283 | +
1284 | +                         # Check if this might be a captcha or login issue
1285 | +                         if detect_captcha(error_info):
1286 | +                             logger.warning("Captcha detected during download")
1287 | +                             # For HF Spaces, we can't implement browser-based captcha solving here
1288 | +                             # Just log the issue for now
1289 | +                 except PlaywrightTimeoutError:
1290 | +                     logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}")
1291 | +
1292 | +                 # Try an alternative approach - using the browser's download manager
1293 | +                 try:
1294 | +                     logger.info("Trying browser download manager approach")
1295 | +                     download_promise = page.wait_for_event("download")
1296 | +                     await page.goto(file_url, timeout=60000)
1297 | +
1298 | +                     # Wait for download to start (with timeout)
1299 | +                     download = await download_promise
1300 | +                     await download.save_as(path)
1301 | +
1302 | +                     if os.path.exists(path) and os.path.getsize(path) > 0:
1303 | +                         return path
1304 | +                 except Exception as e:
1305 | +                     logger.error(f"Browser download manager approach failed: {e}")
1306 | +
1307 | +                 return None
1308 |           except Exception as e:
1309 |               logger.error(f"Error downloading {file_url}: {e}")
1310 |               return None
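detect_captcha() is referenced in the error path above but defined elsewhere in the file; a plausible minimal version is just a marker scan over the returned HTML (a sketch under that assumption, not the app's own definition):

def detect_captcha(html_text):
    """Rough heuristic: does this error page look like a captcha / bot challenge?"""
    markers = ('captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-challenge', 'are you a robot')
    text = (html_text or '').lower()
    return any(marker in text for marker in markers)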
1334 |
1335 |           logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
1336 |
1337 | +         # Create a dedicated browser instance with better resolution and stealth
1338 | +         browser_args = [
1339 | +             '--no-sandbox',
1340 | +             '--disable-setuid-sandbox',
1341 | +             '--disable-dev-shm-usage',
1342 | +             '--disable-web-security',
1343 | +             '--disable-features=IsolateOrigins,site-per-process',
1344 | +             '--disable-site-isolation-trials',
1345 | +             '--disable-blink-features=AutomationControlled'  # Anti-detection
1346 | +         ]
1347 | +
1348 |           browser = await self.playwright.chromium.launch(
1349 |               headless=True,
1350 | +             args=browser_args
1351 |           )
1352 |
1353 |           # Use higher resolution for better quality
1358 |               accept_downloads=True  # Critical for the download workflow
1359 |           )
1360 |
1361 | +         # Add anti-detection script
1362 | +         await context.add_init_script("""
1363 | +             () => {
1364 | +                 Object.defineProperty(navigator, 'webdriver', {
1365 | +                     get: () => false,
1366 | +                 });
1367 | +
1368 | +                 // Change plugins
1369 | +                 Object.defineProperty(navigator, 'plugins', {
1370 | +                     get: () => [1, 2, 3, 4, 5].map(() => ({
1371 | +                         lengthComputable: true,
1372 | +                         loaded: 100,
1373 | +                         total: 100
1374 | +                     }))
1375 | +                 });
1376 | +
1377 | +                 // Handle languages
1378 | +                 Object.defineProperty(navigator, 'languages', {
1379 | +                     get: () => ['en-US', 'en', 'es']
1380 | +                 });
1381 | +
1382 | +                 // Modify hardware concurrency
1383 | +                 Object.defineProperty(navigator, 'hardwareConcurrency', {
1384 | +                     get: () => 4
1385 | +                 });
1386 | +             }
1387 | +         """)
1388 | +
1389 |           page = await context.new_page()
1390 |
1391 |           try:
1393 |               logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
1394 |               await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
1395 |               await page.wait_for_load_state('networkidle')
1396 | +
1397 | +             # Check for any barriers or permissions issues
1398 | +             content = await page.content()
1399 | +             if "the owner has not granted you permission to" in content:
1400 | +                 logger.warning("Permission denied error detected")
1401 | +
1402 | +             # Randomized wait to appear more human-like
1403 | +             await page.wait_for_timeout(random.randint(3000, 7000))
1404 |
1405 |               # Create temp directory
1406 |               temp_dir = tempfile.mkdtemp()
1409 |               if file_type.lower() == 'pdf':
1410 |                   # Use the improved scrolling and detection approach
1411 |
1412 | +                 # Perform some natural mouse movements and scrolling
1413 | +                 await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400))
1414 | +                 await page.wait_for_timeout(random.randint(500, 1000))
1415 | +
1416 | +                 # Estimate number of pages
1417 |                   estimated_pages = await page.evaluate("""
1418 |                       () => {
1419 |                           // Method 1: Check page counter text
1443 |
1444 |                   logger.info(f"Estimated {estimated_pages} pages in PDF")
1445 |
1446 | +                 # Initial scroll to trigger lazy loading
1447 | +                 logger.info("Initial scroll to bottom to trigger lazy loading...")
1448 |                   await page.keyboard.press("End")
1449 |                   await page.wait_for_timeout(3000)
1450 |
1451 |                   # Scroll page by page to ensure all pages are loaded
1452 | +                 logger.info("Scrolling page by page...")
1453 |                   max_attempts = min(estimated_pages * 3, 300)
1454 |                   attempt = 0
1455 |                   prev_blob_count = 0
1467 |                           logger.info("All pages appear to be loaded.")
1468 |                           break
1469 |
1470 | +                     # Alternate between PageDown and End keys for more natural scrolling
1471 | +                     if attempt % 3 == 0:
1472 | +                         await page.keyboard.press("End")
1473 | +                     else:
1474 | +                         await page.keyboard.press("PageDown")
1475 | +
1476 | +                     # Randomized wait times
1477 | +                     await page.wait_for_timeout(random.randint(1500, 3000))
1478 | +
1479 | +                     # Move mouse randomly to appear more human-like
1480 | +                     if attempt % 4 == 0:
1481 | +                         await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800))
1482 | +
1483 |                       prev_blob_count = blob_count
1484 |                       attempt += 1
1485 |
1545 |
1546 |                   if not result.get('success', False):
1547 |                       logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
1548 | +
1549 | +                     # Try fallback approach - screenshot method
1550 | +                     logger.info("Trying fallback screenshot method...")
1551 | +
1552 | +                     # Navigate back to the first page
1553 | +                     await page.evaluate("""
1554 | +                         () => {
1555 | +                             // Find and click the "first page" button if available
1556 | +                             const buttons = Array.from(document.querySelectorAll('button'));
1557 | +                             const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page'));
1558 | +                             if (firstPageBtn) firstPageBtn.click();
1559 | +                         }
1560 | +                     """)
1561 | +                     await page.wait_for_timeout(1000)
1562 | +
1563 | +                     # Create a PDF by taking screenshots of each page
1564 | +                     screenshots = []
1565 | +                     current_page = 1
1566 | +                     max_pages = estimated_pages
1567 | +
1568 | +                     # Create a PDF using the reportlab package
1569 | +                     while current_page <= max_pages:
1570 | +                         screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png")
1571 | +
1572 | +                         # Try to find the current page element
1573 | +                         page_elem = await page.query_selector('.drive-viewer-paginated-page')
1574 | +                         if page_elem:
1575 | +                             await page_elem.screenshot(path=screenshot_path)
1576 | +                         else:
1577 | +                             # Fallback to full page screenshot
1578 | +                             await page.screenshot(path=screenshot_path)
1579 | +
1580 | +                         screenshots.append(screenshot_path)
1581 | +
1582 | +                         # Try to navigate to next page
1583 | +                         next_btn = await page.query_selector('button[aria-label="Next page"]')
1584 | +                         if next_btn:
1585 | +                             is_disabled = await next_btn.get_attribute('disabled')
1586 | +                             if is_disabled:
1587 | +                                 logger.info(f"Reached end of document at page {current_page}")
1588 | +                                 break
1589 | +
1590 | +                             await next_btn.click()
1591 | +                             await page.wait_for_timeout(1000)
1592 | +                             current_page += 1
1593 | +                         else:
1594 | +                             break
1595 | +
1596 | +                     # Create PDF from screenshots
1597 | +                     if screenshots:
1598 | +                         first_img = Image.open(screenshots[0])
1599 | +                         width, height = first_img.size
1600 | +
1601 | +                         c = canvas.Canvas(save_path, pagesize=(width, height))
1602 | +                         for screenshot in screenshots:
1603 | +                             img = Image.open(screenshot)
1604 | +                             c.drawImage(screenshot, 0, 0, width, height)
1605 | +                             c.showPage()
1606 | +                         c.save()
1607 | +
1608 | +                         # Clean up screenshots
1609 | +                         for screenshot in screenshots:
1610 | +                             os.remove(screenshot)
1611 | +
1612 | +                         return save_path
1613 | +
1614 |                       return None
1615 |
1616 |                   logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
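In the screenshot fallback, reportlab sizes pages in points while the screenshots are measured in pixels, so the generated pages come out physically oversized (though visually complete). Pillow can assemble the same screenshots into a multi-page PDF with less ceremony; a sketch of that alternative, not what the diff does:

from PIL import Image

def screenshots_to_pdf(image_paths, pdf_path):
    """Write a list of page screenshots out as one multi-page PDF."""
    pages = [Image.open(p).convert("RGB") for p in image_paths]
    pages[0].save(pdf_path, save_all=True, append_images=pages[1:])
    for page in pages:
        page.close()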
1712 |
1713 |           # Try standard approaches for non-view-only files
1714 |           try:
1715 | +             # Try direct download link first (fastest)
1716 | +             direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
1717 | +
1718 | +             # Add anti-bot headers
1719 | +             headers = {
1720 | +                 'User-Agent': get_random_user_agent(),
1721 | +                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
1722 | +                 'Accept-Language': 'en-US,en;q=0.9',
1723 | +                 'Referer': 'https://drive.google.com/',
1724 | +                 'DNT': '1'
1725 | +             }
1726 | +
1727 | +             # Try with streaming to handle larger files
1728 | +             with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r:
1729 | +                 if r.status_code == 200:
1730 | +                     # Check if we got HTML instead of the file
1731 | +                     content_type = r.headers.get('Content-Type', '')
1732 | +                     if 'text/html' in content_type and not file_id.endswith('.html'):
1733 | +                         logger.warning("Received HTML instead of file, trying with session cookies")
1734 | +                     else:
1735 | +                         # Looks like we got the actual file
1736 | +                         with open(save_path, 'wb') as f:
1737 | +                             for chunk in r.iter_content(chunk_size=8192):
1738 | +                                 if chunk:
1739 | +                                     f.write(chunk)
1740 | +
1741 | +                         # Verify file exists and has content
1742 | +                         if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
1743 | +                             logger.info("Direct download successful")
1744 | +                             return True
1745 | +
1746 |               # Try with requests and session cookies
1747 |               session = requests.Session()
1748 |               session.headers.update({'User-Agent': get_random_user_agent()})
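The direct uc?export=download URL above needs a bare file id; when only a share link is available, the id can usually be pulled out of the common Drive URL shapes (a sketch; the app may already have its own parser for this):

import re

def extract_drive_file_id(url):
    """Return the file id from typical Google Drive URL formats, or None."""
    for pattern in (r'/file/d/([a-zA-Z0-9_-]+)', r'[?&]id=([a-zA-Z0-9_-]+)', r'/folders/([a-zA-Z0-9_-]+)'):
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

print(extract_drive_file_id("https://drive.google.com/file/d/1AbC_dEf-123/view"))  # 1AbC_dEf-123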
1785 |           except Exception as e:
1786 |               logger.warning(f"Requests session download failed: {e}")
1787 |
1788 | +         # Try browser-based approach as last resort
1789 | +         try:
1790 | +             async with self.context.new_page() as page:
1791 | +                 # Visit the file view page first to get cookies
1792 | +                 await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
1793 | +                 await page.wait_for_timeout(3000)
1794 | +
1795 | +                 # Set up download event listener
1796 | +                 download_promise = page.wait_for_event("download")
1797 | +
1798 | +                 # Try to trigger the download button click
1799 | +                 download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]')
1800 | +                 if download_button:
1801 | +                     await download_button.click()
1802 | +
1803 | +                     # Wait for download to start
1804 | +                     try:
1805 | +                         download = await download_promise
1806 | +                         await download.save_as(save_path)
1807 | +                         return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1808 | +                     except Exception as e:
1809 | +                         logger.error(f"Error during browser download: {e}")
1810 | +                         return False
1811 | +                 else:
1812 | +                     # Try the export download URL
1813 | +                     await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000)
1814 | +
1815 | +                     # Look for and click any download buttons or links
1816 | +                     download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")')
1817 | +                     for elem in download_elements:
1818 | +                         try:
1819 | +                             await elem.click()
1820 | +                             # Wait a bit to see if download starts
1821 | +                             try:
1822 | +                                 download = await download_promise
1823 | +                                 await download.save_as(save_path)
1824 | +                                 return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1825 | +                             except:
1826 | +                                 pass
1827 | +                         except:
1828 | +                             continue
1829 | +         except Exception as e:
1830 | +             logger.error(f"Browser-based download attempt failed: {e}")
1831 | +
1832 | +         logger.warning("All standard download methods failed")
1833 |           return False
1834 |
1835 |       async def download_viewonly_pdf_with_js(self, file_id, save_path):
1836 |           """Download view-only PDF using the enhanced blob image caching technique"""
1837 |           try:
1838 | +             # Create a dedicated browser instance with stealth capabilities
1839 | +             browser_args = [
1840 | +                 '--no-sandbox',
1841 | +                 '--disable-setuid-sandbox',
1842 | +                 '--disable-dev-shm-usage',
1843 | +                 '--disable-web-security',
1844 | +                 '--disable-blink-features=AutomationControlled'  # Anti-detection
1845 | +             ]
1846 | +
1847 |               browser = await self.playwright.chromium.launch(
1848 |                   headless=True,
1849 | +                 args=browser_args
1850 |               )
1851 |
1852 | +             # Setup stealth context
1853 |               context = await browser.new_context(
1854 |                   viewport={'width': 1600, 'height': 1200},
1855 |                   user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1856 | +                 accept_downloads=True,  # Critical for handling the download event
1857 | +                 ignore_https_errors=True
1858 |               )
1859 |
1860 | +             # Add stealth script
1861 | +             await context.add_init_script("""
1862 | +                 () => {
1863 | +                     Object.defineProperty(navigator, 'webdriver', {
1864 | +                         get: () => false,
1865 | +                     });
1866 | +
1867 | +                     // Change plugins and languages to appear more human
1868 | +                     Object.defineProperty(navigator, 'plugins', {
1869 | +                         get: () => [1, 2, 3, 4, 5].map(() => ({
1870 | +                             lengthComputable: true,
1871 | +                             loaded: 100,
1872 | +                             total: 100
1873 | +                         }))
1874 | +                     });
1875 | +
1876 | +                     Object.defineProperty(navigator, 'languages', {
1877 | +                         get: () => ['en-US', 'en', 'es']
1878 | +                     });
1879 | +                 }
1880 | +             """)
1881 | +
1882 |               page = await context.new_page()
1883 |
1884 |               try:
1885 | +                 # Step 1: Navigate to the file with human-like behavior
1886 |                   logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
1887 |                   await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
1888 |                   await page.wait_for_load_state('networkidle')
1889 | +
1890 | +                 # Perform human-like interactions
1891 | +                 await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300))
1892 | +                 await page.wait_for_timeout(random.randint(2000, 5000))
1893 |
1894 |                   # Step 2: Estimate the number of pages
1895 |                   estimated_pages = await page.evaluate("""
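The navigator-patching init script above is now pasted in two places (here and in the force-download path earlier in the diff). Hoisting it into one module-level constant keeps the copies from drifting; a sketch of that refactor, not something the diff itself does:

STEALTH_INIT_SCRIPT = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en', 'es'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
}
"""

# Each call site then becomes:
# await context.add_init_script(STEALTH_INIT_SCRIPT)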
1922 |                   await page.keyboard.press("End")
1923 |                   await page.wait_for_timeout(3000)
1924 |
1925 | +                 # Step 4: Wait for all pages to load with better feedback and randomization
1926 | +                 logger.info("Scrolling through document to load all pages...")
1927 | +                 max_attempts = min(estimated_pages * 3, 300)
1928 |                   attempt = 0
1929 |                   prev_blob_count = 0
1930 | +                 consecutive_same_count = 0
1931 |
1932 |                   while attempt < max_attempts:
1933 |                       # Count blob images (which are the PDF pages)
1939 |
1940 |                       logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
1941 |
1942 | +                     # Check if we've loaded all pages or if we're stuck
1943 | +                     if blob_count >= estimated_pages:
1944 | +                         logger.info(f"All {estimated_pages} pages appear to be loaded.")
1945 |                           break
1946 |
1947 | +                     if blob_count == prev_blob_count:
1948 | +                         consecutive_same_count += 1
1949 | +                         if consecutive_same_count >= 5 and blob_count > 0:
1950 | +                             logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.")
1951 | +                             break
1952 | +                     else:
1953 | +                         consecutive_same_count = 0
1954 | +
1955 | +                     # Mix up the scrolling approach for more human-like behavior
1956 | +                     scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"])
1957 | +
1958 | +                     if scroll_action == "PageDown":
1959 | +                         await page.keyboard.press("PageDown")
1960 | +                     elif scroll_action == "End":
1961 | +                         await page.keyboard.press("End")
1962 | +                     elif scroll_action == "ArrowDown":
1963 | +                         # Press arrow down multiple times
1964 | +                         for _ in range(random.randint(5, 15)):
1965 | +                             await page.keyboard.press("ArrowDown")
1966 | +                             await page.wait_for_timeout(random.randint(50, 150))
1967 | +                     else:  # mouse
1968 | +                         # Scroll using mouse wheel
1969 | +                         current_y = random.randint(300, 700)
1970 | +                         await page.mouse.move(x=random.randint(300, 800), y=current_y)
1971 | +                         await page.mouse.wheel(0, random.randint(300, 800))
1972 | +
1973 | +                     # Random wait between scrolls
1974 | +                     await page.wait_for_timeout(random.randint(1000, 3000))
1975 | +
1976 |                       prev_blob_count = blob_count
1977 |                       attempt += 1
1978 |
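The elided "count blob images" step (diff lines 1934-1938) presumably boils down to counting img elements whose src is a blob: URL; an evaluate call along these lines would produce the blob_count this loop compares against (an assumption about code not shown in this hunk):

blob_count = await page.evaluate(
    "() => document.querySelectorAll('img[src^=\"blob:\"]').length"
)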
1992 |                       try {
1993 |                           let pdf = new jsPDF();
1994 |                           let imgs = document.getElementsByTagName("img");
1995 |                           let validImages = [];
1996 | +
1997 | +                         // First collect all valid blob images
1998 |                           for (let i = 0; i < imgs.length; i++) {
1999 |                               let img = imgs[i];
2000 |                               if (!/^blob:/.test(img.src)) continue;
2002 |                               validImages.push(img);
2003 |                           }
2004 |
2005 | +                         // Sort by position in the document
2006 |                           validImages.sort((a, b) => {
2007 |                               const rectA = a.getBoundingClientRect();
2008 |                               const rectB = b.getBoundingClientRect();
2011 |
2012 |                           console.log(`Found ${validImages.length} valid page images to add to PDF`);
2013 |
2014 | +                         let added = 0;
2015 |                           // Process each image as a page
2016 |                           for (let i = 0; i < validImages.length; i++) {
2017 |                               let img = validImages[i];
2326 |               logger.info(f"Found {len(links)} sublinks with specialized method")
2327 |               return list(links)[:limit]
2328 |
2329 | +         # Rotate proxy if needed
2330 | +         await self.rotate_proxy_if_needed()
2331 | +
2332 |           # Standard sublink extraction for all sites
2333 |           await self.page.goto(url, timeout=30000, wait_until='networkidle')
2334 |
2337 |           base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
2338 |           path_base = os.path.dirname(parsed_base.path)
2339 |
2340 | +         # Perform initial scrolling to load lazy content
2341 | +         await self.page.evaluate("""
2342 | +             async () => {
2343 | +                 const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
2344 | +                 const height = document.body.scrollHeight;
2345 | +                 const step = Math.floor(window.innerHeight / 2);
2346 | +
2347 | +                 for (let i = 0; i < height; i += step) {
2348 | +                     window.scrollTo(0, i);
2349 | +                     await delay(150);
2350 | +                 }
2351 | +
2352 | +                 window.scrollTo(0, 0);
2353 | +             }
2354 | +         """)
2355 | +         await self.page.wait_for_timeout(1000)
2356 | +
2357 |           # Check if page has ASP.NET elements which might need special handling
2358 |           is_aspnet = await self.page.evaluate('''
2359 |               () => {
2476 |           except Exception as e:
2477 |               logger.warning(f"Error with postback: {e}")
2478 |
2479 | +         # Look for pagination controls and try to navigate through them
2480 | +         pagination_elements = await self.page.query_selector_all(
2481 | +             'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]'
2482 | +         )
2483 | +
2484 | +         # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops)
2485 | +         for i in range(min(5, len(pagination_elements))):
2486 | +             try:
2487 | +                 # Focus on elements that look like "next page" buttons
2488 | +                 el = pagination_elements[i]
2489 | +                 el_text = await el.text_content() or ""
2490 | +
2491 | +                 # Only click if this looks like a pagination control
2492 | +                 if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip():
2493 | +                     logger.info(f"Clicking pagination control: {el_text}")
2494 | +                     await el.click()
2495 | +                     await self.page.wait_for_timeout(2000)
2496 | +                     await self.page.wait_for_load_state('networkidle', timeout=5000)
2497 | +
2498 | +                     # Get new links from this page
2499 | +                     await self.extract_all_link_types(links, base_url, path_base)
2500 | +             except Exception as e:
2501 | +                 logger.warning(f"Error clicking pagination: {e}")
2502 | +
2503 | +         # Check for hidden links that might be revealed by JavaScript
2504 | +         hidden_links = await self.page.evaluate("""
2505 | +             () => {
2506 | +                 // Try to execute common JavaScript patterns that reveal hidden content
2507 | +                 try {
2508 | +                     // Common patterns used in websites to initially hide content
2509 | +                     const hiddenContainers = document.querySelectorAll(
2510 | +                         '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]'
2511 | +                     );
2512 | +
2513 | +                     // Attempt to make them visible
2514 | +                     hiddenContainers.forEach(el => {
2515 | +                         el.style.display = 'block';
2516 | +                         el.style.visibility = 'visible';
2517 | +                         el.classList.remove('hidden', 'hide');
2518 | +                     });
2519 | +
2520 | +                     // Return any newly visible links
2521 | +                     return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
2522 | +                 } catch (e) {
2523 | +                     return [];
2524 | +                 }
2525 | +             }
2526 | +         """)
2527 | +
2528 | +         # Add any newly discovered links
2529 | +         for href in hidden_links:
2530 | +             if href and not href.startswith('javascript:'):
2531 | +                 links.add(href)
2532 | +
2533 |           logger.info(f"Found {len(links)} sublinks")
2534 |           return list(links)[:limit]
2535 |
2594 |                       links_set.add(full_url)
2595 |                   except Exception:
2596 |                       pass
2597 | +
2598 | +         # Extract links from JSON data embedded in the page
2599 | +         script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]')
2600 | +         for script in script_elements:
2601 | +             try:
2602 | +                 script_content = await script.text_content()
2603 | +                 if script_content:
2604 | +                     # Look for URLs in the JSON content
2605 | +                     urls = re.findall(r'(https?://[^\'"]+)', script_content)
2606 | +                     for url in urls:
2607 | +                         links_set.add(url)
2608 | +             except Exception:
2609 | +                 pass
2610 |
2611 |       def resolve_relative_url(self, relative_url, base_url, path_base):
2612 |           """Properly resolve relative URLs considering multiple formats"""
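resolve_relative_url() handles several URL shapes by hand; most of that work can also be delegated to urllib, which makes a useful cross-check for the custom logic (a sketch, not a drop-in replacement):

from urllib.parse import urljoin

def resolve_relative_url_simple(relative_url, base_url, path_base):
    """urljoin-based take on relative URL resolution."""
    if relative_url.startswith(('http://', 'https://')):
        return relative_url
    if relative_url.startswith('/'):
        return urljoin(base_url, relative_url)
    return urljoin(f"{base_url}{path_base}/", relative_url)

print(resolve_relative_url_simple("exam.pdf", "https://school.example", "/docs"))
# https://school.example/docs/exam.pdf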
2657 |           total_links = len(sublinks)
2658 |           progress_text.text(f"Found {total_links} sublinks to process")
2659 |
2660 | +         # Always include files from the main page, regardless of sublinks
2661 | +         all_files = main_files
2662 | +
2663 |           if not sublinks:
2664 |               progress_bar.progress(1.0)
2665 | +             return all_files
2666 |
2667 |           # Process each sublink
2668 |           for i, sublink in enumerate(sublinks, 1):
2669 |               progress = i / total_links
2670 |               progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
2734 |           sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
2735 |           use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
2736 |           proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
2737 | +         use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox")
2738 |
2739 |       with st.expander("Google Drive Integration", expanded=False):
2740 |           if st.button("Start Google Sign-In", key="google_signin_btn"):
2745 |               creds, msg = exchange_code_for_credentials(auth_code)
2746 |               st.session_state.google_creds = creds
2747 |               st.write(msg)
2748 | +
2749 | +     with st.expander("Advanced Browser Settings", expanded=False):
2750 | +         # Captcha handling options
2751 | +         st.write("**Captcha Handling**")
2752 | +         captcha_option = st.radio(
2753 | +             "Captcha Detection:",
2754 | +             ["Auto-detect only", "Manual solve (shows captcha)"],
2755 | +             index=0,
2756 | +             key="captcha_option"
2757 | +         )
2758 | +
2759 | +         # Proxy rotation settings
2760 | +         st.write("**Proxy Rotation**")
2761 | +         enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation")
2762 | +         if enable_rotation:
2763 | +             PROXY_ROTATION_CONFIG["enabled"] = True
2764 | +             proxy_list = st.text_area(
2765 | +                 "Proxy List (one per line)",
2766 | +                 placeholder="http://proxy1:port\nhttp://proxy2:port",
2767 | +                 key="proxy_list"
2768 | +             )
2769 | +             if proxy_list:
2770 | +                 PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()]
2771 | +             rotation_interval = st.slider(
2772 | +                 "Rotation Interval (# of requests)",
2773 | +                 min_value=1,
2774 | +                 max_value=50,
2775 | +                 value=10,
2776 | +                 key="rotation_interval"
2777 | +             )
2778 | +             PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
2779 |
2780 |       if mode == "Manual URL":
2781 |           st.header("Manual URL Mode")
2790 |               st.warning("Invalid extensions ignored. Use format like '.csv'.")
2791 |
2792 |           @st.cache_resource
2793 | +         def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
2794 |               async def _run():
2795 | +                 async with DownloadManager(
2796 | +                     use_proxy=use_proxy_val,
2797 | +                     proxy=proxy_val,
2798 | +                     use_stealth=use_stealth_val
2799 | +                 ) as dm:
2800 |                       files = await dm.deep_search(url, ext_list, max_links, timeout_val)
2801 |                       return files
2802 |               return asyncio.run(_run())
2803 |
2804 |           with st.spinner("Searching for files..."):
2805 |               files = run_deep_search(url, valid_ext_list, max_sublinks,
2806 | +                                     sublink_timeout, use_proxy, proxy, use_stealth)
2807 |
2808 |           if files:
2809 |               st.session_state.discovered_files = files
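Outside of Streamlit, the same manager can be driven directly; a minimal usage sketch mirroring how run_deep_search() constructs it (the URL and extension list here are placeholders, and the DownloadManager class defined above must be in scope):

import asyncio

async def _demo():
    async with DownloadManager(use_proxy=False, proxy=None, use_stealth=True) as dm:
        files = await dm.deep_search("https://example.com/exams", [".pdf"], 10, 30)
        print(f"Discovered {len(files)} candidate files")

# asyncio.run(_demo())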
2866 |               progress_bar = st.progress(0)
2867 |               status_text = st.empty()
2868 |
2869 | +             async with DownloadManager(
2870 | +                 use_proxy=use_proxy,
2871 | +                 proxy=proxy,
2872 | +                 use_stealth=use_stealth
2873 | +             ) as dm:
2874 |                   for i, idx in enumerate(selected_indices):
2875 |                       progress = (i + 1) / len(selected_indices)
2876 |                       file_info = files[idx]
2951 |       if st.button("Search", key="search_btn"):
2952 |           if query:
2953 |               async def run_search():
2954 | +                 async with DownloadManager(
2955 | +                     use_proxy=use_proxy,
2956 | +                     proxy=proxy,
2957 | +                     query=query,
2958 | +                     num_results=num_results,
2959 | +                     use_stealth=use_stealth
2960 | +                 ) as dm:
2961 |                       with st.spinner("Searching..."):
2962 |                           urls = await dm.search_bing()
2963 |                       if urls:
2988 |           valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
2989 |
2990 |           @st.cache_resource
2991 | +         def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
2992 |               async def _run():
2993 | +                 async with DownloadManager(
2994 | +                     use_proxy=use_proxy_val,
2995 | +                     proxy=proxy_val,
2996 | +                     use_stealth=use_stealth_val
2997 | +                 ) as dm:
2998 |                       files = await dm.deep_search(url, ext_list, max_links, timeout_val)
2999 |                       return files
3000 |               return asyncio.run(_run())
3001 |
3002 |           with st.spinner("Searching for files..."):
3003 |               files = run_deep_search(url, valid_ext_list, max_sublinks,
3004 | +                                     sublink_timeout, use_proxy, proxy, use_stealth)
3005 |
3006 |           if files:
3007 |               st.session_state.discovered_files = files
3025 |
3026 |           with st.spinner("Downloading view-only document... (this may take a minute)"):
3027 |               async def download_viewonly():
3028 | +                 async with DownloadManager(use_stealth=use_stealth) as dm:
3029 |                       file_info = {
3030 |                           'url': f"https://drive.google.com/file/d/{file_id}/view",
3031 |                           'filename': f"gdrive_{file_id}.pdf",
3038 |
3039 |               if result:
3040 |                   st.success("Document downloaded successfully!")
3041 | +
3042 | +                 # Provide download button
3043 |                   with open(result, "rb") as f:
3044 |                       file_bytes = f.read()
3045 | +
3046 |                   st.download_button(
3047 |                       label="Download PDF",
3048 |                       data=file_bytes,
3049 | +                     file_name=f"gdrive_{file_id}.pdf",
3050 |                       mime="application/pdf"
3051 |                   )
3052 |               else:
3054 |
3055 |       # Add footer with attribution
3056 |       st.markdown('---')
3057 | +     st.markdown('Created by [Euler314](https://github.com/euler314)')
3058 |
3059 |   if __name__ == "__main__":
3060 |       main()