import os import re import random import asyncio import logging import traceback import tempfile import shutil import json import time from urllib.parse import urlparse, urljoin, unquote, parse_qs from io import BytesIO from bs4 import BeautifulSoup import PyPDF2 import requests from PIL import Image from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError from app.utils import ( get_random_user_agent, sizeof_fmt, get_domain, is_download_link, normalize_download_url, detect_captcha, USER_AGENTS, STEALTH_SETTINGS, PROXY_ROTATION_CONFIG ) logger = logging.getLogger(__name__) class DownloadManager: def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): self.use_proxy = use_proxy self.proxy = proxy self.query = query self.num_results = num_results self.playwright = None self.browser = None self.context = None self.page = None self.use_stealth = use_stealth self.proxy_rotation = proxy_rotation self.request_count = 0 self.captcha_detected = False self.download_timeout = 300 # 5 minutes timeout for downloads # Track visited URLs to avoid revisiting the same URL multiple times self.visited_urls = set() # Track successfully downloaded files to avoid redownloading self.downloaded_files = set() async def __aenter__(self): self.playwright = await async_playwright().start() # Prepare browser args with stealth settings browser_args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--no-zygote', '--single-process', '--disable-web-security', '--disable-features=IsolateOrigins', '--disable-site-isolation-trials' ] # Add stealth-specific args if self.use_stealth: browser_args.extend([ '--disable-blink-features=AutomationControlled', '--disable-features=IsolateOrigins,site-per-process', '--disable-webgl', '--disable-webrtc' ]) # Setup browser options opts = { "headless": True, "args": browser_args } # Configure proxy if specified if self.use_proxy and self.proxy: opts["proxy"] = {"server": self.proxy} # Launch browser with options self.browser = await self.playwright.chromium.launch(**opts) # Setup browser context with enhanced settings context_opts = { "user_agent": get_random_user_agent(), "viewport": {"width": 1920, "height": 1080}, "device_scale_factor": 1, "has_touch": False, "is_mobile": False, "ignore_https_errors": True, "accept_downloads": True } # Apply stealth-specific settings to the context if self.use_stealth: # Apply JS-injection for enhanced stealth context_opts["bypass_csp"] = True self.context = await self.browser.new_context(**context_opts) # Execute stealth JS to avoid detection await self.context.add_init_script(""" () => { Object.defineProperty(navigator, 'webdriver', { get: () => false, }); // Change navigator properties const newProto = navigator.__proto__; delete newProto.webdriver; // Overwrite the plugins Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5].map(() => ({ lengthComputable: true, loaded: 100, total: 100 })) }); // Handle languages more naturally Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en', 'es'] }); // Modify hardware concurrency Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 }); // Modify deviceMemory Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 }); // WebGL modifications const getParameter = WebGLRenderingContext.prototype.getParameter; 
WebGLRenderingContext.prototype.getParameter = function(parameter) { if (parameter === 37445) { return 'Intel Inc.'; } if (parameter === 37446) { return 'Intel Iris OpenGL Engine'; } return getParameter.apply(this, arguments); }; } """) else: # Regular context without stealth self.context = await self.browser.new_context(**context_opts) # Create page with enhanced headers self.page = await self.context.new_page() await self.page.set_extra_http_headers({ 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Cache-Control': 'max-age=0', 'DNT': '1', # Do Not Track 'Referer': 'https://www.google.com/', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-User': '?1', 'Upgrade-Insecure-Requests': '1' }) # Add delay for mouse movements to simulate human behavior if self.use_stealth: await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) await self.page.wait_for_timeout(random.randint(200, 500)) return self async def __aexit__(self, exc_type, exc_val, exc_tb): if self.browser: await self.browser.close() if self.playwright: await self.playwright.stop() async def rotate_proxy_if_needed(self): """Rotate proxy if proxy rotation is enabled and threshold is reached""" if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: self.request_count += 1 if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: # Get next proxy from the pool next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list # Close existing context and create new one with the new proxy if self.context: await self.context.close() # Create new context with the new proxy context_opts = { "user_agent": get_random_user_agent(), "proxy": {"server": next_proxy}, "accept_downloads": True } self.context = await self.browser.new_context(**context_opts) self.page = await self.context.new_page() # Reset counter self.request_count = 0 logger.info(f"Rotated to new proxy: {next_proxy}") async def handle_captcha(self, page): """Detect and handle captchas if possible""" # Check for common captcha patterns content = await page.content() if detect_captcha(content): self.captcha_detected = True logger.warning("Captcha detected on page") # Strategies for handling captchas: # 1. For simple captchas, try to extract the image and solve it captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') if captcha_img: logger.info("Found captcha image, attempting to capture") # Take screenshot of the captcha captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") await captcha_img.screenshot(path=captcha_path) # In a real implementation, you would send this to a captcha solving service # For now, just log the detection logger.info(f"Captcha image saved to {captcha_path}") # For demonstration, we'll notify the user but not actually solve it return False # 2. For reCAPTCHA, special handling would be required recaptcha = await page.query_selector('iframe[src*="recaptcha"]') if recaptcha: logger.warning("reCAPTCHA detected, would require external solving service") return False # 3. 
Try to perform human-like actions that might bypass simple bot checks await self.perform_human_actions(page) # Check if captcha is still present content = await page.content() if detect_captcha(content): logger.warning("Captcha still present after human-like actions") return False else: logger.info("Captcha appears to be resolved") return True return True # No captcha detected async def perform_human_actions(self, page): """Perform human-like actions on the page to possibly bypass simple bot checks""" try: # 1. Slowly scroll down the page for i in range(3): await page.evaluate(f"window.scrollTo(0, {i * 300})") await page.wait_for_timeout(random.randint(300, 700)) # 2. Random mouse movements for _ in range(3): x = random.randint(100, 800) y = random.randint(100, 600) await page.mouse.move(x=x, y=y) await page.wait_for_timeout(random.randint(200, 500)) # 3. Click on a non-essential part of the page try: await page.click("body", position={"x": 50, "y": 50}) except: pass # 4. Wait a bit before continuing await page.wait_for_timeout(1000) except Exception as e: logger.warning(f"Error during human-like actions: {e}") async def search_bing(self): urls = [] try: # Rotate proxy if needed await self.rotate_proxy_if_needed() search_url = f"https://www.bing.com/search?q={self.query}" await self.page.goto(search_url, timeout=30000) await self.page.wait_for_load_state('networkidle') # Check for captchas if not await self.handle_captcha(self.page): logger.warning("Captcha detected during search, results may be limited") # More natural scrolling behavior for i in range(3): await self.page.evaluate(f"window.scrollTo(0, {i * 400})") await self.page.wait_for_timeout(random.randint(300, 800)) # Extract search results links = await self.page.query_selector_all("li.b_algo h2 a") for link in links[:self.num_results]: href = await link.get_attribute('href') if href: urls.append(href) # If we didn't find enough results, try an alternative selector if len(urls) < self.num_results: alt_links = await self.page.query_selector_all(".b_caption a") for link in alt_links: href = await link.get_attribute('href') if href and href not in urls: urls.append(href) if len(urls) >= self.num_results: break return urls except Exception as e: logger.error(f"Error searching Bing: {e}") return [] async def get_file_size(self, url): try: await self.rotate_proxy_if_needed() # For complex download URLs, we need to be careful with HEAD requests if '?' 
in url or 'Action=downloadfile' in url or 'fname=' in url: # For these URLs, we'll try a more reliable approach using range headers headers = { 'User-Agent': get_random_user_agent(), 'Range': 'bytes=0-0' # Just request the first byte to check headers } try: with requests.get(url, headers=headers, stream=True, timeout=10) as r: if 'Content-Range' in r.headers: content_range = r.headers['Content-Range'] match = re.search(r'bytes 0-0/(\d+)', content_range) if match: size = int(match.group(1)) return sizeof_fmt(size) if 'Content-Length' in r.headers: size = int(r.headers['Content-Length']) # If size is 1, it's likely just our single requested byte if size > 1: return sizeof_fmt(size) except Exception as e: logger.warning(f"Error getting file size with Range request: {e}") # Fallback to browser approach try: async with self.context.new_page() as page: response = await page.request.head(url, timeout=15000) length = response.headers.get('Content-Length', None) if length: return sizeof_fmt(int(length)) except Exception as e: logger.warning(f"Error getting file size with browser: {e}") return "Unknown Size" else: # Standard approach for normal URLs async with self.context.new_page() as page: response = await page.request.head(url, timeout=15000) length = response.headers.get('Content-Length', None) if length: return sizeof_fmt(int(length)) else: return "Unknown Size" except Exception as e: logger.warning(f"Error getting file size: {e}") return "Unknown Size" async def get_pdf_metadata(self, url): try: await self.rotate_proxy_if_needed() async with self.context.new_page() as page: resp = await page.request.get(url, timeout=15000) if resp.ok: content = await resp.body() pdf = BytesIO(content) reader = PyPDF2.PdfReader(pdf) return { 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', 'Pages': len(reader.pages), } else: return {} except Exception as e: logger.warning(f"Error reading PDF metadata: {e}") return {} async def extract_real_download_url(self, url): """Enhanced method to extract real download URL, handling complex URLs""" try: # Check if this is a complex download URL that needs special handling if 'Action=downloadfile' in url or 'fname=' in url: logger.info(f"Complex download URL detected: {url}") # For these special cases, we'll use the browser to navigate and intercept redirects await self.rotate_proxy_if_needed() async with self.context.new_page() as page: # Set up request interception to capture redirects await page.route('**', lambda route: route.continue_()) # Listen for all responses responses = [] page.on('response', lambda response: responses.append(response)) try: # Go to the URL await page.goto(url, wait_until='networkidle', timeout=30000) # Check all responses for potential downloads for response in responses: # Look for content-disposition headers indicating a download content_disposition = response.headers.get('Content-Disposition', '') if 'attachment' in content_disposition or 'filename=' in content_disposition: return response.url # Look for content-type headers indicating a file content_type = response.headers.get('Content-Type', '') if content_type and content_type != 'text/html' and not content_type.startswith('text/'): return response.url # If no clear download was detected, return the final URL return page.url except Exception as e: logger.warning(f"Error extracting real download URL: {e}") return url else: # Standard approach for normal URLs await 
self.rotate_proxy_if_needed() async with self.context.new_page() as page: response = await page.goto(url, wait_until='networkidle', timeout=30000) if response and response.headers.get('location'): return response.headers['location'] return page.url except Exception as e: logger.error(f"Error extracting real download URL: {e}") return url # IMPROVED: Enhanced exam links extraction method async def get_edu_exam_links(self, url): """Specialized method for educational exam websites that follows a common pattern.""" try: logger.info(f"Fetching exam links from {url}") links = set() # First try with direct requests for speed (but with proper headers) headers = { "User-Agent": get_random_user_agent(), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9", "Referer": "https://www.google.com/", "DNT": "1" } try: response = requests.get(url, headers=headers, timeout=30) if response.status_code == 200: # Parse with BeautifulSoup first for efficiency soup = BeautifulSoup(response.text, "html.parser") parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" # Look for all links for a in soup.find_all("a", href=True): href = a["href"] full_url = urljoin(url, href) # Look for text clues link_text = a.get_text().lower() # Special patterns for exam sites (expanded list) url_patterns = [ "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", "/test/", "/download/", "/files/", "/assignments/", "paper_", "question_", "exam_", "test_", "past_", "assignment_", "sample_", "study_material", "notes_", "/resource/", "/subject/", "/course/", "/material/" ] text_patterns = [ "exam", "paper", "test", "question", "past", "download", "assignment", "sample", "study", "material", "notes", "subject", "course", "resource", "pdf", "document", "view", "open", "get", "solution", "answer" ] # Check URL for patterns if any(pattern in full_url.lower() for pattern in url_patterns): links.add(full_url) continue # Check link text for patterns if any(pattern in link_text for pattern in text_patterns): links.add(full_url) continue # Check for common file extensions if any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): links.add(full_url) # Check for download script parameters if "Action=downloadfile" in url or "fname=" in url: links.add(url) # Add the URL itself as it's a download link except Exception as e: logger.warning(f"Request-based extraction failed: {e}") # Browser-based approach for more thorough extraction or if initial approach was inadequate try: # Check if we need to proceed with browser-based extraction if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: logger.info("Using browser for enhanced link extraction") # Rotate proxy if needed await self.rotate_proxy_if_needed() # Navigate to the page with more natural timing await self.page.goto(url, timeout=45000, wait_until='networkidle') await self.page.wait_for_timeout(random.randint(1000, 2000)) # Handle captchas if present if not await self.handle_captcha(self.page): logger.warning("Captcha detected, extraction may be limited") # Get base URL for resolving relative links parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" # Perform natural scrolling to trigger lazy-loaded content page_height = await self.page.evaluate("document.body.scrollHeight") viewport_height = await self.page.evaluate("window.innerHeight") for scroll_pos in range(0, 
page_height, viewport_height // 2): await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") await self.page.wait_for_timeout(random.randint(300, 800)) # Scroll back to top await self.page.evaluate("window.scrollTo(0, 0)") await self.page.wait_for_timeout(500) # Extract all links with Playwright (better than just anchor tags) all_links = await self.page.evaluate(""" () => { const results = []; // Get all anchor tags const anchors = document.querySelectorAll('a[href]'); for (const a of anchors) { if (a.href) { results.push({ href: a.href, text: a.innerText || a.textContent || '', isButton: a.classList.contains('btn') || a.role === 'button' }); } } // Get buttons that might contain links const buttons = document.querySelectorAll('button'); for (const btn of buttons) { const onclick = btn.getAttribute('onclick') || ''; if (onclick.includes('window.location') || onclick.includes('download')) { results.push({ href: '#button', text: btn.innerText || btn.textContent || '', isButton: true, onclick: onclick }); } } return results; } """) # Process the extracted links for link_info in all_links: href = link_info.get('href', '') text = link_info.get('text', '').lower() if href and href != '#button': # Check URL patterns url_patterns = [ "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", "/test/", "/download/", "/files/", "/assignments/", "paper_", "question_", "exam_", "test_", "past_", "assignment_", "sample_", "study_material", "notes_" ] # Check text patterns text_patterns = [ "exam", "paper", "test", "question", "past", "download", "assignment", "sample", "study", "material", "notes", "pdf", "document", "view", "open", "solution" ] if any(pattern in href.lower() for pattern in url_patterns) or \ any(pattern in text for pattern in text_patterns) or \ any(href.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): links.add(href) # Check for download links in the page download_links = await self.page.evaluate(""" () => { // Find all links that might be download links const links = Array.from(document.querySelectorAll('a[href]')); return links .filter(a => { const href = a.href.toLowerCase(); return href.includes('download') || href.includes('getfile') || href.includes('view.php') || href.includes('action=downloadfile') || href.includes('fname='); }) .map(a => a.href); } """) for dl_link in download_links: links.add(dl_link) # Check for ASP.NET specific elements that might contain exam links grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') for grid in grid_elements: grid_links = await grid.query_selector_all('a[href]') for a in grid_links: href = await a.get_attribute('href') text = await a.text_content() if href: full_url = href if href.startswith('http') else urljoin(url, href) links.add(full_url) # Try clicking pagination controls to reveal more content pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons try: # Check if this is a numeric pagination button (more likely to be useful) button_text = await button.text_content() if button_text and button_text.strip().isdigit(): logger.info(f"Clicking pagination button: {button_text}") await button.click() await self.page.wait_for_timeout(2000) await self.page.wait_for_load_state('networkidle', timeout=10000) # Extract links from this page new_page_links = await self.page.evaluate(""" () => 
{ return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); } """) for href in new_page_links: if href and not href.startswith('javascript:'): if any(pattern in href.lower() for pattern in url_patterns) or \ any(href.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): links.add(href) except Exception as e: logger.warning(f"Error clicking pagination button: {e}") # Try clicking any controls that might reveal more exam links (more focused approach) show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') for button in show_buttons: button_text = (await button.text_content() or "").lower() button_value = (await button.get_attribute("value") or "").lower() button_id = (await button.get_attribute("id") or "").lower() # Look for buttons that seem likely to reveal file lists promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", "download", "resource", "material", "browse", "file"] if any(term in button_text or term in button_value or term in button_id for term in promising_terms): try: logger.info(f"Clicking button: {button_text or button_value}") await button.click() await self.page.wait_for_timeout(2000) await self.page.wait_for_load_state('networkidle', timeout=10000) # Get any new links that appeared new_links = await self.page.query_selector_all('a[href]') for a in new_links: href = await a.get_attribute('href') if href: full_url = href if href.startswith('http') else urljoin(url, href) # Focus on file extensions and patterns if any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ any(pattern in full_url.lower() for pattern in url_patterns): links.add(full_url) except Exception as e: logger.warning(f"Error clicking button: {e}") # Special handling for ASP.NET PostBack links try: # Find and interact with ASP.NET __doPostBack elements postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks try: onclick = await element.get_attribute('onclick') if onclick and '__doPostBack' in onclick: element_text = await element.text_content() # Only interact with elements that seem likely to contain exam links promising_terms = ["show", "view", "list", "exam", "paper", "test", "download", "resource", "material"] if any(term in element_text.lower() for term in promising_terms): logger.info(f"Clicking ASP.NET postback element: {element_text}") # Click the element await element.click() await self.page.wait_for_timeout(2000) await self.page.wait_for_load_state('networkidle', timeout=10000) # Extract any new links new_links = await self.page.query_selector_all('a[href]') for a in new_links: href = await a.get_attribute('href') if href: full_url = href if href.startswith('http') else urljoin(url, href) if any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): links.add(full_url) except Exception as e: logger.warning(f"Error interacting with postback element: {e}") except Exception as e: logger.warning(f"Error during postback handling: {e}") except Exception as e: logger.error(f"Browser-based extraction failed: {e}") # Filter links to likely contain exam documents filtered_links = [] for link in links: # Common file extensions for exam documents if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): 
filtered_links.append(link) continue # Common paths for exam documents if any(pattern in link.lower() for pattern in [ "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", "/resource/", "/material/", "/notes/", "/subjectmaterial/" ]): filtered_links.append(link) continue # Check for download links (these may not have obvious extensions) if is_download_link(link): filtered_links.append(link) logger.info(f"Found {len(filtered_links)} potential exam document links") return filtered_links except Exception as e: logger.error(f"Error getting exam links: {e}") return [] async def discover_hidden_links(self, page): """Discover hidden links that might be in JavaScript, iframes, or dynamic content""" hidden_links = set() # Execute JavaScript to find links in script tags and data attributes js_links = await page.evaluate(""" () => { const links = new Set(); // Extract URLs from script tags const scripts = document.querySelectorAll('script'); for (const script of scripts) { const content = script.textContent || ''; const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; for (let match of urlMatches) { links.add(match.replace(/["']/g, '')); } } // Look for download-related variables in scripts for (const script of scripts) { const content = script.textContent || ''; // Look for common patterns for file URLs in JavaScript if (content.includes('downloadURL') || content.includes('fileURL') || content.includes('pdfURL') || content.includes('documentURL')) { // Extract potential URLs const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; for (let match of potentialUrls) { const url = match.replace(/["']/g, ''); // Try to resolve relative URLs if (url.startsWith('/') || !url.includes('://')) { if (url.startsWith('/')) { links.add(window.location.origin + url); } else { // Handle relative paths more carefully const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); links.add(base + url); } } else if (url.startsWith('http')) { links.add(url); } } } } // Check for links in data attributes const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); for (const el of elements) { for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { const val = el.getAttribute(attr); if (val) { // Try to resolve relative URLs if (val.startsWith('/')) { links.add(window.location.origin + val); } else if (val.startsWith('http')) { links.add(val); } else if (!val.startsWith('javascript:') && !val.startsWith('#')) { // Handle relative paths const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); links.add(base + val); } } } } // Look for URLs in inline event handlers const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); for (const el of clickableElements) { for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { const val = el.getAttribute(attr); if (val) { // Check for JavaScript URLs with window.location if (val.includes('window.location') || val.includes('document.location')) { const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); if (urlMatch && urlMatch[1]) { const url = urlMatch[1]; if (url.startsWith('/')) { links.add(window.location.origin + url); } else if (url.startsWith('http')) { links.add(url); } else if 
(!url.startsWith('javascript:') && !url.startsWith('#')) { const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); links.add(base + url); } } } // Check for direct URLs in attributes const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; for (let match of urlMatches) { links.add(match.replace(/["']/g, '')); } // Check for download.php and similar patterns if (val.includes('download.php') || val.includes('getfile.php') || val.includes('Action=downloadfile') || val.includes('viewfile.php')) { // Handle both onclick handlers and direct hrefs let url = ''; if (attr === 'href') { url = val; } else { // Extract URL from JavaScript const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); if (jsUrlMatch) { url = jsUrlMatch[1]; } } // Resolve URL if needed if (url) { if (url.startsWith('/')) { links.add(window.location.origin + url); } else if (url.startsWith('http')) { links.add(url); } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); links.add(base + url); } } } } } } // Find PHP/ASP file download links const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); for (const link of fileLinks) { links.add(link.href); } return Array.from(links); } """) for link in js_links: hidden_links.add(link) # Extract links from iframes iframes = await page.query_selector_all('iframe') for iframe in iframes: try: frame = await iframe.content_frame() if frame: iframe_links = await frame.evaluate(""" () => { return Array.from(document.querySelectorAll('a[href]')) .map(a => a.href) .filter(href => href.startsWith('http')); } """) for link in iframe_links: hidden_links.add(link) except Exception as e: logger.warning(f"Could not extract links from iframe: {e}") # Look for links in shadow DOM (used in modern web components) shadow_links = await page.evaluate(""" () => { const links = new Set(); // Helper function to recursively process shadow roots function processShadowRoot(root) { if (!root) return; // Get links in this shadow root const shadowLinks = root.querySelectorAll('a[href]'); for (const link of shadowLinks) { if (link.href && link.href.startsWith('http')) { links.add(link.href); } } // Process nested shadow roots const elements = root.querySelectorAll('*'); for (const el of elements) { if (el.shadowRoot) { processShadowRoot(el.shadowRoot); } } } // Find all shadow roots in the document const elements = document.querySelectorAll('*'); for (const el of elements) { if (el.shadowRoot) { processShadowRoot(el.shadowRoot); } } return Array.from(links); } """) for link in shadow_links: hidden_links.add(link) # Look for download links in forms form_links = await page.evaluate(""" () => { const links = new Set(); // Check for form actions that might be download endpoints const forms = document.querySelectorAll('form'); for (const form of forms) { const action = form.action || ''; if (action && ( action.includes('download') || action.includes('getfile') || action.includes('viewfile') || action.includes('Action=downloadfile') )) { // Collect input values that might be needed for the download const inputs = {}; const formInputs = form.querySelectorAll('input[name]'); for (const input of formInputs) { inputs[input.name] = input.value; } // Store both the form action and any important 
inputs links.add(action); } } return Array.from(links); } """) for link in form_links: hidden_links.add(link) return hidden_links async def extract_downloadable_files(self, url, custom_ext_list): found_files = [] try: # Normalize the URL to handle special cases normalized_url = normalize_download_url(url) # Skip if we've already visited this URL if normalized_url in self.visited_urls: logger.info(f"Skipping already visited URL: {normalized_url}") return [] # Mark this URL as visited self.visited_urls.add(normalized_url) # Rotate proxy if needed await self.rotate_proxy_if_needed() # First check if this is a direct download link (Action=downloadfile or fname parameter) if is_download_link(normalized_url): logger.info(f"Processing potential direct download link: {normalized_url}") # Try to extract the real download URL if needed real_url = await self.extract_real_download_url(normalized_url) # Determine filename - for complex URLs this can be tricky filename = os.path.basename(urlparse(real_url).path) # Handle URL-encoded filenames if '%' in filename: try: filename = unquote(filename) except Exception: pass # For URLs with download parameters, try to extract filename from query if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): # Look for file parameter params = parse_qs(urlparse(normalized_url).query) # Check common filename parameters for param in ['file', 'filename', 'name', 'fname', 'f']: if param in params and params[param]: potential_filename = params[param][0] if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: filename = os.path.basename(potential_filename) break # If still no valid filename, use domain-based fallback if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): domain = get_domain(real_url) # Try to determine file type from content-type or extension hints in URL ext = '.pdf' # Default for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: if common_ext in normalized_url.lower(): ext = common_ext break filename = f"file_from_{domain}{ext}" # Get file size size_str = await self.get_file_size(real_url) # Add to found files found_files.append({ 'url': real_url, 'filename': filename, 'size': size_str, 'metadata': {}, 'download_url': normalized_url # Keep original URL for downloading }) # For direct download links, we can return early if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): return found_files # Special handling for educational exam sites if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in ["exam", "test", "pastpaper", "eduexp"]): logger.info("Using specialized handler for educational exam site") # Get direct links to exam files exam_links = await self.get_edu_exam_links(url) for link in exam_links: # Try to resolve any redirection real_url = await self.extract_real_download_url(link) filename = os.path.basename(urlparse(real_url).path) # If filename is URL encoded (common with Chinese/international sites) if '%' in filename: try: filename = unquote(filename) except Exception: pass # If filename is empty or invalid, create a sensible one if not filename or filename == '/': domain = get_domain(real_url) ext = '.pdf' # Default for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: if common_ext in link.lower(): ext = common_ext break filename = f"file_from_{domain}{ext}" # Get file size size_str = await 
self.get_file_size(real_url) # Get metadata for PDFs meta = {} if real_url.lower().endswith('.pdf'): try: meta = await self.get_pdf_metadata(real_url) except Exception: pass found_files.append({ 'url': real_url, 'filename': filename, 'size': size_str, 'metadata': meta, 'download_url': link # Store original link for downloading }) # If we found exam files with the specialized method, return them if found_files: return found_files # Standard extraction method if specialized method didn't find files response = await self.page.goto(url, timeout=30000, wait_until='networkidle') if not response: return [] # Check for captchas if not await self.handle_captcha(self.page): logger.warning("Captcha detected, file extraction may be limited") # Scroll through the page naturally to trigger lazy loading await self.page.evaluate(""" (async () => { const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); const height = document.body.scrollHeight; const scrollStep = Math.floor(window.innerHeight / 2); for (let i = 0; i < height; i += scrollStep) { window.scrollTo(0, i); await delay(100); } window.scrollTo(0, 0); })() """) await self.page.wait_for_timeout(1000) final_url = self.page.url if '.php' in final_url or 'download' in final_url: real_url = await self.extract_real_download_url(final_url) if real_url != final_url: # Try to detect the filename from headers or URL response = await self.page.request.head(real_url, timeout=15000) filename = None # Try to get from Content-Disposition header content_disposition = response.headers.get('Content-Disposition', '') if 'filename=' in content_disposition: filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) if filename_match: filename = filename_match.group(1) # If not found in headers, use URL basename if not filename: filename = os.path.basename(urlparse(real_url).path) if not filename or filename == '/': # Generate a name based on domain domain = get_domain(real_url) ext = '.pdf' # Default for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: if common_ext in real_url.lower(): ext = common_ext break filename = f"file_from_{domain}{ext}" found_files.append({ 'url': real_url, 'filename': filename, 'size': await self.get_file_size(real_url), 'metadata': {}, 'download_url': final_url # Keep original URL for downloading }) return found_files await self.page.wait_for_load_state('networkidle', timeout=30000) content = await self.page.content() soup = BeautifulSoup(content, 'html.parser') default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.pptx', '.odt', '.txt'] all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) parsed_base = urlparse(final_url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" path_base = os.path.dirname(parsed_base.path) # Process all anchor tags for a in soup.find_all('a', href=True): href = a['href'].strip() if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) real_url = await self.extract_real_download_url(full_url) if real_url and real_url != full_url: found_files.append({ 'url': real_url, 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', 'size': await self.get_file_size(real_url), 'metadata': {}, 'download_url': full_url # Original URL for download }) continue if 
any(href.lower().endswith(ext) for ext in all_exts): file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) size_str = await self.get_file_size(file_url) meta = {} if file_url.lower().endswith('.pdf'): meta = await self.get_pdf_metadata(file_url) found_files.append({ 'url': file_url, 'filename': os.path.basename(file_url.split('?')[0]), 'size': size_str, 'metadata': meta, 'download_url': file_url # Same as URL for direct links }) # Handle Google Drive links elif ("drive.google.com" in href) or ("docs.google.com" in href): file_id = None for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: match = re.search(pattern, href) if match: file_id = match.group(1) break if file_id: # Get file info to determine type and view-only status file_type, is_view_only = await self.get_google_drive_file_info(file_id) # Create a more informative filename based on info filename = f"gdrive_{file_id}" if file_type: filename = f"{filename}.{file_type}" size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") found_files.append({ 'url': href, # Use original URL 'filename': filename, 'size': size_str, 'metadata': { 'view_only': is_view_only, 'file_type': file_type, 'file_id': file_id }, 'download_url': href # Same as URL for Google Drive }) # Also check for files in other elements (iframe, embed, object, etc.) other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) for elem in other_elements: src = elem.get('src') or elem.get('data') if src and any(src.lower().endswith(ext) for ext in all_exts): file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) size_str = await self.get_file_size(file_url) meta = {} if file_url.lower().endswith('.pdf'): meta = await self.get_pdf_metadata(file_url) found_files.append({ 'url': file_url, 'filename': os.path.basename(file_url.split('?')[0]), 'size': size_str, 'metadata': meta, 'download_url': file_url }) # Check for file links in onclick attributes onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') for elem in onclick_elements: onclick = await elem.get_attribute('onclick') urls = re.findall(r'(https?://[^\'"]+)', onclick) for url_match in urls: if any(url_match.lower().endswith(ext) for ext in all_exts): size_str = await self.get_file_size(url_match) meta = {} if url_match.lower().endswith('.pdf'): meta = await self.get_pdf_metadata(url_match) found_files.append({ 'url': url_match, 'filename': os.path.basename(url_match.split('?')[0]), 'size': size_str, 'metadata': meta, 'download_url': url_match }) # Also check for data-src and data-url attributes (common in lazy-loaded sites) data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') for elem in data_elements: for attr in ['data-src', 'data-url', 'data-href', 'data-download']: try: value = await elem.get_attribute(attr) if value and any(value.lower().endswith(ext) for ext in all_exts): file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) found_files.append({ 'url': file_url, 'filename': os.path.basename(file_url.split('?')[0]), 'size': await self.get_file_size(file_url), 'metadata': {}, 'download_url': file_url }) except: pass # Check script tags for JSON data that might contain file URLs script_elements = soup.find_all('script', type='application/json') for script in script_elements: try: 
json_data = json.loads(script.string) # Look for URL patterns in the JSON data def extract_urls_from_json(obj, urls_found=None): if urls_found is None: urls_found = [] if isinstance(obj, dict): for k, v in obj.items(): # Check if any key contains url-like terms url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): urls_found.append(v) else: extract_urls_from_json(v, urls_found) elif isinstance(obj, list): for item in obj: extract_urls_from_json(item, urls_found) return urls_found json_urls = extract_urls_from_json(json_data) for json_url in json_urls: if any(json_url.lower().endswith(ext) for ext in all_exts): found_files.append({ 'url': json_url, 'filename': os.path.basename(json_url.split('?')[0]), 'size': await self.get_file_size(json_url), 'metadata': {}, 'download_url': json_url }) except: pass # Check for hidden download buttons or forms hidden_elements = await self.page.evaluate(""" () => { const results = []; // Check for hidden forms with download actions const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); for (const form of forms) { const action = form.getAttribute('action') || ''; results.push({ type: 'form', action: action, inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { return {name: input.name, value: input.value}; }) }); } // Check for hidden download links/buttons const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { const style = window.getComputedStyle(a); return (style.display === 'none' || style.visibility === 'hidden') && (a.href.includes('download') || a.href.includes('file')); }); for (const link of hiddenLinks) { results.push({ type: 'link', href: link.href, text: link.innerText || link.textContent }); } return results; } """) # Process hidden elements for elem in hidden_elements: if elem['type'] == 'link' and 'href' in elem: href = elem['href'] if any(href.lower().endswith(ext) for ext in all_exts): found_files.append({ 'url': href, 'filename': os.path.basename(href.split('?')[0]), 'size': await self.get_file_size(href), 'metadata': {}, 'download_url': href }) # Check for hidden links that might be in JavaScript, iframes, or dynamic content hidden_links = await self.discover_hidden_links(self.page) for link in hidden_links: if any(link.lower().endswith(ext) for ext in all_exts): found_files.append({ 'url': link, 'filename': os.path.basename(link.split('?')[0]), 'size': await self.get_file_size(link), 'metadata': {}, 'download_url': link }) # Deduplicate files by URL seen_urls = set() unique_files = [] for f in found_files: if f['url'] not in seen_urls: seen_urls.add(f['url']) unique_files.append(f) return unique_files except Exception as e: logger.error(f"Error extracting files from {url}: {e}") traceback.print_exc() return [] async def download_file(self, file_info, save_dir, referer): file_url = file_info.get('download_url', file_info['url']) # Use download_url if available fname = file_info['filename'] path = os.path.join(save_dir, fname) base, ext = os.path.splitext(fname) counter = 1 while os.path.exists(path): path = os.path.join(save_dir, f"{base}_{counter}{ext}") counter += 1 os.makedirs(save_dir, exist_ok=True) # Check if we've already downloaded this file if file_url in self.downloaded_files: logger.info(f"File already downloaded: {file_url}") return None try: # Special handling for Google Drive files if "drive.google.com" in file_url or 
"docs.google.com" in file_url: # Check if it's marked as view-only in metadata is_view_only = file_info.get('metadata', {}).get('view_only', False) # For view-only files, try our most robust approach first if is_view_only: logger.info(f"Attempting to download view-only file: {file_url}") result_path = await self._force_download_viewonly(file_info, path) if result_path: self.downloaded_files.add(file_url) return result_path # If that failed, try the regular download approach logger.info("Primary method failed, trying fallback methods") # Try regular download methods success = await self._download_from_google_drive(file_url, path) if success: self.downloaded_files.add(file_url) return path # If all methods failed for Google Drive, try one last approach logger.warning("All standard methods failed, attempting force download") result_path = await self._force_download_viewonly(file_info, path) if result_path: self.downloaded_files.add(file_url) return result_path if result_path else None # Special handling for complex download URLs if 'Action=downloadfile' in file_url or 'fname=' in file_url: logger.info(f"Using browser download approach for complex URL: {file_url}") # For these URLs, we'll need to navigate to the page and handle the download await self.rotate_proxy_if_needed() async with self.context.new_page() as page: # Set up download event listener download_promise = page.wait_for_event("download") # Navigate to the URL await page.goto(file_url, timeout=60000) # Wait for the download to start try: download = await download_promise await download.save_as(path) if os.path.exists(path) and os.path.getsize(path) > 0: self.downloaded_files.add(file_url) return path except Exception as e: logger.error(f"Browser download failed: {e}") # If download didn't start automatically, try to find and click download buttons download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]') for button in download_buttons: try: await button.click() try: download = await download_promise await download.save_as(path) if os.path.exists(path) and os.path.getsize(path) > 0: self.downloaded_files.add(file_url) return path except: pass except: continue # If browser approach failed, try direct request as last resort logger.info("Browser approach failed, trying direct request") # Rotate proxy if needed await self.rotate_proxy_if_needed() # Try with direct requests first (faster) try: headers = { 'User-Agent': get_random_user_agent(), 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, br', 'Referer': referer, 'DNT': '1' } with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: if response.status_code == 200: # Check content type to verify it's not HTML/error page content_type = response.headers.get('Content-Type', '') if 'text/html' in content_type and not file_url.endswith('.html'): logger.warning(f"Received HTML instead of expected file: {file_url}") else: with open(path, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): if chunk: f.write(chunk) # Verify file was downloaded correctly if os.path.exists(path) and os.path.getsize(path) > 0: self.downloaded_files.add(file_url) return path except Exception as e: logger.warning(f"Direct download failed: {e}, trying browser approach") # Original code for non-Google Drive downloads using Playwright async with self.context.new_page() as page: headers = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, br', 'Referer': referer } # Try to download with timeout protection try: 
response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) if response.status == 200: content = await response.body() with open(path, 'wb') as f: f.write(content) if os.path.exists(path) and os.path.getsize(path) > 0: self.downloaded_files.add(file_url) return path else: logger.error(f"Download failed with status {response.status}: {file_url}") # Try to extract error information error_info = await response.text() logger.debug(f"Error response: {error_info[:200]}...") # Check if this might be a captcha or login issue if detect_captcha(error_info): logger.warning("Captcha detected during download") # For HF Spaces, we can't implement browser-based captcha solving here # Just log the issue for now except PlaywrightTimeoutError: logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") # Try an alternative approach - using the browser's download manager try: logger.info("Trying browser download manager approach") download_promise = page.wait_for_event("download") await page.goto(file_url, timeout=60000) # Wait for download to start (with timeout) download = await download_promise await download.save_as(path) if os.path.exists(path) and os.path.getsize(path) > 0: self.downloaded_files.add(file_url) return path except Exception as e: logger.error(f"Browser download manager approach failed: {e}") return None except Exception as e: logger.error(f"Error downloading {file_url}: {e}") return None # IMPROVED: Split force_download_viewonly into smaller methods async def _force_download_viewonly(self, file_info, save_path): """Main method to handle view-only files, now simplified""" # Extract the file ID file_id = self._extract_drive_file_id(file_info) if not file_id: logger.error("Could not extract file ID") return None # Get file type information file_type = file_info.get('metadata', {}).get('file_type', 'pdf') base, ext = os.path.splitext(save_path) if not ext: save_path = f"{base}.{file_type}" logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") # Create a stealth browser for handling the download browser = await self._create_stealth_browser() try: # Set up the browser page page = await browser.new_page() # Go to the file view page logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) await page.wait_for_load_state('networkidle') # Check for permission issues content = await page.content() if "the owner has not granted you permission to" in content: logger.warning("Permission denied error detected") return None # Wait for the page to stabilize await page.wait_for_timeout(random.randint(3000, 7000)) # Create temp directory for working files temp_dir = tempfile.mkdtemp() # Handle different file types if file_type.lower() == 'pdf': return await self._download_viewonly_pdf(page, file_id, save_path, temp_dir) else: return await self._download_viewonly_other(page, file_id, file_type, save_path, temp_dir) except Exception as e: logger.error(f"Error during force download: {e}") return None finally: await browser.close() def _extract_drive_file_id(self, file_info): """Extract Google Drive file ID from file info""" # Try to get file ID from metadata file_id = file_info.get('metadata', {}).get('file_id') if file_id: return file_id # If not in metadata, try to extract from URL url = file_info.get('url', '') for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: match = 
re.search(pattern, url) if match: return match.group(1) return None async def _create_stealth_browser(self): """Create a stealth browser instance for handling sensitive downloads""" browser_args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security', '--disable-features=IsolateOrigins,site-per-process', '--disable-site-isolation-trials', '--disable-blink-features=AutomationControlled' # Anti-detection ] browser = await self.playwright.chromium.launch( headless=True, args=browser_args ) # Use higher resolution for better quality context = await browser.new_context( viewport={'width': 1600, 'height': 1200}, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", device_scale_factor=2.0, accept_downloads=True # Critical for the download workflow ) # Add anti-detection script await context.add_init_script(""" () => { Object.defineProperty(navigator, 'webdriver', { get: () => false, }); // Change plugins Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5].map(() => ({ lengthComputable: true, loaded: 100, total: 100 })) }); // Handle languages Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en', 'es'] }); // Modify hardware concurrency Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 }); } """) return browser async def _download_viewonly_pdf(self, page, file_id, save_path, temp_dir): """Handle downloading view-only PDF files""" try: # Estimate number of pages estimated_pages = await page.evaluate(""" () => { // Method 1: Check page counter text const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { const text = el.textContent || ''; return /\\d+\\s*\\/\\s*\\d+/.test(text); }); if (pageCounters.length > 0) { const text = pageCounters[0].textContent || ''; const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); if (match && match[2]) return parseInt(match[2]); } // Method 2: Check actual page elements const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); if (pageElements.length > 0) return pageElements.length; // Method 3: Look for page thumbnails const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); if (thumbnails.length > 0) return thumbnails.length; // Fallback: conservative guess return 50; } """) logger.info(f"Estimated {estimated_pages} pages in PDF") # Initial scroll to trigger lazy loading logger.info("Initial scroll to bottom to trigger lazy loading...") await page.keyboard.press("End") await page.wait_for_timeout(3000) # Scroll page by page to ensure all pages are loaded logger.info("Scrolling page by page...") max_attempts = min(estimated_pages * 3, 300) attempt = 0 prev_blob_count = 0 while attempt < max_attempts: blob_count = await page.evaluate(""" Array.from(document.getElementsByTagName('img')) .filter(img => img.src.startsWith('blob:') && img.width > 100) .length """) logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): logger.info("All pages appear to be loaded.") break # Alternate between PageDown and End keys for more natural scrolling if attempt % 3 == 0: await page.keyboard.press("End") else: await page.keyboard.press("PageDown") # Randomized wait times await page.wait_for_timeout(random.randint(1500, 3000)) # Move mouse randomly to appear more human-like if attempt % 4 == 0: await 
page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) prev_blob_count = blob_count attempt += 1 # Extra wait to ensure everything is loaded await page.wait_for_timeout(5000) # Set up download event listener for the PDF download_promise = page.wait_for_event("download") # Use jsPDF to generate PDF from loaded pages logger.info("Generating PDF from loaded pages...") result = await page.evaluate(r''' (function() { return new Promise((resolve, reject) => { let script = document.createElement("script"); script.onload = function () { try { let pdf = new jsPDF(); let imgs = Array.from(document.getElementsByTagName("img")) .filter(img => img.src.startsWith('blob:') && img.width > 100) .sort((a, b) => { const rectA = a.getBoundingClientRect(); const rectB = b.getBoundingClientRect(); return rectA.top - rectB.top; }); console.log(`Found ${imgs.length} valid page images to add to PDF`); let added = 0; for (let i = 0; i < imgs.length; i++) { let img = imgs[i]; let canvas = document.createElement("canvas"); let ctx = canvas.getContext("2d"); canvas.width = img.width; canvas.height = img.height; ctx.drawImage(img, 0, 0, img.width, img.height); let imgData = canvas.toDataURL("image/jpeg", 1.0); if (added > 0) { pdf.addPage(); } pdf.addImage(imgData, 'JPEG', 0, 0); added++; } pdf.save("download.pdf"); resolve({success: true, pageCount: added}); } catch (error) { reject({success: false, error: error.toString()}); } }; script.onerror = function() { reject({success: false, error: "Failed to load jsPDF library"}); }; script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; document.body.appendChild(script); }); })(); ''') if not result.get('success', False): logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") # Try fallback approach - screenshot method logger.info("Trying fallback screenshot method...") return await self._pdf_screenshot_fallback(page, estimated_pages, save_path, temp_dir) logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") # Wait for the download and save it download = await download_promise await download.save_as(save_path) # Clean up temp directory try: os.rmdir(temp_dir) except: pass # Verify file exists and has content if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: logger.info(f"Successfully downloaded PDF to {save_path}") return save_path else: logger.error(f"Generated file is too small or missing: {save_path}") return None except Exception as e: logger.error(f"Error in PDF download: {e}") return None async def _pdf_screenshot_fallback(self, page, estimated_pages, save_path, temp_dir): """Fallback method using screenshots for PDF creation""" try: # Navigate back to the first page await page.evaluate(""" () => { // Find and click the "first page" button if available const buttons = Array.from(document.querySelectorAll('button')); const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); if (firstPageBtn) firstPageBtn.click(); } """) await page.wait_for_timeout(1000); # Create a PDF by taking screenshots of each page screenshots = [] current_page = 1 max_pages = estimated_pages # Create a PDF using the reportlab package while current_page <= max_pages: screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") # Try to find the current page element page_elem = await page.query_selector('.drive-viewer-paginated-page') if page_elem: await page_elem.screenshot(path=screenshot_path) else: # Fallback to full page screenshot await 
page.screenshot(path=screenshot_path) screenshots.append(screenshot_path) # Try to navigate to next page next_btn = await page.query_selector('button[aria-label="Next page"]') if next_btn: is_disabled = await next_btn.get_attribute('disabled') if is_disabled: logger.info(f"Reached end of document at page {current_page}") break await next_btn.click() await page.wait_for_timeout(1000) current_page += 1 else: break # Create PDF from screenshots if screenshots: first_img = Image.open(screenshots[0]) width, height = first_img.size c = canvas.Canvas(save_path, pagesize=(width, height)) for screenshot in screenshots: img = Image.open(screenshot) c.drawImage(screenshot, 0, 0, width, height) c.showPage() c.save() # Clean up screenshots for screenshot in screenshots: os.remove(screenshot) return save_path return None except Exception as e: logger.error(f"Error in screenshot fallback: {e}") return None async def _download_viewonly_other(self, page, file_id, file_type, save_path, temp_dir): """Handle downloading non-PDF view-only files""" try: # Take a screenshot of the file screenshot_path = os.path.join(temp_dir, "file.png") await page.screenshot(path=screenshot_path) if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: # For document types, try to export directly success = await self._export_google_doc(file_id, file_type, save_path) if success: os.remove(screenshot_path) return save_path # If export fails, fall back to screenshot logger.warning(f"Export failed, falling back to screenshot for {file_type}") # For other types or if export failed, save the screenshot with appropriate extension shutil.copy(screenshot_path, save_path) os.remove(screenshot_path) return save_path if os.path.exists(save_path) else None except Exception as e: logger.error(f"Error in non-PDF download: {e}") return None async def _download_from_google_drive(self, url, save_path): """Enhanced method to download from Google Drive with multiple fallback approaches""" # Extract the file ID from different URL formats file_id = self._extract_drive_file_id({"url": url}) if not file_id: logger.error(f"Could not extract file ID from URL: {url}") return False # Determine file type first (important for handling different file types) file_type, is_view_only = await self._get_google_drive_file_info(file_id) logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") base, ext = os.path.splitext(save_path) if not ext and file_type: # Add the correct extension if missing save_path = f"{base}.{file_type}" # For view-only files, use specialized approaches if is_view_only: # Approach 1: For PDFs, use the JS method if file_type == 'pdf': success = await self._download_viewonly_pdf_with_js(file_id, save_path) if success: return True # Approach 2: For Google Docs, Sheets, etc., use export API if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: success = await self._export_google_doc(file_id, file_type, save_path) if success: return True # Fallback to the main view-only method result_path = await self._force_download_viewonly({ 'url': url, 'metadata': {'file_id': file_id, 'file_type': file_type, 'view_only': True} }, save_path) return bool(result_path) # Try standard approaches for non-view-only files try: # Try direct download link first (fastest) direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" # Add anti-bot headers headers = { 'User-Agent': get_random_user_agent(), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 
'en-US,en;q=0.9', 'Referer': 'https://drive.google.com/', 'DNT': '1' } # Try with streaming to handle larger files with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: if r.status_code == 200: # Check if we got HTML instead of the file content_type = r.headers.get('Content-Type', '') if 'text/html' in content_type and not file_id.endswith('.html'): logger.warning("Received HTML instead of file, trying with session cookies") else: # Looks like we got the actual file with open(save_path, 'wb') as f: for chunk in r.iter_content(chunk_size=8192): if chunk: f.write(chunk) # Verify file exists and has content if os.path.exists(save_path) and os.path.getsize(save_path) > 0: logger.info("Direct download successful") return True # Try browser-based approach as last resort try: async with self.context.new_page() as page: # Visit the file view page first to get cookies await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) await page.wait_for_timeout(3000) # Set up download event listener download_promise = page.wait_for_event("download") # Try to trigger the download button click download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') if download_button: await download_button.click() # Wait for download to start try: download = await download_promise await download.save_as(save_path) return os.path.exists(save_path) and os.path.getsize(save_path) > 0 except Exception as e: logger.error(f"Error during browser download: {e}") return False else: # Try the export download URL await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) # Look for and click any download buttons or links download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') for elem in download_elements: try: await elem.click() # Wait a bit to see if download starts try: download = await download_promise await download.save_as(save_path) return os.path.exists(save_path) and os.path.getsize(save_path) > 0 except: pass except: continue except Exception as e: logger.error(f"Browser-based download attempt failed: {e}") logger.warning("All standard download methods failed") return False except Exception as e: logger.error(f"Error in Google Drive download: {e}") return False async def _download_viewonly_pdf_with_js(self, file_id, save_path): """Download view-only PDF using blob images and JS""" try: # Create a dedicated browser instance browser = await self._create_stealth_browser() page = await browser.new_page() try: # Navigate to the file with human-like behavior logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) await page.wait_for_load_state('networkidle') # Perform human-like interactions await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) await page.wait_for_timeout(random.randint(2000, 5000)) # Estimate the number of pages estimated_pages = await page.evaluate(""" () => { // Look for page counter in the interface const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { const text = el.textContent || ''; return /\\d+\\s*\\/\\s*\\d+/.test(text); }); if (pageCounters.length > 0) { const text = pageCounters[0].textContent || ''; const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); if (match && match[2]) return parseInt(match[2]); } // If we can't find a 
counter, check actual pages const pages = document.querySelectorAll('.drive-viewer-paginated-page'); if (pages.length > 0) return pages.length; // Default to a reasonable number if we can't determine return 50; } """) logger.info(f"Estimated number of pages: {estimated_pages}") # Initial scroll to trigger loading logger.info("Initial scroll to bottom to trigger lazy loading...") await page.keyboard.press("End") await page.wait_for_timeout(3000) # Scroll through document with variety to appear natural await self._natural_scroll_through_document(page, estimated_pages) # Set up download event listener download_promise = page.wait_for_event("download") # Use jsPDF to generate PDF from loaded pages logger.info("Generating PDF from loaded pages...") result = await page.evaluate(r''' (function() { return new Promise((resolve, reject) => { let script = document.createElement("script"); script.onload = function () { try { let pdf = new jsPDF(); let imgs = Array.from(document.getElementsByTagName("img")) .filter(img => img.src.startsWith('blob:') && img.width > 100) .sort((a, b) => { const rectA = a.getBoundingClientRect(); const rectB = b.getBoundingClientRect(); return rectA.top - rectB.top; }); console.log(`Found ${imgs.length} valid page images to add to PDF`); let added = 0; for (let i = 0; i < imgs.length; i++) { let img = imgs[i]; let canvas = document.createElement("canvas"); let ctx = canvas.getContext("2d"); canvas.width = img.width; canvas.height = img.height; ctx.drawImage(img, 0, 0, img.width, img.height); let imgData = canvas.toDataURL("image/jpeg", 1.0); if (added > 0) { pdf.addPage(); } pdf.addImage(imgData, 'JPEG', 0, 0); added++; } pdf.save("download.pdf"); resolve({success: true, pageCount: added}); } catch (error) { reject({success: false, error: error.toString()}); } }; script.onerror = function() { reject({success: false, error: "Failed to load jsPDF library"}); }; script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; document.body.appendChild(script); }); })(); ''') if not result.get('success'): logger.error(f"Error in PDF generation: {result.get('error')}") return False logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") # Wait for the download to complete and save the file download = await download_promise # Save the downloaded file to the specified path await download.save_as(save_path) logger.info(f"Successfully saved PDF to {save_path}") return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 finally: await browser.close() except Exception as e: logger.error(f"Error in viewonly PDF download process: {e}") return False async def _natural_scroll_through_document(self, page, estimated_pages): """Scroll through document in a natural way to load all pages""" logger.info("Scrolling through document to load all pages...") max_attempts = min(estimated_pages * 3, 300) attempt = 0 prev_blob_count = 0 consecutive_same_count = 0 while attempt < max_attempts: # Count blob images (which are the PDF pages) blob_count = await page.evaluate(""" Array.from(document.getElementsByTagName('img')) .filter(img => img.src.startsWith('blob:') && img.width > 100) .length """) logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") # Check if we've loaded all pages or if we're stuck if blob_count >= estimated_pages: logger.info(f"All {estimated_pages} pages appear to be loaded.") break if blob_count == prev_blob_count: consecutive_same_count += 1 if consecutive_same_count >= 5 and blob_count > 0: logger.info(f"No new 
pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") break else: consecutive_same_count = 0 # Mix up the scrolling approach for more human-like behavior scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) if scroll_action == "PageDown": await page.keyboard.press("PageDown") elif scroll_action == "End": await page.keyboard.press("End") elif scroll_action == "ArrowDown": # Press arrow down multiple times for _ in range(random.randint(5, 15)): await page.keyboard.press("ArrowDown") await page.wait_for_timeout(random.randint(50, 150)) else: # mouse # Scroll using mouse wheel current_y = random.randint(300, 700) await page.mouse.move(x=random.randint(300, 800), y=current_y) await page.mouse.wheel(0, random.randint(300, 800)) # Random wait between scrolls await page.wait_for_timeout(random.randint(1000, 3000)) prev_blob_count = blob_count attempt += 1 # Extra wait to ensure everything is fully loaded await page.wait_for_timeout(5000) async def _export_google_doc(self, file_id, file_type, save_path): """Export Google Docs/Sheets/Slides to downloadable formats""" try: # Map file types to export formats export_urls = { 'doc': f"https://docs.google.com/document/d/{file_id}/export?format=doc", 'docx': f"https://docs.google.com/document/d/{file_id}/export?format=docx", 'sheet': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", 'xlsx': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", 'ppt': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", 'pptx': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", 'pdf': f"https://docs.google.com/document/d/{file_id}/export?format=pdf" } export_url = export_urls.get(file_type, f"https://docs.google.com/document/d/{file_id}/export?format=pdf") async with self.context.new_page() as page: # Get cookies from the main view page first await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') # Now try the export response = await page.goto(export_url, wait_until='networkidle') if response.status == 200: content = await response.body() with open(save_path, 'wb') as f: f.write(content) return os.path.exists(save_path) and os.path.getsize(save_path) > 0 else: logger.warning(f"Export failed with status {response.status}") return False except Exception as e: logger.error(f"Error exporting Google Doc: {e}") return False async def _get_google_drive_file_info(self, file_id): """Get file type and view-only status from Google Drive""" file_type = None is_view_only = False try: async with self.context.new_page() as page: await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) # Check if view-only view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') is_view_only = view_only_text is not None # Check for Google Docs viewer gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') if gdocs_viewer: file_type = 'docx' elif gsheets_viewer: file_type = 'xlsx' elif gslides_viewer: file_type = 'pptx' else: # Check for PDF viewer pdf_viewer = await page.query_selector('embed[type="application/pdf"]') if pdf_viewer: file_type = 'pdf' else: # Check for image viewer img_viewer = await 
page.query_selector('img[src*="googleusercontent.com"]') if img_viewer: # Get image type from src img_src = await img_viewer.get_attribute('src') if 'jpg' in img_src or 'jpeg' in img_src: file_type = 'jpg' elif 'png' in img_src: file_type = 'png' else: file_type = 'jpg' # Default to jpg else: # Generic file type fallback file_type = 'pdf' # Default to PDF # If still no type, check filename if not file_type: title_element = await page.query_selector('div[role="heading"]') if title_element: title = await title_element.text_content() if title: ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) if ext_match: file_type = ext_match.group(1).lower() except Exception as e: logger.error(f"Error getting Google Drive file info: {e}") file_type = 'pdf' # Default to PDF if we can't determine return file_type, is_view_only # IMPROVED: Enhanced sublink extraction method async def get_sublinks(self, url, limit=10000): """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" links = set() try: logger.info(f"Fetching sublinks from: {url}") # Check if this is a direct download link if is_download_link(url): logger.info(f"URL appears to be a direct download link: {url}") links.add(url) return list(links)[:limit] # Skip if we've already visited this URL normalized_url = normalize_download_url(url) if normalized_url in self.visited_urls: logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") return list(links)[:limit] # Add to visited URLs self.visited_urls.add(normalized_url) # Special handling for educational sites like phsms.cloud.ncnu.edu.tw if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in ["exam", "test", "pastpaper", "eduexp"]): logger.info("Using specialized exam site sublink extraction") edu_links = await self.get_edu_exam_links(url) for link in edu_links: links.add(link) # If we found a good number of links with the specialized method, return them if len(links) > 5: logger.info(f"Found {len(links)} sublinks with specialized method") return list(links)[:limit] # Rotate proxy if needed await self.rotate_proxy_if_needed() # Standard sublink extraction for all sites try: await self.page.goto(url, timeout=30000, wait_until='networkidle') except Exception as e: logger.warning(f"Error navigating to URL for sublink extraction: {e}") # Continue with what we have, we'll try to extract links anyway # Get base URL for resolving relative links parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" path_base = os.path.dirname(parsed_base.path) # Perform initial scrolling to load lazy content await self.page.evaluate(""" async () => { const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); const height = document.body.scrollHeight; const step = Math.floor(window.innerHeight / 2); for (let i = 0; i < height; i += step) { window.scrollTo(0, i); await delay(150); } window.scrollTo(0, 0); } """) await self.page.wait_for_timeout(1000) # Check if page has ASP.NET elements which might need special handling is_aspnet = await self.page.evaluate(''' () => { return document.querySelector('form#aspnetForm') !== null || document.querySelector('input[name="__VIEWSTATE"]') !== null; } ''') if is_aspnet: logger.info("Detected ASP.NET page, using enhanced extraction method") # Try to interact with ASP.NET controls that might reveal more links # Look for dropdowns, buttons, and grid elements dropdowns = await self.page.query_selector_all('select') buttons = await 
self.page.query_selector_all('input[type="button"], input[type="submit"], button') # Try interacting with dropdowns first for dropdown in dropdowns: try: # Get all options options = await self.page.evaluate(''' (dropdown) => { return Array.from(dropdown.options).map(o => o.value); } ''', dropdown) # Try selecting each option for option in options: if option: await dropdown.select_option(value=option) await self.page.wait_for_timeout(1000) await self.page.wait_for_load_state('networkidle', timeout=5000) # Extract any new links that appeared await self.extract_all_link_types(links, base_url, path_base) except Exception as e: logger.warning(f"Error interacting with dropdown: {e}") # Try clicking buttons (but avoid dangerous ones like "delete") safe_buttons = [] for button in buttons: button_text = await button.text_content() or "" button_value = await button.get_attribute("value") or "" button_id = await button.get_attribute("id") or "" combined_text = (button_text + button_value + button_id).lower() # Skip potentially destructive buttons if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): continue # Prioritize buttons that might show more content if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): safe_buttons.append(button) # Click the safe buttons for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks try: await button.click() await self.page.wait_for_timeout(1000) await self.page.wait_for_load_state('networkidle', timeout=5000) # Extract any new links that appeared await self.extract_all_link_types(links, base_url, path_base) except Exception as e: logger.warning(f"Error clicking button: {e}") # Extract links from the initial page state await self.extract_all_link_types(links, base_url, path_base) # Look specifically for links inside grid/table views which are common in ASP.NET applications grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') for cell in grid_cells: try: href = await cell.get_attribute('href') if href: full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) links.add(full_url) except Exception as e: logger.warning(f"Error extracting grid link: {e}") # Extract links from onclick attributes and javascript:__doPostBack calls postback_links = await self.page.evaluate(''' () => { const results = []; // Find elements with onclick containing __doPostBack const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); for (const el of elements) { // Extract the postback target const onclick = el.getAttribute('onclick') || ''; const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); if (match && match[1]) { // Get the visible text to use as description const text = el.innerText || el.textContent || 'Link'; results.push({ id: match[1], text: text.trim() }); } } return results; } ''') # Try interacting with some of the postback links for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions try: logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") await self.page.evaluate(f''' () => {{ if (typeof __doPostBack === 'function') {{ __doPostBack('{postback["id"]}', ''); }} }} ''') await self.page.wait_for_timeout(1500) await self.page.wait_for_load_state('networkidle', timeout=5000) # Extract any new links that appeared await self.extract_all_link_types(links, base_url, path_base) except Exception as e: 
logger.warning(f"Error with postback: {e}") # Look for pagination controls and try to navigate through them pagination_elements = await self.page.query_selector_all( 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' ) # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops) for i in range(min(5, len(pagination_elements))): try: # Focus on elements that look like "next page" buttons el = pagination_elements[i] el_text = await el.text_content() or "" # Only click if this looks like a pagination control if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip(): logger.info(f"Clicking pagination control: {el_text}") await el.click() await self.page.wait_for_timeout(2000) await self.page.wait_for_load_state('networkidle', timeout=5000) # Get new links from this page await self.extract_all_link_types(links, base_url, path_base) except Exception as e: logger.warning(f"Error clicking pagination: {e}") # Check for hidden links that might be revealed by JavaScript hidden_links = await self.page.evaluate(""" () => { // Try to execute common JavaScript patterns that reveal hidden content try { // Common patterns used in websites to initially hide content const hiddenContainers = document.querySelectorAll( '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' ); // Attempt to make them visible hiddenContainers.forEach(el => { el.style.display = 'block'; el.style.visibility = 'visible'; el.classList.remove('hidden', 'hide'); }); // Return any newly visible links return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); } catch (e) { return []; } } """) # Add any newly discovered links for href in hidden_links: if href and not href.startswith('javascript:'): links.add(href) # Find all download links download_links = await self.page.evaluate(""" () => { return Array.from(document.querySelectorAll('a[href]')) .filter(a => { const href = a.href.toLowerCase(); return href.includes('download') || href.includes('file') || href.includes('get') || href.includes('view.php') || href.includes('action=') || href.includes('fname='); }) .map(a => a.href); } """) for download_link in download_links: links.add(download_link) # Also check for hidden links in JavaScript, iframes, or dynamic content js_links = await self.discover_hidden_links(self.page) for link in js_links: links.add(link) logger.info(f"Found {len(links)} sublinks") # Prioritize download links prioritized_links = [] normal_links = [] for link in links: if is_download_link(link): prioritized_links.append(link) else: normal_links.append(link) # Return prioritized links first, then normal links, up to the limit result = prioritized_links + normal_links return result[:limit] except Exception as e: logger.error(f"Error getting sublinks from {url}: {e}") return list(links)[:limit] # Return what we have so far async def extract_all_link_types(self, links_set, base_url, path_base): """Extract all types of links from the current page""" # Get all tag links a_links = await self.page.query_selector_all('a[href]') for a in a_links: try: href = await a.get_attribute('href') if href and not href.startswith('javascript:') and not href.startswith('#'): full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) links_set.add(full_url) except Exception: pass # Get iframe sources iframes = await self.page.query_selector_all('iframe[src]') for iframe in iframes: try: src = await iframe.get_attribute('src') if src and 
not src.startswith('javascript:') and not src.startswith('about:'): full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) links_set.add(full_url) except Exception: pass # Get links from onclick attributes that reference URLs onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') for el in onclick_elements: try: onclick = await el.get_attribute('onclick') urls = re.findall(r'(https?://[^\'"]+)', onclick) for url in urls: links_set.add(url) except Exception: pass # Look for URLs in data-* attributes data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') for el in data_elements: for attr in ['data-url', 'data-href', 'data-src']: try: value = await el.get_attribute(attr) if value and not value.startswith('javascript:'): full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) links_set.add(full_url) except Exception: pass # Look for special anchor links that might not have href attributes special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') for anchor in special_anchors: try: href = await anchor.get_attribute('href') if href and not href.startswith('javascript:') and not href.startswith('#'): full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) links_set.add(full_url) except Exception: pass # Extract links from JSON data embedded in the page script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') for script in script_elements: try: script_content = await script.text_content() if script_content: # Look for URLs in the JSON content urls = re.findall(r'(https?://[^\'"]+)', script_content) for url in urls: links_set.add(url) except Exception: pass def resolve_relative_url(self, relative_url, base_url, path_base): """Properly resolve relative URLs considering multiple formats""" if relative_url.startswith('/'): # Absolute path relative to domain return f"{base_url}{relative_url}" elif relative_url.startswith('./'): # Explicit relative path return f"{base_url}{path_base}/{relative_url[2:]}" elif relative_url.startswith('../'): # Parent directory parent_path = '/'.join(path_base.split('/')[:-1]) return f"{base_url}{parent_path}/{relative_url[3:]}" else: # Regular relative path return f"{base_url}{path_base}/{relative_url}" async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): """Perform a deep search for files at the URL and its sublinks""" import streamlit as st if not custom_ext_list: custom_ext_list = [] progress_text = st.empty() progress_bar = st.progress(0) file_count_text = st.empty() try: # Reset the visited URLs for a fresh deep search self.visited_urls = set() progress_text.text("🔍 Analyzing main page...") # Special handling for ASP.NET pages is_aspnet = False try: await self.page.goto(url, timeout=30000, wait_until='networkidle') is_aspnet = await self.page.evaluate(''' () => { return document.querySelector('form#aspnetForm') !== null || document.querySelector('input[name="__VIEWSTATE"]') !== null; } ''') except Exception: pass # Check if this URL is a direct download if is_download_link(url): progress_text.text("📥 URL appears to be a direct download. 
Processing...")
                # Try to extract file directly
                normalized_url = normalize_download_url(url)
                file_info = {
                    'url': normalized_url,
                    'download_url': normalized_url,
                    'filename': os.path.basename(urlparse(normalized_url).path) or 'download',
                    'size': 'Unknown Size',
                    'metadata': {}
                }

                # Add to visited URLs
                self.visited_urls.add(normalized_url)
                progress_bar.progress(1.0)
                return [file_info]

            # Extract files from main page
            progress_text.text("📄 Extracting files from main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")

            # Get sublinks with enhanced method
            progress_text.text("🔗 Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")

            # Always include files from the main page, regardless of sublinks
            all_files = main_files

            if not sublinks:
                progress_bar.progress(1.0)
                return all_files

            # Process each sublink
            for i, sublink in enumerate(sublinks, 1):
                progress = i / total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)
                try:
                    # Check if this is a direct download link
                    if is_download_link(sublink):
                        # For download links, just add the link directly
                        normalized_url = normalize_download_url(sublink)

                        # Skip if already visited
                        if normalized_url in self.visited_urls:
                            continue

                        # Mark as visited
                        self.visited_urls.add(normalized_url)

                        # Get file size if possible
                        size_str = await self.get_file_size(normalized_url)

                        # Get filename, with fallback to domain-based name
                        filename = os.path.basename(urlparse(normalized_url).path)
                        if not filename or filename == '/' or '?' in filename:
                            domain = get_domain(normalized_url)
                            ext = '.pdf'  # Default extension
                            for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']:
                                if common_ext in normalized_url.lower():
                                    ext = common_ext
                                    break
                            filename = f"file_from_{domain}{ext}"

                        # Add file to results
                        all_files.append({
                            'url': normalized_url,
                            'download_url': normalized_url,
                            'filename': filename,
                            'size': size_str,
                            'metadata': {}
                        })
                        file_count_text.text(f"Found {len(all_files)} total files")
                        continue

                    # For regular links, use a longer timeout for ASP.NET pages which can be slower
                    sub_timeout = timeout * 2 if is_aspnet else timeout

                    # Skip already visited URLs
                    if sublink in self.visited_urls:
                        continue

                    # Extract files from sublink
                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                    all_files.extend(sub_files)
                    file_count_text.text(f"Found {len(all_files)} total files")
                except Exception as e:
                    logger.warning(f"Error processing sublink {sublink}: {e}")

            # Deduplicate files
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

            final_count = len(unique_files)
            progress_text.text("✅ Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)
            return unique_files
        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"⚠️ Error during deep search: {str(e)}")
            return []
        finally:
            await asyncio.sleep(2)
            if not st.session_state.get('keep_progress', False):
                progress_text.empty()
                progress_bar.empty()
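    # Editor sketch (not part of the original flow): the download-capture pattern used
    # above stores the coroutine from page.wait_for_event("download") and awaits it only
    # after the click/evaluate that triggers the download, so the listener attaches late.
    # Playwright's async API also provides page.expect_download(), which attaches the
    # listener before the trigger runs. The helper below is a minimal, hypothetical
    # illustration of that form; its name, selector argument, and default timeout are
    # assumptions, and nothing above calls it.
    async def _capture_download_via_click(self, page, trigger_selector, save_path, timeout=60000):
        """Click `trigger_selector` and save the resulting download to `save_path`."""
        async with page.expect_download(timeout=timeout) as download_info:
            await page.click(trigger_selector)
        download = await download_info.value
        await download.save_as(save_path)
        return os.path.exists(save_path) and os.path.getsize(save_path) > 0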
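    # Editor sketch: resolve_relative_url() above concatenates path pieces by hand, which
    # can mishandle scheme-relative ("//host/x"), query-only ("?page=2") and chained "../"
    # references. A minimal alternative built on urllib.parse.urljoin (already imported at
    # the top of this module) is sketched here as a hypothetical helper; it is not wired
    # into the extraction code above.
    def resolve_relative_url_with_urljoin(self, relative_url, page_url):
        """Resolve `relative_url` against the full URL of the page it was found on."""
        if not relative_url or relative_url.startswith(('javascript:', 'mailto:', '#')):
            return None
        # urljoin handles absolute URLs, "/path", "./path", "../path", "//host/path"
        # and "?query" forms per RFC 3986.
        return urljoin(page_url, relative_url)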
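    # Editor sketch: deep_search() imports streamlit for its progress widgets, so it needs
    # a running Streamlit session. This hypothetical convenience wrapper shows how the
    # manager can be driven from plain asyncio code instead; the method name and defaults
    # are assumptions, and nothing above depends on it.
    @staticmethod
    async def collect_sublinks(url, limit=100, use_stealth=True):
        """Open a throwaway DownloadManager and return up to `limit` sublinks of `url`."""
        async with DownloadManager(use_stealth=use_stealth) as dm:
            return await dm.get_sublinks(url, limit=limit)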