diff --git "a/download_manager.py" "b/download_manager.py"
deleted file mode 100644
--- "a/download_manager.py"
+++ /dev/null
@@ -1,2932 +0,0 @@
-import os
-import re
-import random
-import asyncio
-import logging
-import traceback
-import tempfile
-import shutil
-import json
-import time
-from urllib.parse import urlparse, urljoin, unquote, parse_qs
-from io import BytesIO
-from bs4 import BeautifulSoup
-import PyPDF2
-import requests
-from PIL import Image
-from reportlab.lib.pagesizes import letter
-from reportlab.pdfgen import canvas
-from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
-
-from app.utils import (
-    get_random_user_agent, sizeof_fmt, get_domain, is_download_link,
-    normalize_download_url, detect_captcha, USER_AGENTS, STEALTH_SETTINGS,
-    PROXY_ROTATION_CONFIG
-)
-
-logger = logging.getLogger(__name__)
-
-class DownloadManager:
-    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False):
-        self.use_proxy = use_proxy
-        self.proxy = proxy
-        self.query = query
-        self.num_results = num_results
-        self.playwright = None
-        self.browser = None
-        self.context = None
-        self.page = None
-        self.use_stealth = use_stealth
-        self.proxy_rotation = proxy_rotation
-        self.request_count = 0
-        self.captcha_detected = False
-        self.download_timeout = 300  # 5 minutes timeout for downloads
-        # Track visited URLs to avoid revisiting the same URL multiple times
-        self.visited_urls = set()
-        # Track successfully downloaded files to avoid redownloading
-        self.downloaded_files = set()
-
-    async def __aenter__(self):
-        self.playwright = await async_playwright().start()
-
-        # Prepare browser args with stealth settings
-        browser_args = [
-            '--no-sandbox',
-            '--disable-setuid-sandbox',
-            '--disable-dev-shm-usage',
-            '--disable-gpu',
-            '--no-zygote',
-            '--single-process',
-            '--disable-web-security',
-            '--disable-features=IsolateOrigins',
-            '--disable-site-isolation-trials'
-        ]
-
-        # Add stealth-specific args
-        if self.use_stealth:
-            browser_args.extend([
-                '--disable-blink-features=AutomationControlled',
-                '--disable-features=IsolateOrigins,site-per-process',
-                '--disable-webgl',
-                '--disable-webrtc'
-            ])
-
-        # Setup browser options
-        opts = {
-            "headless": True,
-            "args": browser_args
-        }
-
-        # Configure proxy if specified
-        if self.use_proxy and self.proxy:
-            opts["proxy"] = {"server": self.proxy}
-
-        # Launch browser with options
-        self.browser = await self.playwright.chromium.launch(**opts)
-
-        # Setup browser context with enhanced settings
-        context_opts = {
-            "user_agent": get_random_user_agent(),
-            "viewport": {"width": 1920, "height": 1080},
-            "device_scale_factor": 1,
-            "has_touch": False,
-            "is_mobile": False,
-            "ignore_https_errors": True,
-            "accept_downloads": True
-        }
-
-        # Apply stealth-specific settings to the context
-        if self.use_stealth:
-            # Apply JS-injection for enhanced stealth
-            context_opts["bypass_csp"] = True
-            self.context = await self.browser.new_context(**context_opts)
-
-            # Execute stealth JS to avoid detection
-            await self.context.add_init_script("""
-                () => {
-                    Object.defineProperty(navigator, 'webdriver', {
-                        get: () => false,
-                    });
-
-                    // Change navigator properties
-                    const newProto = navigator.__proto__;
-                    delete newProto.webdriver;
-
-                    // Overwrite the plugins
-                    Object.defineProperty(navigator, 'plugins', {
-                        get: () => [1, 2, 3, 4, 5].map(() => ({
-                            lengthComputable: true,
-                            loaded: 100,
-                            total: 100
-                        }))
-                    });
-
-                    // Handle languages more naturally
-                    Object.defineProperty(navigator, 'languages', {
-                        get: () => ['en-US', 'en', 'es']
-                    });
-
-                    // Modify hardware concurrency
-                    Object.defineProperty(navigator, 'hardwareConcurrency', {
-                        get: () => 4
-                    });
-
-                    // Modify deviceMemory
-                    Object.defineProperty(navigator, 'deviceMemory', {
-                        get: () => 8
-                    });
-
-                    // WebGL modifications
-                    const getParameter = WebGLRenderingContext.prototype.getParameter;
-                    WebGLRenderingContext.prototype.getParameter = function(parameter) {
-                        if (parameter === 37445) {
-                            return 'Intel Inc.';
-                        }
-                        if (parameter === 37446) {
-                            return 'Intel Iris OpenGL Engine';
-                        }
-                        return getParameter.apply(this, arguments);
-                    };
-                }
-            """)
-        else:
-            # Regular context without stealth
-            self.context = await self.browser.new_context(**context_opts)
-
-        # Create page with enhanced headers
-        self.page = await self.context.new_page()
-        await self.page.set_extra_http_headers({
-            'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-            'Cache-Control': 'max-age=0',
-            'DNT': '1',  # Do Not Track
-            'Referer': 'https://www.google.com/',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'cross-site',
-            'Sec-Fetch-User': '?1',
-            'Upgrade-Insecure-Requests': '1'
-        })
-
-        # Add delay for mouse movements to simulate human behavior
-        if self.use_stealth:
-            await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500))
-            await self.page.wait_for_timeout(random.randint(200, 500))
-
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self.browser:
-            await self.browser.close()
-        if self.playwright:
-            await self.playwright.stop()
-
-    async def rotate_proxy_if_needed(self):
-        """Rotate proxy if proxy rotation is enabled and threshold is reached"""
-        if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]:
-            self.request_count += 1
-            if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]:
-                # Get next proxy from the pool
-                next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0)
-                PROXY_ROTATION_CONFIG["proxies"].append(next_proxy)  # Move to end of list
-
-                # Close existing context and create new one with the new proxy
-                if self.context:
-                    await self.context.close()
-
-                # Create new context with the new proxy
-                context_opts = {
-                    "user_agent": get_random_user_agent(),
-                    "proxy": {"server": next_proxy},
-                    "accept_downloads": True
-                }
-                self.context = await self.browser.new_context(**context_opts)
-                self.page = await self.context.new_page()
-
-                # Reset counter
-                self.request_count = 0
-                logger.info(f"Rotated to new proxy: {next_proxy}")
-
-    async def handle_captcha(self, page):
-        """Detect and handle captchas if possible"""
-        # Check for common captcha patterns
-        content = await page.content()
-        if detect_captcha(content):
-            self.captcha_detected = True
-            logger.warning("Captcha detected on page")
-
-            # Strategies for handling captchas:
-            # 1. For simple captchas, try to extract the image and solve it
-            captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]')
-            if captcha_img:
-                logger.info("Found captcha image, attempting to capture")
-
-                # Take screenshot of the captcha
-                captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png")
-                await captcha_img.screenshot(path=captcha_path)
-
-                # In a real implementation, you would send this to a captcha solving service
-                # For now, just log the detection
-                logger.info(f"Captcha image saved to {captcha_path}")
-
-                # For demonstration, we'll notify the user but not actually solve it
-                return False
-
-            # 2. For reCAPTCHA, special handling would be required
-            recaptcha = await page.query_selector('iframe[src*="recaptcha"]')
-            if recaptcha:
-                logger.warning("reCAPTCHA detected, would require external solving service")
-                return False
-
-            # 3. Try to perform human-like actions that might bypass simple bot checks
-            await self.perform_human_actions(page)
-
-            # Check if captcha is still present
-            content = await page.content()
-            if detect_captcha(content):
-                logger.warning("Captcha still present after human-like actions")
-                return False
-            else:
-                logger.info("Captcha appears to be resolved")
-                return True
-
-        return True  # No captcha detected
-
-    async def perform_human_actions(self, page):
-        """Perform human-like actions on the page to possibly bypass simple bot checks"""
-        try:
-            # 1. Slowly scroll down the page
-            for i in range(3):
-                await page.evaluate(f"window.scrollTo(0, {i * 300})")
-                await page.wait_for_timeout(random.randint(300, 700))
-
-            # 2. Random mouse movements
-            for _ in range(3):
-                x = random.randint(100, 800)
-                y = random.randint(100, 600)
-                await page.mouse.move(x=x, y=y)
-                await page.wait_for_timeout(random.randint(200, 500))
-
-            # 3. Click on a non-essential part of the page
-            try:
-                await page.click("body", position={"x": 50, "y": 50})
-            except:
-                pass
-
-            # 4. Wait a bit before continuing
-            await page.wait_for_timeout(1000)
-
-        except Exception as e:
-            logger.warning(f"Error during human-like actions: {e}")
-
-    async def search_bing(self):
-        urls = []
-        try:
-            # Rotate proxy if needed
-            await self.rotate_proxy_if_needed()
-
-            search_url = f"https://www.bing.com/search?q={self.query}"
-            await self.page.goto(search_url, timeout=30000)
-            await self.page.wait_for_load_state('networkidle')
-
-            # Check for captchas
-            if not await self.handle_captcha(self.page):
-                logger.warning("Captcha detected during search, results may be limited")
-
-            # More natural scrolling behavior
-            for i in range(3):
-                await self.page.evaluate(f"window.scrollTo(0, {i * 400})")
-                await self.page.wait_for_timeout(random.randint(300, 800))
-
-            # Extract search results
-            links = await self.page.query_selector_all("li.b_algo h2 a")
-            for link in links[:self.num_results]:
-                href = await link.get_attribute('href')
-                if href:
-                    urls.append(href)
-
-            # If we didn't find enough results, try an alternative selector
-            if len(urls) < self.num_results:
-                alt_links = await self.page.query_selector_all(".b_caption a")
-                for link in alt_links:
-                    href = await link.get_attribute('href')
-                    if href and href not in urls:
-                        urls.append(href)
-                    if len(urls) >= self.num_results:
-                        break
-
-            return urls
-        except Exception as e:
-            logger.error(f"Error searching Bing: {e}")
-            return []
-
-    async def get_file_size(self, url):
-        try:
-            await self.rotate_proxy_if_needed()
-
-            # For complex download URLs, we need to be careful with HEAD requests
-            if '?'
in url or 'Action=downloadfile' in url or 'fname=' in url: - # For these URLs, we'll try a more reliable approach using range headers - headers = { - 'User-Agent': get_random_user_agent(), - 'Range': 'bytes=0-0' # Just request the first byte to check headers - } - - try: - with requests.get(url, headers=headers, stream=True, timeout=10) as r: - if 'Content-Range' in r.headers: - content_range = r.headers['Content-Range'] - match = re.search(r'bytes 0-0/(\d+)', content_range) - if match: - size = int(match.group(1)) - return sizeof_fmt(size) - - if 'Content-Length' in r.headers: - size = int(r.headers['Content-Length']) - # If size is 1, it's likely just our single requested byte - if size > 1: - return sizeof_fmt(size) - except Exception as e: - logger.warning(f"Error getting file size with Range request: {e}") - - # Fallback to browser approach - try: - async with self.context.new_page() as page: - response = await page.request.head(url, timeout=15000) - length = response.headers.get('Content-Length', None) - if length: - return sizeof_fmt(int(length)) - except Exception as e: - logger.warning(f"Error getting file size with browser: {e}") - - return "Unknown Size" - else: - # Standard approach for normal URLs - async with self.context.new_page() as page: - response = await page.request.head(url, timeout=15000) - length = response.headers.get('Content-Length', None) - if length: - return sizeof_fmt(int(length)) - else: - return "Unknown Size" - except Exception as e: - logger.warning(f"Error getting file size: {e}") - return "Unknown Size" - - async def get_pdf_metadata(self, url): - try: - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - resp = await page.request.get(url, timeout=15000) - if resp.ok: - content = await resp.body() - pdf = BytesIO(content) - reader = PyPDF2.PdfReader(pdf) - return { - 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', - 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', - 'Pages': len(reader.pages), - } - else: - return {} - except Exception as e: - logger.warning(f"Error reading PDF metadata: {e}") - return {} - - async def extract_real_download_url(self, url): - """Enhanced method to extract real download URL, handling complex URLs""" - try: - # Check if this is a complex download URL that needs special handling - if 'Action=downloadfile' in url or 'fname=' in url: - logger.info(f"Complex download URL detected: {url}") - - # For these special cases, we'll use the browser to navigate and intercept redirects - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - # Set up request interception to capture redirects - await page.route('**', lambda route: route.continue_()) - - # Listen for all responses - responses = [] - page.on('response', lambda response: responses.append(response)) - - try: - # Go to the URL - await page.goto(url, wait_until='networkidle', timeout=30000) - - # Check all responses for potential downloads - for response in responses: - # Look for content-disposition headers indicating a download - content_disposition = response.headers.get('Content-Disposition', '') - if 'attachment' in content_disposition or 'filename=' in content_disposition: - return response.url - - # Look for content-type headers indicating a file - content_type = response.headers.get('Content-Type', '') - if content_type and content_type != 'text/html' and not content_type.startswith('text/'): - return response.url - - # If no clear download was 
detected, return the final URL - return page.url - except Exception as e: - logger.warning(f"Error extracting real download URL: {e}") - return url - else: - # Standard approach for normal URLs - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - response = await page.goto(url, wait_until='networkidle', timeout=30000) - if response and response.headers.get('location'): - return response.headers['location'] - return page.url - except Exception as e: - logger.error(f"Error extracting real download URL: {e}") - return url - - # IMPROVED: Enhanced exam links extraction method - async def get_edu_exam_links(self, url): - """Specialized method for educational exam websites that follows a common pattern.""" - try: - logger.info(f"Fetching exam links from {url}") - links = set() - - # First try with direct requests for speed (but with proper headers) - headers = { - "User-Agent": get_random_user_agent(), - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Referer": "https://www.google.com/", - "DNT": "1" - } - - try: - response = requests.get(url, headers=headers, timeout=30) - - if response.status_code == 200: - # Parse with BeautifulSoup first for efficiency - soup = BeautifulSoup(response.text, "html.parser") - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - - # Look for all links - for a in soup.find_all("a", href=True): - href = a["href"] - full_url = urljoin(url, href) - - # Look for text clues - link_text = a.get_text().lower() - - # Special patterns for exam sites (expanded list) - url_patterns = [ - "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", - "/test/", "/download/", "/files/", "/assignments/", - "paper_", "question_", "exam_", "test_", "past_", - "assignment_", "sample_", "study_material", "notes_", - "/resource/", "/subject/", "/course/", "/material/" - ] - - text_patterns = [ - "exam", "paper", "test", "question", "past", "download", - "assignment", "sample", "study", "material", "notes", - "subject", "course", "resource", "pdf", "document", - "view", "open", "get", "solution", "answer" - ] - - # Check URL for patterns - if any(pattern in full_url.lower() for pattern in url_patterns): - links.add(full_url) - continue - - # Check link text for patterns - if any(pattern in link_text for pattern in text_patterns): - links.add(full_url) - continue - - # Check for common file extensions - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(full_url) - - # Check for download script parameters - if "Action=downloadfile" in url or "fname=" in url: - links.add(url) # Add the URL itself as it's a download link - except Exception as e: - logger.warning(f"Request-based extraction failed: {e}") - - # Browser-based approach for more thorough extraction or if initial approach was inadequate - try: - # Check if we need to proceed with browser-based extraction - if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: - logger.info("Using browser for enhanced link extraction") - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Navigate to the page with more natural timing - await self.page.goto(url, timeout=45000, wait_until='networkidle') - await self.page.wait_for_timeout(random.randint(1000, 2000)) - - # Handle captchas if present - if not await self.handle_captcha(self.page): - logger.warning("Captcha 
detected, extraction may be limited") - - # Get base URL for resolving relative links - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - - # Perform natural scrolling to trigger lazy-loaded content - page_height = await self.page.evaluate("document.body.scrollHeight") - viewport_height = await self.page.evaluate("window.innerHeight") - - for scroll_pos in range(0, page_height, viewport_height // 2): - await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") - await self.page.wait_for_timeout(random.randint(300, 800)) - - # Scroll back to top - await self.page.evaluate("window.scrollTo(0, 0)") - await self.page.wait_for_timeout(500) - - # Extract all links with Playwright (better than just anchor tags) - all_links = await self.page.evaluate(""" - () => { - const results = []; - - // Get all anchor tags - const anchors = document.querySelectorAll('a[href]'); - for (const a of anchors) { - if (a.href) { - results.push({ - href: a.href, - text: a.innerText || a.textContent || '', - isButton: a.classList.contains('btn') || a.role === 'button' - }); - } - } - - // Get buttons that might contain links - const buttons = document.querySelectorAll('button'); - for (const btn of buttons) { - const onclick = btn.getAttribute('onclick') || ''; - if (onclick.includes('window.location') || onclick.includes('download')) { - results.push({ - href: '#button', - text: btn.innerText || btn.textContent || '', - isButton: true, - onclick: onclick - }); - } - } - - return results; - } - """) - - # Process the extracted links - for link_info in all_links: - href = link_info.get('href', '') - text = link_info.get('text', '').lower() - - if href and href != '#button': - # Check URL patterns - url_patterns = [ - "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", - "/test/", "/download/", "/files/", "/assignments/", - "paper_", "question_", "exam_", "test_", "past_", - "assignment_", "sample_", "study_material", "notes_" - ] - - # Check text patterns - text_patterns = [ - "exam", "paper", "test", "question", "past", "download", - "assignment", "sample", "study", "material", "notes", - "pdf", "document", "view", "open", "solution" - ] - - if any(pattern in href.lower() for pattern in url_patterns) or \ - any(pattern in text for pattern in text_patterns) or \ - any(href.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(href) - - # Check for download links in the page - download_links = await self.page.evaluate(""" - () => { - // Find all links that might be download links - const links = Array.from(document.querySelectorAll('a[href]')); - return links - .filter(a => { - const href = a.href.toLowerCase(); - return href.includes('download') || - href.includes('getfile') || - href.includes('view.php') || - href.includes('action=downloadfile') || - href.includes('fname='); - }) - .map(a => a.href); - } - """) - - for dl_link in download_links: - links.add(dl_link) - - # Check for ASP.NET specific elements that might contain exam links - grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') - for grid in grid_elements: - grid_links = await grid.query_selector_all('a[href]') - for a in grid_links: - href = await a.get_attribute('href') - text = await a.text_content() - - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - links.add(full_url) - - # Try clicking pagination controls to reveal more content - 
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') - for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons - try: - # Check if this is a numeric pagination button (more likely to be useful) - button_text = await button.text_content() - if button_text and button_text.strip().isdigit(): - logger.info(f"Clicking pagination button: {button_text}") - await button.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Extract links from this page - new_page_links = await self.page.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); - } - """) - - for href in new_page_links: - if href and not href.startswith('javascript:'): - if any(pattern in href.lower() for pattern in url_patterns) or \ - any(href.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(href) - except Exception as e: - logger.warning(f"Error clicking pagination button: {e}") - - # Try clicking any controls that might reveal more exam links (more focused approach) - show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') - for button in show_buttons: - button_text = (await button.text_content() or "").lower() - button_value = (await button.get_attribute("value") or "").lower() - button_id = (await button.get_attribute("id") or "").lower() - - # Look for buttons that seem likely to reveal file lists - promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", - "download", "resource", "material", "browse", "file"] - - if any(term in button_text or term in button_value or term in button_id - for term in promising_terms): - try: - logger.info(f"Clicking button: {button_text or button_value}") - await button.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Get any new links that appeared - new_links = await self.page.query_selector_all('a[href]') - for a in new_links: - href = await a.get_attribute('href') - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - - # Focus on file extensions and patterns - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ - any(pattern in full_url.lower() for pattern in url_patterns): - links.add(full_url) - except Exception as e: - logger.warning(f"Error clicking button: {e}") - - # Special handling for ASP.NET PostBack links - try: - # Find and interact with ASP.NET __doPostBack elements - postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') - for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks - try: - onclick = await element.get_attribute('onclick') - if onclick and '__doPostBack' in onclick: - element_text = await element.text_content() - - # Only interact with elements that seem likely to contain exam links - promising_terms = ["show", "view", "list", "exam", "paper", "test", - "download", "resource", "material"] - - if any(term in element_text.lower() for term in promising_terms): - logger.info(f"Clicking ASP.NET postback element: {element_text}") - - # Click the element - await element.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Extract any new links - new_links = await 
self.page.query_selector_all('a[href]') - for a in new_links: - href = await a.get_attribute('href') - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(full_url) - except Exception as e: - logger.warning(f"Error interacting with postback element: {e}") - except Exception as e: - logger.warning(f"Error during postback handling: {e}") - - except Exception as e: - logger.error(f"Browser-based extraction failed: {e}") - - # Filter links to likely contain exam documents - filtered_links = [] - for link in links: - # Common file extensions for exam documents - if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - filtered_links.append(link) - continue - - # Common paths for exam documents - if any(pattern in link.lower() for pattern in [ - "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", - "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", - "/resource/", "/material/", "/notes/", "/subjectmaterial/" - ]): - filtered_links.append(link) - continue - - # Check for download links (these may not have obvious extensions) - if is_download_link(link): - filtered_links.append(link) - - logger.info(f"Found {len(filtered_links)} potential exam document links") - return filtered_links - - except Exception as e: - logger.error(f"Error getting exam links: {e}") - return [] - - async def discover_hidden_links(self, page): - """Discover hidden links that might be in JavaScript, iframes, or dynamic content""" - hidden_links = set() - - # Execute JavaScript to find links in script tags and data attributes - js_links = await page.evaluate(""" - () => { - const links = new Set(); - - // Extract URLs from script tags - const scripts = document.querySelectorAll('script'); - for (const script of scripts) { - const content = script.textContent || ''; - const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; - for (let match of urlMatches) { - links.add(match.replace(/["']/g, '')); - } - } - - // Look for download-related variables in scripts - for (const script of scripts) { - const content = script.textContent || ''; - // Look for common patterns for file URLs in JavaScript - if (content.includes('downloadURL') || content.includes('fileURL') || - content.includes('pdfURL') || content.includes('documentURL')) { - - // Extract potential URLs - const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; - for (let match of potentialUrls) { - const url = match.replace(/["']/g, ''); - // Try to resolve relative URLs - if (url.startsWith('/') || !url.includes('://')) { - if (url.startsWith('/')) { - links.add(window.location.origin + url); - } else { - // Handle relative paths more carefully - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + url); - } - } else if (url.startsWith('http')) { - links.add(url); - } - } - } - } - - // Check for links in data attributes - const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); - for (const el of elements) { - for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { - const val = el.getAttribute(attr); - if (val) { - // Try to resolve relative URLs - if (val.startsWith('/')) { - 
links.add(window.location.origin + val); - } else if (val.startsWith('http')) { - links.add(val); - } else if (!val.startsWith('javascript:') && !val.startsWith('#')) { - // Handle relative paths - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + val); - } - } - } - } - - // Look for URLs in inline event handlers - const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); - for (const el of clickableElements) { - for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { - const val = el.getAttribute(attr); - if (val) { - // Check for JavaScript URLs with window.location - if (val.includes('window.location') || val.includes('document.location')) { - const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); - if (urlMatch && urlMatch[1]) { - const url = urlMatch[1]; - if (url.startsWith('/')) { - links.add(window.location.origin + url); - } else if (url.startsWith('http')) { - links.add(url); - } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + url); - } - } - } - - // Check for direct URLs in attributes - const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; - for (let match of urlMatches) { - links.add(match.replace(/["']/g, '')); - } - - // Check for download.php and similar patterns - if (val.includes('download.php') || val.includes('getfile.php') || - val.includes('Action=downloadfile') || val.includes('viewfile.php')) { - - // Handle both onclick handlers and direct hrefs - let url = ''; - if (attr === 'href') { - url = val; - } else { - // Extract URL from JavaScript - const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); - if (jsUrlMatch) { - url = jsUrlMatch[1]; - } - } - - // Resolve URL if needed - if (url) { - if (url.startsWith('/')) { - links.add(window.location.origin + url); - } else if (url.startsWith('http')) { - links.add(url); - } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + url); - } - } - } - } - } - } - - // Find PHP/ASP file download links - const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); - for (const link of fileLinks) { - links.add(link.href); - } - - return Array.from(links); - } - """) - - for link in js_links: - hidden_links.add(link) - - # Extract links from iframes - iframes = await page.query_selector_all('iframe') - for iframe in iframes: - try: - frame = await iframe.content_frame() - if frame: - iframe_links = await frame.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')) - .map(a => a.href) - .filter(href => href.startsWith('http')); - } - """) - for link in iframe_links: - hidden_links.add(link) - except Exception as e: - logger.warning(f"Could not extract links from iframe: {e}") - - # Look for links in shadow DOM (used in modern web components) - shadow_links = await page.evaluate(""" - () => { - const links = new Set(); - - // Helper function to recursively process shadow roots - function processShadowRoot(root) { - if (!root) return; - - // Get links in this shadow root - const shadowLinks = root.querySelectorAll('a[href]'); - 
for (const link of shadowLinks) { - if (link.href && link.href.startsWith('http')) { - links.add(link.href); - } - } - - // Process nested shadow roots - const elements = root.querySelectorAll('*'); - for (const el of elements) { - if (el.shadowRoot) { - processShadowRoot(el.shadowRoot); - } - } - } - - // Find all shadow roots in the document - const elements = document.querySelectorAll('*'); - for (const el of elements) { - if (el.shadowRoot) { - processShadowRoot(el.shadowRoot); - } - } - - return Array.from(links); - } - """) - - for link in shadow_links: - hidden_links.add(link) - - # Look for download links in forms - form_links = await page.evaluate(""" - () => { - const links = new Set(); - - // Check for form actions that might be download endpoints - const forms = document.querySelectorAll('form'); - for (const form of forms) { - const action = form.action || ''; - if (action && ( - action.includes('download') || - action.includes('getfile') || - action.includes('viewfile') || - action.includes('Action=downloadfile') - )) { - // Collect input values that might be needed for the download - const inputs = {}; - const formInputs = form.querySelectorAll('input[name]'); - for (const input of formInputs) { - inputs[input.name] = input.value; - } - - // Store both the form action and any important inputs - links.add(action); - } - } - - return Array.from(links); - } - """) - - for link in form_links: - hidden_links.add(link) - - return hidden_links - - async def extract_downloadable_files(self, url, custom_ext_list): - found_files = [] - try: - # Normalize the URL to handle special cases - normalized_url = normalize_download_url(url) - - # Skip if we've already visited this URL - if normalized_url in self.visited_urls: - logger.info(f"Skipping already visited URL: {normalized_url}") - return [] - - # Mark this URL as visited - self.visited_urls.add(normalized_url) - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # First check if this is a direct download link (Action=downloadfile or fname parameter) - if is_download_link(normalized_url): - logger.info(f"Processing potential direct download link: {normalized_url}") - - # Try to extract the real download URL if needed - real_url = await self.extract_real_download_url(normalized_url) - - # Determine filename - for complex URLs this can be tricky - filename = os.path.basename(urlparse(real_url).path) - - # Handle URL-encoded filenames - if '%' in filename: - try: - filename = unquote(filename) - except Exception: - pass - - # For URLs with download parameters, try to extract filename from query - if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): - # Look for file parameter - params = parse_qs(urlparse(normalized_url).query) - - # Check common filename parameters - for param in ['file', 'filename', 'name', 'fname', 'f']: - if param in params and params[param]: - potential_filename = params[param][0] - if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: - filename = os.path.basename(potential_filename) - break - - # If still no valid filename, use domain-based fallback - if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): - domain = get_domain(real_url) - # Try to determine file type from content-type or extension hints in URL - ext = '.pdf' # Default - for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: - if common_ext in normalized_url.lower(): - ext = 
common_ext - break - filename = f"file_from_{domain}{ext}" - - # Get file size - size_str = await self.get_file_size(real_url) - - # Add to found files - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': size_str, - 'metadata': {}, - 'download_url': normalized_url # Keep original URL for downloading - }) - - # For direct download links, we can return early - if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): - return found_files - - # Special handling for educational exam sites - if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): - logger.info("Using specialized handler for educational exam site") - - # Get direct links to exam files - exam_links = await self.get_edu_exam_links(url) - - for link in exam_links: - # Try to resolve any redirection - real_url = await self.extract_real_download_url(link) - filename = os.path.basename(urlparse(real_url).path) - - # If filename is URL encoded (common with Chinese/international sites) - if '%' in filename: - try: - filename = unquote(filename) - except Exception: - pass - - # If filename is empty or invalid, create a sensible one - if not filename or filename == '/': - domain = get_domain(real_url) - ext = '.pdf' # Default - for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: - if common_ext in link.lower(): - ext = common_ext - break - filename = f"file_from_{domain}{ext}" - - # Get file size - size_str = await self.get_file_size(real_url) - - # Get metadata for PDFs - meta = {} - if real_url.lower().endswith('.pdf'): - try: - meta = await self.get_pdf_metadata(real_url) - except Exception: - pass - - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': size_str, - 'metadata': meta, - 'download_url': link # Store original link for downloading - }) - - # If we found exam files with the specialized method, return them - if found_files: - return found_files - - # Standard extraction method if specialized method didn't find files - response = await self.page.goto(url, timeout=30000, wait_until='networkidle') - if not response: - return [] - - # Check for captchas - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected, file extraction may be limited") - - # Scroll through the page naturally to trigger lazy loading - await self.page.evaluate(""" - (async () => { - const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); - const height = document.body.scrollHeight; - const scrollStep = Math.floor(window.innerHeight / 2); - - for (let i = 0; i < height; i += scrollStep) { - window.scrollTo(0, i); - await delay(100); - } - - window.scrollTo(0, 0); - })() - """) - await self.page.wait_for_timeout(1000) - - final_url = self.page.url - if '.php' in final_url or 'download' in final_url: - real_url = await self.extract_real_download_url(final_url) - if real_url != final_url: - # Try to detect the filename from headers or URL - response = await self.page.request.head(real_url, timeout=15000) - filename = None - - # Try to get from Content-Disposition header - content_disposition = response.headers.get('Content-Disposition', '') - if 'filename=' in content_disposition: - filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) - if filename_match: - filename = filename_match.group(1) - - # If not found in headers, use URL basename - if not filename: - filename = os.path.basename(urlparse(real_url).path) - if not 
filename or filename == '/': - # Generate a name based on domain - domain = get_domain(real_url) - ext = '.pdf' # Default - for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: - if common_ext in real_url.lower(): - ext = common_ext - break - filename = f"file_from_{domain}{ext}" - - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': await self.get_file_size(real_url), - 'metadata': {}, - 'download_url': final_url # Keep original URL for downloading - }) - return found_files - - await self.page.wait_for_load_state('networkidle', timeout=30000) - content = await self.page.content() - soup = BeautifulSoup(content, 'html.parser') - - default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', - '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', - '.pptx', '.odt', '.txt'] - all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) - - parsed_base = urlparse(final_url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - path_base = os.path.dirname(parsed_base.path) - - # Process all anchor tags - for a in soup.find_all('a', href=True): - href = a['href'].strip() - - if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - real_url = await self.extract_real_download_url(full_url) - if real_url and real_url != full_url: - found_files.append({ - 'url': real_url, - 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', - 'size': await self.get_file_size(real_url), - 'metadata': {}, - 'download_url': full_url # Original URL for download - }) - continue - - if any(href.lower().endswith(ext) for ext in all_exts): - file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - size_str = await self.get_file_size(file_url) - meta = {} - if file_url.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(file_url) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': size_str, - 'metadata': meta, - 'download_url': file_url # Same as URL for direct links - }) - - # Handle Google Drive links - elif ("drive.google.com" in href) or ("docs.google.com" in href): - file_id = None - for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: - match = re.search(pattern, href) - if match: - file_id = match.group(1) - break - if file_id: - # Get file info to determine type and view-only status - file_type, is_view_only = await self.get_google_drive_file_info(file_id) - - # Create a more informative filename based on info - filename = f"gdrive_{file_id}" - if file_type: - filename = f"{filename}.{file_type}" - - size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") - - found_files.append({ - 'url': href, # Use original URL - 'filename': filename, - 'size': size_str, - 'metadata': { - 'view_only': is_view_only, - 'file_type': file_type, - 'file_id': file_id - }, - 'download_url': href # Same as URL for Google Drive - }) - - # Also check for files in other elements (iframe, embed, object, etc.) 
- other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) - for elem in other_elements: - src = elem.get('src') or elem.get('data') - if src and any(src.lower().endswith(ext) for ext in all_exts): - file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) - size_str = await self.get_file_size(file_url) - meta = {} - if file_url.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(file_url) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': size_str, - 'metadata': meta, - 'download_url': file_url - }) - - # Check for file links in onclick attributes - onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') - for elem in onclick_elements: - onclick = await elem.get_attribute('onclick') - urls = re.findall(r'(https?://[^\'"]+)', onclick) - for url_match in urls: - if any(url_match.lower().endswith(ext) for ext in all_exts): - size_str = await self.get_file_size(url_match) - meta = {} - if url_match.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(url_match) - found_files.append({ - 'url': url_match, - 'filename': os.path.basename(url_match.split('?')[0]), - 'size': size_str, - 'metadata': meta, - 'download_url': url_match - }) - - # Also check for data-src and data-url attributes (common in lazy-loaded sites) - data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') - for elem in data_elements: - for attr in ['data-src', 'data-url', 'data-href', 'data-download']: - try: - value = await elem.get_attribute(attr) - if value and any(value.lower().endswith(ext) for ext in all_exts): - file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': await self.get_file_size(file_url), - 'metadata': {}, - 'download_url': file_url - }) - except: - pass - - # Check script tags for JSON data that might contain file URLs - script_elements = soup.find_all('script', type='application/json') - for script in script_elements: - try: - json_data = json.loads(script.string) - # Look for URL patterns in the JSON data - def extract_urls_from_json(obj, urls_found=None): - if urls_found is None: - urls_found = [] - if isinstance(obj, dict): - for k, v in obj.items(): - # Check if any key contains url-like terms - url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] - if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): - urls_found.append(v) - else: - extract_urls_from_json(v, urls_found) - elif isinstance(obj, list): - for item in obj: - extract_urls_from_json(item, urls_found) - return urls_found - - json_urls = extract_urls_from_json(json_data) - for json_url in json_urls: - if any(json_url.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': json_url, - 'filename': os.path.basename(json_url.split('?')[0]), - 'size': await self.get_file_size(json_url), - 'metadata': {}, - 'download_url': json_url - }) - except: - pass - - # Check for hidden download buttons or forms - hidden_elements = await self.page.evaluate(""" - () => { - const results = []; - - // Check for hidden forms with download actions - const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); - for (const form of forms) { - const action = 
form.getAttribute('action') || ''; - results.push({ - type: 'form', - action: action, - inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { - return {name: input.name, value: input.value}; - }) - }); - } - - // Check for hidden download links/buttons - const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { - const style = window.getComputedStyle(a); - return (style.display === 'none' || style.visibility === 'hidden') && - (a.href.includes('download') || a.href.includes('file')); - }); - - for (const link of hiddenLinks) { - results.push({ - type: 'link', - href: link.href, - text: link.innerText || link.textContent - }); - } - - return results; - } - """) - - # Process hidden elements - for elem in hidden_elements: - if elem['type'] == 'link' and 'href' in elem: - href = elem['href'] - if any(href.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': href, - 'filename': os.path.basename(href.split('?')[0]), - 'size': await self.get_file_size(href), - 'metadata': {}, - 'download_url': href - }) - - # Check for hidden links that might be in JavaScript, iframes, or dynamic content - hidden_links = await self.discover_hidden_links(self.page) - for link in hidden_links: - if any(link.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': link, - 'filename': os.path.basename(link.split('?')[0]), - 'size': await self.get_file_size(link), - 'metadata': {}, - 'download_url': link - }) - - # Deduplicate files by URL - seen_urls = set() - unique_files = [] - for f in found_files: - if f['url'] not in seen_urls: - seen_urls.add(f['url']) - unique_files.append(f) - - return unique_files - except Exception as e: - logger.error(f"Error extracting files from {url}: {e}") - traceback.print_exc() - return [] - - async def download_file(self, file_info, save_dir, referer): - file_url = file_info.get('download_url', file_info['url']) # Use download_url if available - fname = file_info['filename'] - path = os.path.join(save_dir, fname) - base, ext = os.path.splitext(fname) - counter = 1 - while os.path.exists(path): - path = os.path.join(save_dir, f"{base}_{counter}{ext}") - counter += 1 - os.makedirs(save_dir, exist_ok=True) - - # Check if we've already downloaded this file - if file_url in self.downloaded_files: - logger.info(f"File already downloaded: {file_url}") - return None - - try: - # Special handling for Google Drive files - if "drive.google.com" in file_url or "docs.google.com" in file_url: - # Check if it's marked as view-only in metadata - is_view_only = file_info.get('metadata', {}).get('view_only', False) - - # For view-only files, try our most robust approach first - if is_view_only: - logger.info(f"Attempting to download view-only file: {file_url}") - result_path = await self._force_download_viewonly(file_info, path) - if result_path: - self.downloaded_files.add(file_url) - return result_path - - # If that failed, try the regular download approach - logger.info("Primary method failed, trying fallback methods") - - # Try regular download methods - success = await self._download_from_google_drive(file_url, path) - if success: - self.downloaded_files.add(file_url) - return path - - # If all methods failed for Google Drive, try one last approach - logger.warning("All standard methods failed, attempting force download") - result_path = await self._force_download_viewonly(file_info, path) - if result_path: - self.downloaded_files.add(file_url) - return result_path if result_path else None - - # Special 
handling for complex download URLs - if 'Action=downloadfile' in file_url or 'fname=' in file_url: - logger.info(f"Using browser download approach for complex URL: {file_url}") - - # For these URLs, we'll need to navigate to the page and handle the download - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - # Set up download event listener - download_promise = page.wait_for_event("download") - - # Navigate to the URL - await page.goto(file_url, timeout=60000) - - # Wait for the download to start - try: - download = await download_promise - await download.save_as(path) - - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except Exception as e: - logger.error(f"Browser download failed: {e}") - - # If download didn't start automatically, try to find and click download buttons - download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]') - for button in download_buttons: - try: - await button.click() - try: - download = await download_promise - await download.save_as(path) - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except: - pass - except: - continue - - # If browser approach failed, try direct request as last resort - logger.info("Browser approach failed, trying direct request") - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Try with direct requests first (faster) - try: - headers = { - 'User-Agent': get_random_user_agent(), - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Referer': referer, - 'DNT': '1' - } - - with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: - if response.status_code == 200: - # Check content type to verify it's not HTML/error page - content_type = response.headers.get('Content-Type', '') - if 'text/html' in content_type and not file_url.endswith('.html'): - logger.warning(f"Received HTML instead of expected file: {file_url}") - else: - with open(path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # Verify file was downloaded correctly - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except Exception as e: - logger.warning(f"Direct download failed: {e}, trying browser approach") - - # Original code for non-Google Drive downloads using Playwright - async with self.context.new_page() as page: - headers = { - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Referer': referer - } - - # Try to download with timeout protection - try: - response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) - if response.status == 200: - content = await response.body() - with open(path, 'wb') as f: - f.write(content) - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - else: - logger.error(f"Download failed with status {response.status}: {file_url}") - - # Try to extract error information - error_info = await response.text() - logger.debug(f"Error response: {error_info[:200]}...") - - # Check if this might be a captcha or login issue - if detect_captcha(error_info): - logger.warning("Captcha detected during download") - # For HF Spaces, we can't implement browser-based captcha solving here - # Just log the issue for now - except PlaywrightTimeoutError: - 
logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") - - # Try an alternative approach - using the browser's download manager - try: - logger.info("Trying browser download manager approach") - download_promise = page.wait_for_event("download") - await page.goto(file_url, timeout=60000) - - # Wait for download to start (with timeout) - download = await download_promise - await download.save_as(path) - - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except Exception as e: - logger.error(f"Browser download manager approach failed: {e}") - - return None - except Exception as e: - logger.error(f"Error downloading {file_url}: {e}") - return None - - # IMPROVED: Split force_download_viewonly into smaller methods - async def _force_download_viewonly(self, file_info, save_path): - """Main method to handle view-only files, now simplified""" - # Extract the file ID - file_id = self._extract_drive_file_id(file_info) - if not file_id: - logger.error("Could not extract file ID") - return None - - # Get file type information - file_type = file_info.get('metadata', {}).get('file_type', 'pdf') - base, ext = os.path.splitext(save_path) - if not ext: - save_path = f"{base}.{file_type}" - - logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") - - # Create a stealth browser for handling the download - browser = await self._create_stealth_browser() - - try: - # Set up the browser page - page = await browser.new_page() - - # Go to the file view page - logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) - await page.wait_for_load_state('networkidle') - - # Check for permission issues - content = await page.content() - if "the owner has not granted you permission to" in content: - logger.warning("Permission denied error detected") - return None - - # Wait for the page to stabilize - await page.wait_for_timeout(random.randint(3000, 7000)) - - # Create temp directory for working files - temp_dir = tempfile.mkdtemp() - - # Handle different file types - if file_type.lower() == 'pdf': - return await self._download_viewonly_pdf(page, file_id, save_path, temp_dir) - else: - return await self._download_viewonly_other(page, file_id, file_type, save_path, temp_dir) - - except Exception as e: - logger.error(f"Error during force download: {e}") - return None - finally: - await browser.close() - - def _extract_drive_file_id(self, file_info): - """Extract Google Drive file ID from file info""" - # Try to get file ID from metadata - file_id = file_info.get('metadata', {}).get('file_id') - if file_id: - return file_id - - # If not in metadata, try to extract from URL - url = file_info.get('url', '') - for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: - match = re.search(pattern, url) - if match: - return match.group(1) - - return None - - async def _create_stealth_browser(self): - """Create a stealth browser instance for handling sensitive downloads""" - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-site-isolation-trials', - '--disable-blink-features=AutomationControlled' # Anti-detection - ] - - browser = await self.playwright.chromium.launch( - headless=True, - args=browser_args - ) - - # Use higher resolution 
for better quality - context = await browser.new_context( - viewport={'width': 1600, 'height': 1200}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - device_scale_factor=2.0, - accept_downloads=True # Critical for the download workflow - ) - - # Add anti-detection script - await context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - lengthComputable: true, - loaded: 100, - total: 100 - })) - }); - - // Handle languages - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - - // Modify hardware concurrency - Object.defineProperty(navigator, 'hardwareConcurrency', { - get: () => 4 - }); - } - """) - - return browser - - async def _download_viewonly_pdf(self, page, file_id, save_path, temp_dir): - """Handle downloading view-only PDF files""" - try: - # Estimate number of pages - estimated_pages = await page.evaluate(""" - () => { - // Method 1: Check page counter text - const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { - const text = el.textContent || ''; - return /\\d+\\s*\\/\\s*\\d+/.test(text); - }); - - if (pageCounters.length > 0) { - const text = pageCounters[0].textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) return parseInt(match[2]); - } - - // Method 2: Check actual page elements - const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pageElements.length > 0) return pageElements.length; - - // Method 3: Look for page thumbnails - const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); - if (thumbnails.length > 0) return thumbnails.length; - - // Fallback: conservative guess - return 50; - } - """) - - logger.info(f"Estimated {estimated_pages} pages in PDF") - - # Initial scroll to trigger lazy loading - logger.info("Initial scroll to bottom to trigger lazy loading...") - await page.keyboard.press("End") - await page.wait_for_timeout(3000) - - # Scroll page by page to ensure all pages are loaded - logger.info("Scrolling page by page...") - max_attempts = min(estimated_pages * 3, 300) - attempt = 0 - prev_blob_count = 0 - - while attempt < max_attempts: - blob_count = await page.evaluate(""" - Array.from(document.getElementsByTagName('img')) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .length - """) - - logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") - - if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): - logger.info("All pages appear to be loaded.") - break - - # Alternate between PageDown and End keys for more natural scrolling - if attempt % 3 == 0: - await page.keyboard.press("End") - else: - await page.keyboard.press("PageDown") - - # Randomized wait times - await page.wait_for_timeout(random.randint(1500, 3000)) - - # Move mouse randomly to appear more human-like - if attempt % 4 == 0: - await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) - - prev_blob_count = blob_count - attempt += 1 - - # Extra wait to ensure everything is loaded - await page.wait_for_timeout(5000) - - # Set up download event listener for the PDF - download_promise = page.wait_for_event("download") - - # Use jsPDF to generate PDF from loaded pages - 
logger.info("Generating PDF from loaded pages...") - result = await page.evaluate(r''' - (function() { - return new Promise((resolve, reject) => { - let script = document.createElement("script"); - script.onload = function () { - try { - let pdf = new jsPDF(); - let imgs = Array.from(document.getElementsByTagName("img")) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .sort((a, b) => { - const rectA = a.getBoundingClientRect(); - const rectB = b.getBoundingClientRect(); - return rectA.top - rectB.top; - }); - - console.log(`Found ${imgs.length} valid page images to add to PDF`); - - let added = 0; - for (let i = 0; i < imgs.length; i++) { - let img = imgs[i]; - let canvas = document.createElement("canvas"); - let ctx = canvas.getContext("2d"); - canvas.width = img.width; - canvas.height = img.height; - ctx.drawImage(img, 0, 0, img.width, img.height); - let imgData = canvas.toDataURL("image/jpeg", 1.0); - - if (added > 0) { - pdf.addPage(); - } - - pdf.addImage(imgData, 'JPEG', 0, 0); - added++; - } - - pdf.save("download.pdf"); - resolve({success: true, pageCount: added}); - } catch (error) { - reject({success: false, error: error.toString()}); - } - }; - - script.onerror = function() { - reject({success: false, error: "Failed to load jsPDF library"}); - }; - - script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; - document.body.appendChild(script); - }); - })(); - ''') - - if not result.get('success', False): - logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") - - # Try fallback approach - screenshot method - logger.info("Trying fallback screenshot method...") - return await self._pdf_screenshot_fallback(page, estimated_pages, save_path, temp_dir) - - logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") - - # Wait for the download and save it - download = await download_promise - await download.save_as(save_path) - - # Clean up temp directory - try: - os.rmdir(temp_dir) - except: - pass - - # Verify file exists and has content - if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: - logger.info(f"Successfully downloaded PDF to {save_path}") - return save_path - else: - logger.error(f"Generated file is too small or missing: {save_path}") - return None - - except Exception as e: - logger.error(f"Error in PDF download: {e}") - return None - - async def _pdf_screenshot_fallback(self, page, estimated_pages, save_path, temp_dir): - """Fallback method using screenshots for PDF creation""" - try: - # Navigate back to the first page - await page.evaluate(""" - () => { - // Find and click the "first page" button if available - const buttons = Array.from(document.querySelectorAll('button')); - const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); - if (firstPageBtn) firstPageBtn.click(); - } - """) - await page.wait_for_timeout(1000); - - # Create a PDF by taking screenshots of each page - screenshots = [] - current_page = 1 - max_pages = estimated_pages - - # Create a PDF using the reportlab package - while current_page <= max_pages: - screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") - - # Try to find the current page element - page_elem = await page.query_selector('.drive-viewer-paginated-page') - if page_elem: - await page_elem.screenshot(path=screenshot_path) - else: - # Fallback to full page screenshot - await page.screenshot(path=screenshot_path) - - screenshots.append(screenshot_path) - - # Try to navigate to next 
page - next_btn = await page.query_selector('button[aria-label="Next page"]') - if next_btn: - is_disabled = await next_btn.get_attribute('disabled') - if is_disabled: - logger.info(f"Reached end of document at page {current_page}") - break - - await next_btn.click() - await page.wait_for_timeout(1000) - current_page += 1 - else: - break - - # Create PDF from screenshots - if screenshots: - first_img = Image.open(screenshots[0]) - width, height = first_img.size - - c = canvas.Canvas(save_path, pagesize=(width, height)) - for screenshot in screenshots: - img = Image.open(screenshot) - c.drawImage(screenshot, 0, 0, width, height) - c.showPage() - c.save() - - # Clean up screenshots - for screenshot in screenshots: - os.remove(screenshot) - - return save_path - - return None - except Exception as e: - logger.error(f"Error in screenshot fallback: {e}") - return None - - async def _download_viewonly_other(self, page, file_id, file_type, save_path, temp_dir): - """Handle downloading non-PDF view-only files""" - try: - # Take a screenshot of the file - screenshot_path = os.path.join(temp_dir, "file.png") - await page.screenshot(path=screenshot_path) - - if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: - # For document types, try to export directly - success = await self._export_google_doc(file_id, file_type, save_path) - if success: - os.remove(screenshot_path) - return save_path - - # If export fails, fall back to screenshot - logger.warning(f"Export failed, falling back to screenshot for {file_type}") - - # For other types or if export failed, save the screenshot with appropriate extension - shutil.copy(screenshot_path, save_path) - os.remove(screenshot_path) - - return save_path if os.path.exists(save_path) else None - - except Exception as e: - logger.error(f"Error in non-PDF download: {e}") - return None - - async def _download_from_google_drive(self, url, save_path): - """Enhanced method to download from Google Drive with multiple fallback approaches""" - # Extract the file ID from different URL formats - file_id = self._extract_drive_file_id({"url": url}) - if not file_id: - logger.error(f"Could not extract file ID from URL: {url}") - return False - - # Determine file type first (important for handling different file types) - file_type, is_view_only = await self._get_google_drive_file_info(file_id) - logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") - - base, ext = os.path.splitext(save_path) - if not ext and file_type: - # Add the correct extension if missing - save_path = f"{base}.{file_type}" - - # For view-only files, use specialized approaches - if is_view_only: - # Approach 1: For PDFs, use the JS method - if file_type == 'pdf': - success = await self._download_viewonly_pdf_with_js(file_id, save_path) - if success: - return True - - # Approach 2: For Google Docs, Sheets, etc., use export API - if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: - success = await self._export_google_doc(file_id, file_type, save_path) - if success: - return True - - # Fallback to the main view-only method - result_path = await self._force_download_viewonly({ - 'url': url, - 'metadata': {'file_id': file_id, 'file_type': file_type, 'view_only': True} - }, save_path) - - return bool(result_path) - - # Try standard approaches for non-view-only files - try: - # Try direct download link first (fastest) - direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" - - # Add anti-bot headers - headers = { - 'User-Agent': 
get_random_user_agent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://drive.google.com/', - 'DNT': '1' - } - - # Try with streaming to handle larger files - with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: - if r.status_code == 200: - # Check if we got HTML instead of the file - content_type = r.headers.get('Content-Type', '') - if 'text/html' in content_type and not file_id.endswith('.html'): - logger.warning("Received HTML instead of file, trying with session cookies") - else: - # Looks like we got the actual file - with open(save_path, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # Verify file exists and has content - if os.path.exists(save_path) and os.path.getsize(save_path) > 0: - logger.info("Direct download successful") - return True - - # Try browser-based approach as last resort - try: - async with self.context.new_page() as page: - # Visit the file view page first to get cookies - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) - await page.wait_for_timeout(3000) - - # Set up download event listener - download_promise = page.wait_for_event("download") - - # Try to trigger the download button click - download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') - if download_button: - await download_button.click() - - # Wait for download to start - try: - download = await download_promise - await download.save_as(save_path) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - except Exception as e: - logger.error(f"Error during browser download: {e}") - return False - else: - # Try the export download URL - await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) - - # Look for and click any download buttons or links - download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') - for elem in download_elements: - try: - await elem.click() - # Wait a bit to see if download starts - try: - download = await download_promise - await download.save_as(save_path) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - except: - pass - except: - continue - except Exception as e: - logger.error(f"Browser-based download attempt failed: {e}") - - logger.warning("All standard download methods failed") - return False - except Exception as e: - logger.error(f"Error in Google Drive download: {e}") - return False - - async def _download_viewonly_pdf_with_js(self, file_id, save_path): - """Download view-only PDF using blob images and JS""" - try: - # Create a dedicated browser instance - browser = await self._create_stealth_browser() - page = await browser.new_page() - - try: - # Navigate to the file with human-like behavior - logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) - await page.wait_for_load_state('networkidle') - - # Perform human-like interactions - await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) - await page.wait_for_timeout(random.randint(2000, 5000)) - - # Estimate the number of pages - estimated_pages = await page.evaluate(""" - () => { - // Look for page counter in the interface - const pageCounters = 
Array.from(document.querySelectorAll('*')).filter(el => { - const text = el.textContent || ''; - return /\\d+\\s*\\/\\s*\\d+/.test(text); - }); - - if (pageCounters.length > 0) { - const text = pageCounters[0].textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) return parseInt(match[2]); - } - - // If we can't find a counter, check actual pages - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pages.length > 0) return pages.length; - - // Default to a reasonable number if we can't determine - return 50; - } - """) - - logger.info(f"Estimated number of pages: {estimated_pages}") - - # Initial scroll to trigger loading - logger.info("Initial scroll to bottom to trigger lazy loading...") - await page.keyboard.press("End") - await page.wait_for_timeout(3000) - - # Scroll through document with variety to appear natural - await self._natural_scroll_through_document(page, estimated_pages) - - # Set up download event listener - download_promise = page.wait_for_event("download") - - # Use jsPDF to generate PDF from loaded pages - logger.info("Generating PDF from loaded pages...") - result = await page.evaluate(r''' - (function() { - return new Promise((resolve, reject) => { - let script = document.createElement("script"); - script.onload = function () { - try { - let pdf = new jsPDF(); - let imgs = Array.from(document.getElementsByTagName("img")) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .sort((a, b) => { - const rectA = a.getBoundingClientRect(); - const rectB = b.getBoundingClientRect(); - return rectA.top - rectB.top; - }); - - console.log(`Found ${imgs.length} valid page images to add to PDF`); - - let added = 0; - for (let i = 0; i < imgs.length; i++) { - let img = imgs[i]; - let canvas = document.createElement("canvas"); - let ctx = canvas.getContext("2d"); - canvas.width = img.width; - canvas.height = img.height; - ctx.drawImage(img, 0, 0, img.width, img.height); - let imgData = canvas.toDataURL("image/jpeg", 1.0); - - if (added > 0) { - pdf.addPage(); - } - - pdf.addImage(imgData, 'JPEG', 0, 0); - added++; - } - - pdf.save("download.pdf"); - resolve({success: true, pageCount: added}); - } catch (error) { - reject({success: false, error: error.toString()}); - } - }; - - script.onerror = function() { - reject({success: false, error: "Failed to load jsPDF library"}); - }; - - script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; - document.body.appendChild(script); - }); - })(); - ''') - - if not result.get('success'): - logger.error(f"Error in PDF generation: {result.get('error')}") - return False - - logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") - - # Wait for the download to complete and save the file - download = await download_promise - - # Save the downloaded file to the specified path - await download.save_as(save_path) - logger.info(f"Successfully saved PDF to {save_path}") - - return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 - - finally: - await browser.close() - - except Exception as e: - logger.error(f"Error in viewonly PDF download process: {e}") - return False - - async def _natural_scroll_through_document(self, page, estimated_pages): - """Scroll through document in a natural way to load all pages""" - logger.info("Scrolling through document to load all pages...") - max_attempts = min(estimated_pages * 3, 300) - attempt = 0 - prev_blob_count = 0 - consecutive_same_count = 0 - - while attempt < 
max_attempts: - # Count blob images (which are the PDF pages) - blob_count = await page.evaluate(""" - Array.from(document.getElementsByTagName('img')) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .length - """) - - logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") - - # Check if we've loaded all pages or if we're stuck - if blob_count >= estimated_pages: - logger.info(f"All {estimated_pages} pages appear to be loaded.") - break - - if blob_count == prev_blob_count: - consecutive_same_count += 1 - if consecutive_same_count >= 5 and blob_count > 0: - logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") - break - else: - consecutive_same_count = 0 - - # Mix up the scrolling approach for more human-like behavior - scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) - - if scroll_action == "PageDown": - await page.keyboard.press("PageDown") - elif scroll_action == "End": - await page.keyboard.press("End") - elif scroll_action == "ArrowDown": - # Press arrow down multiple times - for _ in range(random.randint(5, 15)): - await page.keyboard.press("ArrowDown") - await page.wait_for_timeout(random.randint(50, 150)) - else: # mouse - # Scroll using mouse wheel - current_y = random.randint(300, 700) - await page.mouse.move(x=random.randint(300, 800), y=current_y) - await page.mouse.wheel(0, random.randint(300, 800)) - - # Random wait between scrolls - await page.wait_for_timeout(random.randint(1000, 3000)) - - prev_blob_count = blob_count - attempt += 1 - - # Extra wait to ensure everything is fully loaded - await page.wait_for_timeout(5000) - - async def _export_google_doc(self, file_id, file_type, save_path): - """Export Google Docs/Sheets/Slides to downloadable formats""" - try: - # Map file types to export formats - export_urls = { - 'doc': f"https://docs.google.com/document/d/{file_id}/export?format=doc", - 'docx': f"https://docs.google.com/document/d/{file_id}/export?format=docx", - 'sheet': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", - 'xlsx': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", - 'ppt': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", - 'pptx': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", - 'pdf': f"https://docs.google.com/document/d/{file_id}/export?format=pdf" - } - - export_url = export_urls.get(file_type, f"https://docs.google.com/document/d/{file_id}/export?format=pdf") - - async with self.context.new_page() as page: - # Get cookies from the main view page first - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') - - # Now try the export - response = await page.goto(export_url, wait_until='networkidle') - - if response.status == 200: - content = await response.body() - with open(save_path, 'wb') as f: - f.write(content) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - else: - logger.warning(f"Export failed with status {response.status}") - return False - - except Exception as e: - logger.error(f"Error exporting Google Doc: {e}") - return False - - async def _get_google_drive_file_info(self, file_id): - """Get file type and view-only status from Google Drive""" - file_type = None - is_view_only = False - - try: - async with self.context.new_page() as page: - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) - - # Check if view-only - view_only_text = 
await page.query_selector('text="the owner has not granted you permission to download this file"') - is_view_only = view_only_text is not None - - # Check for Google Docs viewer - gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') - gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') - gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') - - if gdocs_viewer: - file_type = 'docx' - elif gsheets_viewer: - file_type = 'xlsx' - elif gslides_viewer: - file_type = 'pptx' - else: - # Check for PDF viewer - pdf_viewer = await page.query_selector('embed[type="application/pdf"]') - if pdf_viewer: - file_type = 'pdf' - else: - # Check for image viewer - img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') - if img_viewer: - # Get image type from src - img_src = await img_viewer.get_attribute('src') - if 'jpg' in img_src or 'jpeg' in img_src: - file_type = 'jpg' - elif 'png' in img_src: - file_type = 'png' - else: - file_type = 'jpg' # Default to jpg - else: - # Generic file type fallback - file_type = 'pdf' # Default to PDF - - # If still no type, check filename - if not file_type: - title_element = await page.query_selector('div[role="heading"]') - if title_element: - title = await title_element.text_content() - if title: - ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) - if ext_match: - file_type = ext_match.group(1).lower() - - except Exception as e: - logger.error(f"Error getting Google Drive file info: {e}") - file_type = 'pdf' # Default to PDF if we can't determine - - return file_type, is_view_only - - # IMPROVED: Enhanced sublink extraction method - async def get_sublinks(self, url, limit=10000): - """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" - links = set() - try: - logger.info(f"Fetching sublinks from: {url}") - - # Check if this is a direct download link - if is_download_link(url): - logger.info(f"URL appears to be a direct download link: {url}") - links.add(url) - return list(links)[:limit] - - # Skip if we've already visited this URL - normalized_url = normalize_download_url(url) - if normalized_url in self.visited_urls: - logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") - return list(links)[:limit] - - # Add to visited URLs - self.visited_urls.add(normalized_url) - - # Special handling for educational sites like phsms.cloud.ncnu.edu.tw - if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): - logger.info("Using specialized exam site sublink extraction") - edu_links = await self.get_edu_exam_links(url) - for link in edu_links: - links.add(link) - - # If we found a good number of links with the specialized method, return them - if len(links) > 5: - logger.info(f"Found {len(links)} sublinks with specialized method") - return list(links)[:limit] - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Standard sublink extraction for all sites - try: - await self.page.goto(url, timeout=30000, wait_until='networkidle') - except Exception as e: - logger.warning(f"Error navigating to URL for sublink extraction: {e}") - # Continue with what we have, we'll try to extract links anyway - - # Get base URL for resolving relative links - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - path_base = os.path.dirname(parsed_base.path) - - # Perform 
initial scrolling to load lazy content - await self.page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const height = document.body.scrollHeight; - const step = Math.floor(window.innerHeight / 2); - - for (let i = 0; i < height; i += step) { - window.scrollTo(0, i); - await delay(150); - } - - window.scrollTo(0, 0); - } - """) - await self.page.wait_for_timeout(1000) - - # Check if page has ASP.NET elements which might need special handling - is_aspnet = await self.page.evaluate(''' - () => { - return document.querySelector('form#aspnetForm') !== null || - document.querySelector('input[name="__VIEWSTATE"]') !== null; - } - ''') - - if is_aspnet: - logger.info("Detected ASP.NET page, using enhanced extraction method") - - # Try to interact with ASP.NET controls that might reveal more links - # Look for dropdowns, buttons, and grid elements - dropdowns = await self.page.query_selector_all('select') - buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') - - # Try interacting with dropdowns first - for dropdown in dropdowns: - try: - # Get all options - options = await self.page.evaluate(''' - (dropdown) => { - return Array.from(dropdown.options).map(o => o.value); - } - ''', dropdown) - - # Try selecting each option - for option in options: - if option: - await dropdown.select_option(value=option) - await self.page.wait_for_timeout(1000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error interacting with dropdown: {e}") - - # Try clicking buttons (but avoid dangerous ones like "delete") - safe_buttons = [] - for button in buttons: - button_text = await button.text_content() or "" - button_value = await button.get_attribute("value") or "" - button_id = await button.get_attribute("id") or "" - combined_text = (button_text + button_value + button_id).lower() - - # Skip potentially destructive buttons - if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): - continue - - # Prioritize buttons that might show more content - if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): - safe_buttons.append(button) - - # Click the safe buttons - for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks - try: - await button.click() - await self.page.wait_for_timeout(1000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error clicking button: {e}") - - # Extract links from the initial page state - await self.extract_all_link_types(links, base_url, path_base) - - # Look specifically for links inside grid/table views which are common in ASP.NET applications - grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') - for cell in grid_cells: - try: - href = await cell.get_attribute('href') - if href: - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links.add(full_url) - except Exception as e: - logger.warning(f"Error extracting grid link: {e}") - - # Extract links from onclick attributes and javascript:__doPostBack calls - postback_links = await 
self.page.evaluate(''' - () => { - const results = []; - // Find elements with onclick containing __doPostBack - const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); - for (const el of elements) { - // Extract the postback target - const onclick = el.getAttribute('onclick') || ''; - const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); - if (match && match[1]) { - // Get the visible text to use as description - const text = el.innerText || el.textContent || 'Link'; - results.push({ - id: match[1], - text: text.trim() - }); - } - } - return results; - } - ''') - - # Try interacting with some of the postback links - for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions - try: - logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") - await self.page.evaluate(f''' - () => {{ - if (typeof __doPostBack === 'function') {{ - __doPostBack('{postback["id"]}', ''); - }} - }} - ''') - await self.page.wait_for_timeout(1500) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error with postback: {e}") - - # Look for pagination controls and try to navigate through them - pagination_elements = await self.page.query_selector_all( - 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' - ) - - # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops) - for i in range(min(5, len(pagination_elements))): - try: - # Focus on elements that look like "next page" buttons - el = pagination_elements[i] - el_text = await el.text_content() or "" - - # Only click if this looks like a pagination control - if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip(): - logger.info(f"Clicking pagination control: {el_text}") - await el.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Get new links from this page - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error clicking pagination: {e}") - - # Check for hidden links that might be revealed by JavaScript - hidden_links = await self.page.evaluate(""" - () => { - // Try to execute common JavaScript patterns that reveal hidden content - try { - // Common patterns used in websites to initially hide content - const hiddenContainers = document.querySelectorAll( - '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' - ); - - // Attempt to make them visible - hiddenContainers.forEach(el => { - el.style.display = 'block'; - el.style.visibility = 'visible'; - el.classList.remove('hidden', 'hide'); - }); - - // Return any newly visible links - return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); - } catch (e) { - return []; - } - } - """) - - # Add any newly discovered links - for href in hidden_links: - if href and not href.startswith('javascript:'): - links.add(href) - - # Find all download links - download_links = await self.page.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')) - .filter(a => { - const href = a.href.toLowerCase(); - return href.includes('download') || - href.includes('file') || - href.includes('get') || - href.includes('view.php') || - href.includes('action=') || - href.includes('fname='); - }) - .map(a => a.href); - } - """) - - for 
download_link in download_links: - links.add(download_link) - - # Also check for hidden links in JavaScript, iframes, or dynamic content - js_links = await self.discover_hidden_links(self.page) - for link in js_links: - links.add(link) - - logger.info(f"Found {len(links)} sublinks") - - # Prioritize download links - prioritized_links = [] - normal_links = [] - - for link in links: - if is_download_link(link): - prioritized_links.append(link) - else: - normal_links.append(link) - - # Return prioritized links first, then normal links, up to the limit - result = prioritized_links + normal_links - return result[:limit] - - except Exception as e: - logger.error(f"Error getting sublinks from {url}: {e}") - return list(links)[:limit] # Return what we have so far - - async def extract_all_link_types(self, links_set, base_url, path_base): - """Extract all types of links from the current page""" - # Get all tag links - a_links = await self.page.query_selector_all('a[href]') - for a in a_links: - try: - href = await a.get_attribute('href') - if href and not href.startswith('javascript:') and not href.startswith('#'): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Get iframe sources - iframes = await self.page.query_selector_all('iframe[src]') - for iframe in iframes: - try: - src = await iframe.get_attribute('src') - if src and not src.startswith('javascript:') and not src.startswith('about:'): - full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Get links from onclick attributes that reference URLs - onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') - for el in onclick_elements: - try: - onclick = await el.get_attribute('onclick') - urls = re.findall(r'(https?://[^\'"]+)', onclick) - for url in urls: - links_set.add(url) - except Exception: - pass - - # Look for URLs in data-* attributes - data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') - for el in data_elements: - for attr in ['data-url', 'data-href', 'data-src']: - try: - value = await el.get_attribute(attr) - if value and not value.startswith('javascript:'): - full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Look for special anchor links that might not have href attributes - special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') - for anchor in special_anchors: - try: - href = await anchor.get_attribute('href') - if href and not href.startswith('javascript:') and not href.startswith('#'): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Extract links from JSON data embedded in the page - script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') - for script in script_elements: - try: - script_content = await script.text_content() - if script_content: - # Look for URLs in the JSON content - urls = re.findall(r'(https?://[^\'"]+)', script_content) - for url in urls: - links_set.add(url) - except Exception: - pass - - def resolve_relative_url(self, relative_url, base_url, path_base): - 
"""Properly resolve relative URLs considering multiple formats""" - if relative_url.startswith('/'): - # Absolute path relative to domain - return f"{base_url}{relative_url}" - elif relative_url.startswith('./'): - # Explicit relative path - return f"{base_url}{path_base}/{relative_url[2:]}" - elif relative_url.startswith('../'): - # Parent directory - parent_path = '/'.join(path_base.split('/')[:-1]) - return f"{base_url}{parent_path}/{relative_url[3:]}" - else: - # Regular relative path - return f"{base_url}{path_base}/{relative_url}" - - async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): - """Perform a deep search for files at the URL and its sublinks""" - import streamlit as st - - if not custom_ext_list: - custom_ext_list = [] - progress_text = st.empty() - progress_bar = st.progress(0) - file_count_text = st.empty() - - try: - # Reset the visited URLs for a fresh deep search - self.visited_urls = set() - - progress_text.text("🔍 Analyzing main page...") - # Special handling for ASP.NET pages - is_aspnet = False - try: - await self.page.goto(url, timeout=30000, wait_until='networkidle') - is_aspnet = await self.page.evaluate(''' - () => { - return document.querySelector('form#aspnetForm') !== null || - document.querySelector('input[name="__VIEWSTATE"]') !== null; - } - ''') - except Exception: - pass - - # Check if this URL is a direct download - if is_download_link(url): - progress_text.text("📥 URL appears to be a direct download. Processing...") - - # Try to extract file directly - normalized_url = normalize_download_url(url) - file_info = { - 'url': normalized_url, - 'download_url': normalized_url, - 'filename': os.path.basename(urlparse(normalized_url).path) or 'download', - 'size': 'Unknown Size', - 'metadata': {} - } - - # Add to visited URLs - self.visited_urls.add(normalized_url) - progress_bar.progress(1.0) - return [file_info] - - # Extract files from main page - progress_text.text("📄 Extracting files from main page...") - main_files = await self.extract_downloadable_files(url, custom_ext_list) - initial_count = len(main_files) - file_count_text.text(f"Found {initial_count} files on main page") - - # Get sublinks with enhanced method - progress_text.text("🔗 Getting sublinks...") - sublinks = await self.get_sublinks(url, sublink_limit) - total_links = len(sublinks) - progress_text.text(f"Found {total_links} sublinks to process") - - # Always include files from the main page, regardless of sublinks - all_files = main_files - - if not sublinks: - progress_bar.progress(1.0) - return all_files - - # Process each sublink - for i, sublink in enumerate(sublinks, 1): - progress = i / total_links - progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") - progress_bar.progress(progress) - - try: - # Check if this is a direct download link - if is_download_link(sublink): - # For download links, just add the link directly - normalized_url = normalize_download_url(sublink) - - # Skip if already visited - if normalized_url in self.visited_urls: - continue - - # Mark as visited - self.visited_urls.add(normalized_url) - - # Get file size if possible - size_str = await self.get_file_size(normalized_url) - - # Get filename, with fallback to domain-based name - filename = os.path.basename(urlparse(normalized_url).path) - if not filename or filename == '/' or '?' 
in filename: - domain = get_domain(normalized_url) - ext = '.pdf' # Default extension - for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']: - if common_ext in normalized_url.lower(): - ext = common_ext - break - filename = f"file_from_{domain}{ext}" - - # Add file to results - all_files.append({ - 'url': normalized_url, - 'download_url': normalized_url, - 'filename': filename, - 'size': size_str, - 'metadata': {} - }) - file_count_text.text(f"Found {len(all_files)} total files") - continue - - # For regular links, use a longer timeout for ASP.NET pages which can be slower - sub_timeout = timeout * 2 if is_aspnet else timeout - - # Skip already visited URLs - if sublink in self.visited_urls: - continue - - # Extract files from sublink - sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) - all_files.extend(sub_files) - file_count_text.text(f"Found {len(all_files)} total files") - except Exception as e: - logger.warning(f"Error processing sublink {sublink}: {e}") - - # Deduplicate files - seen_urls = set() - unique_files = [] - for f in all_files: - if f['url'] not in seen_urls: - seen_urls.add(f['url']) - unique_files.append(f) - - final_count = len(unique_files) - progress_text.text(f"✅ Deep search complete!") - file_count_text.text(f"Found {final_count} unique files") - progress_bar.progress(1.0) - return unique_files - - except Exception as e: - logger.error(f"Deep search error: {e}") - progress_text.text(f"⚠️ Error during deep search: {str(e)}") - return [] - - finally: - await asyncio.sleep(2) - if not st.session_state.get('keep_progress', False): - progress_text.empty() - progress_bar.empty() \ No newline at end of file
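
# Illustrative usage sketch, not taken from the deleted module itself: a minimal
# way the DownloadManager removed above could be driven through deep_search(),
# which returns a list of file-info dicts ({'url', 'download_url', 'filename',
# 'size', 'metadata'}).  The starting URL and extension list are hypothetical,
# and deep_search() assumes a Streamlit session for its progress widgets, so
# running it outside a Streamlit app may only produce warnings from those calls.
import asyncio

async def main():
    async with DownloadManager(use_stealth=True) as dm:
        files = await dm.deep_search(
            "https://example.edu/pastpapers",    # hypothetical starting URL
            custom_ext_list=[".pdf", ".docx"],   # extra extensions to keep
            sublink_limit=200,
            timeout=60,
        )
        for info in files:
            print(info["filename"], info["size"], info["url"])

if __name__ == "__main__":
    asyncio.run(main())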
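
# For comparison only: resolve_relative_url() above hand-rolls relative-URL
# resolution.  The standard library's urljoin (already imported at the top of
# the deleted module) covers the same cases, plus edge cases such as
# scheme-relative and query-only references.  The page URL is hypothetical.
from urllib.parse import urljoin

page_url = "https://example.edu/exams/2023/index.aspx"

print(urljoin(page_url, "/files/a.pdf"))   # absolute path  -> https://example.edu/files/a.pdf
print(urljoin(page_url, "./b.pdf"))        # explicit ./    -> https://example.edu/exams/2023/b.pdf
print(urljoin(page_url, "../old/c.pdf"))   # parent dir     -> https://example.edu/exams/old/c.pdf
print(urljoin(page_url, "d.pdf"))          # plain relative -> https://example.edu/exams/2023/d.pdf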
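
# Sketch of the export endpoint _export_google_doc() relies on, using plain
# requests instead of the browser context.  This only works for files the
# anonymous user may export; otherwise Google returns an HTML sign-in or
# permission page rather than the document bytes.  try_export() is a
# hypothetical helper, not a function from the deleted module.
import requests

def try_export(file_id: str, out_path: str) -> bool:
    # Same URL pattern the module builds for Google Docs -> .docx exports.
    url = f"https://docs.google.com/document/d/{file_id}/export?format=docx"
    resp = requests.get(url, timeout=60)
    if resp.status_code == 200 and "text/html" not in resp.headers.get("Content-Type", ""):
        with open(out_path, "wb") as f:
            f.write(resp.content)
        return True
    return False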