diff --git "a/app/download_manager.py" "b/app/download_manager.py" new file mode 100644--- /dev/null +++ "b/app/download_manager.py" @@ -0,0 +1,2932 @@ +import os +import re +import random +import asyncio +import logging +import traceback +import tempfile +import shutil +import json +import time +from urllib.parse import urlparse, urljoin, unquote, parse_qs +from io import BytesIO +from bs4 import BeautifulSoup +import PyPDF2 +import requests +from PIL import Image +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas +from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError + +from app.utils import ( + get_random_user_agent, sizeof_fmt, get_domain, is_download_link, + normalize_download_url, detect_captcha, USER_AGENTS, STEALTH_SETTINGS, + PROXY_ROTATION_CONFIG +) + +logger = logging.getLogger(__name__) + +class DownloadManager: + def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): + self.use_proxy = use_proxy + self.proxy = proxy + self.query = query + self.num_results = num_results + self.playwright = None + self.browser = None + self.context = None + self.page = None + self.use_stealth = use_stealth + self.proxy_rotation = proxy_rotation + self.request_count = 0 + self.captcha_detected = False + self.download_timeout = 300 # 5 minutes timeout for downloads + # Track visited URLs to avoid revisiting the same URL multiple times + self.visited_urls = set() + # Track successfully downloaded files to avoid redownloading + self.downloaded_files = set() + + async def __aenter__(self): + self.playwright = await async_playwright().start() + + # Prepare browser args with stealth settings + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--no-zygote', + '--single-process', + '--disable-web-security', + '--disable-features=IsolateOrigins', + '--disable-site-isolation-trials' + ] + + # Add stealth-specific args + if self.use_stealth: + browser_args.extend([ + '--disable-blink-features=AutomationControlled', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-webgl', + '--disable-webrtc' + ]) + + # Setup browser options + opts = { + "headless": True, + "args": browser_args + } + + # Configure proxy if specified + if self.use_proxy and self.proxy: + opts["proxy"] = {"server": self.proxy} + + # Launch browser with options + self.browser = await self.playwright.chromium.launch(**opts) + + # Setup browser context with enhanced settings + context_opts = { + "user_agent": get_random_user_agent(), + "viewport": {"width": 1920, "height": 1080}, + "device_scale_factor": 1, + "has_touch": False, + "is_mobile": False, + "ignore_https_errors": True, + "accept_downloads": True + } + + # Apply stealth-specific settings to the context + if self.use_stealth: + # Apply JS-injection for enhanced stealth + context_opts["bypass_csp"] = True + self.context = await self.browser.new_context(**context_opts) + + # Execute stealth JS to avoid detection + await self.context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Change navigator properties + const newProto = navigator.__proto__; + delete newProto.webdriver; + + // Overwrite the plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + // Handle languages more naturally + 
Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + + // Modify hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 4 + }); + + // Modify deviceMemory + Object.defineProperty(navigator, 'deviceMemory', { + get: () => 8 + }); + + // WebGL modifications + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) { + return 'Intel Inc.'; + } + if (parameter === 37446) { + return 'Intel Iris OpenGL Engine'; + } + return getParameter.apply(this, arguments); + }; + } + """) + else: + # Regular context without stealth + self.context = await self.browser.new_context(**context_opts) + + # Create page with enhanced headers + self.page = await self.context.new_page() + await self.page.set_extra_http_headers({ + 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Cache-Control': 'max-age=0', + 'DNT': '1', # Do Not Track + 'Referer': 'https://www.google.com/', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-User': '?1', + 'Upgrade-Insecure-Requests': '1' + }) + + # Add delay for mouse movements to simulate human behavior + if self.use_stealth: + await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) + await self.page.wait_for_timeout(random.randint(200, 500)) + + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self.browser: + await self.browser.close() + if self.playwright: + await self.playwright.stop() + + async def rotate_proxy_if_needed(self): + """Rotate proxy if proxy rotation is enabled and threshold is reached""" + if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: + self.request_count += 1 + if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: + # Get next proxy from the pool + next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) + PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list + + # Close existing context and create new one with the new proxy + if self.context: + await self.context.close() + + # Create new context with the new proxy + context_opts = { + "user_agent": get_random_user_agent(), + "proxy": {"server": next_proxy}, + "accept_downloads": True + } + self.context = await self.browser.new_context(**context_opts) + self.page = await self.context.new_page() + + # Reset counter + self.request_count = 0 + logger.info(f"Rotated to new proxy: {next_proxy}") + + async def handle_captcha(self, page): + """Detect and handle captchas if possible""" + # Check for common captcha patterns + content = await page.content() + if detect_captcha(content): + self.captcha_detected = True + logger.warning("Captcha detected on page") + + # Strategies for handling captchas: + # 1. 
For simple captchas, try to extract the image and solve it + captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') + if captcha_img: + logger.info("Found captcha image, attempting to capture") + + # Take screenshot of the captcha + captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") + await captcha_img.screenshot(path=captcha_path) + + # In a real implementation, you would send this to a captcha solving service + # For now, just log the detection + logger.info(f"Captcha image saved to {captcha_path}") + + # For demonstration, we'll notify the user but not actually solve it + return False + + # 2. For reCAPTCHA, special handling would be required + recaptcha = await page.query_selector('iframe[src*="recaptcha"]') + if recaptcha: + logger.warning("reCAPTCHA detected, would require external solving service") + return False + + # 3. Try to perform human-like actions that might bypass simple bot checks + await self.perform_human_actions(page) + + # Check if captcha is still present + content = await page.content() + if detect_captcha(content): + logger.warning("Captcha still present after human-like actions") + return False + else: + logger.info("Captcha appears to be resolved") + return True + + return True # No captcha detected + + async def perform_human_actions(self, page): + """Perform human-like actions on the page to possibly bypass simple bot checks""" + try: + # 1. Slowly scroll down the page + for i in range(3): + await page.evaluate(f"window.scrollTo(0, {i * 300})") + await page.wait_for_timeout(random.randint(300, 700)) + + # 2. Random mouse movements + for _ in range(3): + x = random.randint(100, 800) + y = random.randint(100, 600) + await page.mouse.move(x=x, y=y) + await page.wait_for_timeout(random.randint(200, 500)) + + # 3. Click on a non-essential part of the page + try: + await page.click("body", position={"x": 50, "y": 50}) + except: + pass + + # 4. Wait a bit before continuing + await page.wait_for_timeout(1000) + + except Exception as e: + logger.warning(f"Error during human-like actions: {e}") + + async def search_bing(self): + urls = [] + try: + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + search_url = f"https://www.bing.com/search?q={self.query}" + await self.page.goto(search_url, timeout=30000) + await self.page.wait_for_load_state('networkidle') + + # Check for captchas + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected during search, results may be limited") + + # More natural scrolling behavior + for i in range(3): + await self.page.evaluate(f"window.scrollTo(0, {i * 400})") + await self.page.wait_for_timeout(random.randint(300, 800)) + + # Extract search results + links = await self.page.query_selector_all("li.b_algo h2 a") + for link in links[:self.num_results]: + href = await link.get_attribute('href') + if href: + urls.append(href) + + # If we didn't find enough results, try an alternative selector + if len(urls) < self.num_results: + alt_links = await self.page.query_selector_all(".b_caption a") + for link in alt_links: + href = await link.get_attribute('href') + if href and href not in urls: + urls.append(href) + if len(urls) >= self.num_results: + break + + return urls + except Exception as e: + logger.error(f"Error searching Bing: {e}") + return [] + + async def get_file_size(self, url): + try: + await self.rotate_proxy_if_needed() + + # For complex download URLs, we need to be careful with HEAD requests + if '?' 
in url or 'Action=downloadfile' in url or 'fname=' in url: + # For these URLs, we'll try a more reliable approach using range headers + headers = { + 'User-Agent': get_random_user_agent(), + 'Range': 'bytes=0-0' # Just request the first byte to check headers + } + + try: + with requests.get(url, headers=headers, stream=True, timeout=10) as r: + if 'Content-Range' in r.headers: + content_range = r.headers['Content-Range'] + match = re.search(r'bytes 0-0/(\d+)', content_range) + if match: + size = int(match.group(1)) + return sizeof_fmt(size) + + if 'Content-Length' in r.headers: + size = int(r.headers['Content-Length']) + # If size is 1, it's likely just our single requested byte + if size > 1: + return sizeof_fmt(size) + except Exception as e: + logger.warning(f"Error getting file size with Range request: {e}") + + # Fallback to browser approach + try: + async with self.context.new_page() as page: + response = await page.request.head(url, timeout=15000) + length = response.headers.get('Content-Length', None) + if length: + return sizeof_fmt(int(length)) + except Exception as e: + logger.warning(f"Error getting file size with browser: {e}") + + return "Unknown Size" + else: + # Standard approach for normal URLs + async with self.context.new_page() as page: + response = await page.request.head(url, timeout=15000) + length = response.headers.get('Content-Length', None) + if length: + return sizeof_fmt(int(length)) + else: + return "Unknown Size" + except Exception as e: + logger.warning(f"Error getting file size: {e}") + return "Unknown Size" + + async def get_pdf_metadata(self, url): + try: + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + resp = await page.request.get(url, timeout=15000) + if resp.ok: + content = await resp.body() + pdf = BytesIO(content) + reader = PyPDF2.PdfReader(pdf) + return { + 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', + 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', + 'Pages': len(reader.pages), + } + else: + return {} + except Exception as e: + logger.warning(f"Error reading PDF metadata: {e}") + return {} + + async def extract_real_download_url(self, url): + """Enhanced method to extract real download URL, handling complex URLs""" + try: + # Check if this is a complex download URL that needs special handling + if 'Action=downloadfile' in url or 'fname=' in url: + logger.info(f"Complex download URL detected: {url}") + + # For these special cases, we'll use the browser to navigate and intercept redirects + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + # Set up request interception to capture redirects + await page.route('**', lambda route: route.continue_()) + + # Listen for all responses + responses = [] + page.on('response', lambda response: responses.append(response)) + + try: + # Go to the URL + await page.goto(url, wait_until='networkidle', timeout=30000) + + # Check all responses for potential downloads + for response in responses: + # Look for content-disposition headers indicating a download + content_disposition = response.headers.get('Content-Disposition', '') + if 'attachment' in content_disposition or 'filename=' in content_disposition: + return response.url + + # Look for content-type headers indicating a file + content_type = response.headers.get('Content-Type', '') + if content_type and content_type != 'text/html' and not content_type.startswith('text/'): + return response.url + + # If no clear download was 
detected, return the final URL + return page.url + except Exception as e: + logger.warning(f"Error extracting real download URL: {e}") + return url + else: + # Standard approach for normal URLs + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + response = await page.goto(url, wait_until='networkidle', timeout=30000) + if response and response.headers.get('location'): + return response.headers['location'] + return page.url + except Exception as e: + logger.error(f"Error extracting real download URL: {e}") + return url + + # IMPROVED: Enhanced exam links extraction method + async def get_edu_exam_links(self, url): + """Specialized method for educational exam websites that follows a common pattern.""" + try: + logger.info(f"Fetching exam links from {url}") + links = set() + + # First try with direct requests for speed (but with proper headers) + headers = { + "User-Agent": get_random_user_agent(), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Referer": "https://www.google.com/", + "DNT": "1" + } + + try: + response = requests.get(url, headers=headers, timeout=30) + + if response.status_code == 200: + # Parse with BeautifulSoup first for efficiency + soup = BeautifulSoup(response.text, "html.parser") + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + + # Look for all links + for a in soup.find_all("a", href=True): + href = a["href"] + full_url = urljoin(url, href) + + # Look for text clues + link_text = a.get_text().lower() + + # Special patterns for exam sites (expanded list) + url_patterns = [ + "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", + "/test/", "/download/", "/files/", "/assignments/", + "paper_", "question_", "exam_", "test_", "past_", + "assignment_", "sample_", "study_material", "notes_", + "/resource/", "/subject/", "/course/", "/material/" + ] + + text_patterns = [ + "exam", "paper", "test", "question", "past", "download", + "assignment", "sample", "study", "material", "notes", + "subject", "course", "resource", "pdf", "document", + "view", "open", "get", "solution", "answer" + ] + + # Check URL for patterns + if any(pattern in full_url.lower() for pattern in url_patterns): + links.add(full_url) + continue + + # Check link text for patterns + if any(pattern in link_text for pattern in text_patterns): + links.add(full_url) + continue + + # Check for common file extensions + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(full_url) + + # Check for download script parameters + if "Action=downloadfile" in url or "fname=" in url: + links.add(url) # Add the URL itself as it's a download link + except Exception as e: + logger.warning(f"Request-based extraction failed: {e}") + + # Browser-based approach for more thorough extraction or if initial approach was inadequate + try: + # Check if we need to proceed with browser-based extraction + if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: + logger.info("Using browser for enhanced link extraction") + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Navigate to the page with more natural timing + await self.page.goto(url, timeout=45000, wait_until='networkidle') + await self.page.wait_for_timeout(random.randint(1000, 2000)) + + # Handle captchas if present + if not await self.handle_captcha(self.page): + logger.warning("Captcha 
detected, extraction may be limited") + + # Get base URL for resolving relative links + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + + # Perform natural scrolling to trigger lazy-loaded content + page_height = await self.page.evaluate("document.body.scrollHeight") + viewport_height = await self.page.evaluate("window.innerHeight") + + for scroll_pos in range(0, page_height, viewport_height // 2): + await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") + await self.page.wait_for_timeout(random.randint(300, 800)) + + # Scroll back to top + await self.page.evaluate("window.scrollTo(0, 0)") + await self.page.wait_for_timeout(500) + + # Extract all links with Playwright (better than just anchor tags) + all_links = await self.page.evaluate(""" + () => { + const results = []; + + // Get all anchor tags + const anchors = document.querySelectorAll('a[href]'); + for (const a of anchors) { + if (a.href) { + results.push({ + href: a.href, + text: a.innerText || a.textContent || '', + isButton: a.classList.contains('btn') || a.role === 'button' + }); + } + } + + // Get buttons that might contain links + const buttons = document.querySelectorAll('button'); + for (const btn of buttons) { + const onclick = btn.getAttribute('onclick') || ''; + if (onclick.includes('window.location') || onclick.includes('download')) { + results.push({ + href: '#button', + text: btn.innerText || btn.textContent || '', + isButton: true, + onclick: onclick + }); + } + } + + return results; + } + """) + + # Process the extracted links + for link_info in all_links: + href = link_info.get('href', '') + text = link_info.get('text', '').lower() + + if href and href != '#button': + # Check URL patterns + url_patterns = [ + "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", + "/test/", "/download/", "/files/", "/assignments/", + "paper_", "question_", "exam_", "test_", "past_", + "assignment_", "sample_", "study_material", "notes_" + ] + + # Check text patterns + text_patterns = [ + "exam", "paper", "test", "question", "past", "download", + "assignment", "sample", "study", "material", "notes", + "pdf", "document", "view", "open", "solution" + ] + + if any(pattern in href.lower() for pattern in url_patterns) or \ + any(pattern in text for pattern in text_patterns) or \ + any(href.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(href) + + # Check for download links in the page + download_links = await self.page.evaluate(""" + () => { + // Find all links that might be download links + const links = Array.from(document.querySelectorAll('a[href]')); + return links + .filter(a => { + const href = a.href.toLowerCase(); + return href.includes('download') || + href.includes('getfile') || + href.includes('view.php') || + href.includes('action=downloadfile') || + href.includes('fname='); + }) + .map(a => a.href); + } + """) + + for dl_link in download_links: + links.add(dl_link) + + # Check for ASP.NET specific elements that might contain exam links + grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') + for grid in grid_elements: + grid_links = await grid.query_selector_all('a[href]') + for a in grid_links: + href = await a.get_attribute('href') + text = await a.text_content() + + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + links.add(full_url) + + # Try clicking pagination controls to reveal more content + 
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') + for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons + try: + # Check if this is a numeric pagination button (more likely to be useful) + button_text = await button.text_content() + if button_text and button_text.strip().isdigit(): + logger.info(f"Clicking pagination button: {button_text}") + await button.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Extract links from this page + new_page_links = await self.page.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); + } + """) + + for href in new_page_links: + if href and not href.startswith('javascript:'): + if any(pattern in href.lower() for pattern in url_patterns) or \ + any(href.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(href) + except Exception as e: + logger.warning(f"Error clicking pagination button: {e}") + + # Try clicking any controls that might reveal more exam links (more focused approach) + show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') + for button in show_buttons: + button_text = (await button.text_content() or "").lower() + button_value = (await button.get_attribute("value") or "").lower() + button_id = (await button.get_attribute("id") or "").lower() + + # Look for buttons that seem likely to reveal file lists + promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", + "download", "resource", "material", "browse", "file"] + + if any(term in button_text or term in button_value or term in button_id + for term in promising_terms): + try: + logger.info(f"Clicking button: {button_text or button_value}") + await button.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Get any new links that appeared + new_links = await self.page.query_selector_all('a[href]') + for a in new_links: + href = await a.get_attribute('href') + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + + # Focus on file extensions and patterns + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ + any(pattern in full_url.lower() for pattern in url_patterns): + links.add(full_url) + except Exception as e: + logger.warning(f"Error clicking button: {e}") + + # Special handling for ASP.NET PostBack links + try: + # Find and interact with ASP.NET __doPostBack elements + postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') + for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks + try: + onclick = await element.get_attribute('onclick') + if onclick and '__doPostBack' in onclick: + element_text = await element.text_content() + + # Only interact with elements that seem likely to contain exam links + promising_terms = ["show", "view", "list", "exam", "paper", "test", + "download", "resource", "material"] + + if any(term in element_text.lower() for term in promising_terms): + logger.info(f"Clicking ASP.NET postback element: {element_text}") + + # Click the element + await element.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Extract any new links + new_links = await 
self.page.query_selector_all('a[href]') + for a in new_links: + href = await a.get_attribute('href') + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(full_url) + except Exception as e: + logger.warning(f"Error interacting with postback element: {e}") + except Exception as e: + logger.warning(f"Error during postback handling: {e}") + + except Exception as e: + logger.error(f"Browser-based extraction failed: {e}") + + # Filter links to likely contain exam documents + filtered_links = [] + for link in links: + # Common file extensions for exam documents + if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + filtered_links.append(link) + continue + + # Common paths for exam documents + if any(pattern in link.lower() for pattern in [ + "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", + "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", + "/resource/", "/material/", "/notes/", "/subjectmaterial/" + ]): + filtered_links.append(link) + continue + + # Check for download links (these may not have obvious extensions) + if is_download_link(link): + filtered_links.append(link) + + logger.info(f"Found {len(filtered_links)} potential exam document links") + return filtered_links + + except Exception as e: + logger.error(f"Error getting exam links: {e}") + return [] + + async def discover_hidden_links(self, page): + """Discover hidden links that might be in JavaScript, iframes, or dynamic content""" + hidden_links = set() + + # Execute JavaScript to find links in script tags and data attributes + js_links = await page.evaluate(""" + () => { + const links = new Set(); + + // Extract URLs from script tags + const scripts = document.querySelectorAll('script'); + for (const script of scripts) { + const content = script.textContent || ''; + const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; + for (let match of urlMatches) { + links.add(match.replace(/["']/g, '')); + } + } + + // Look for download-related variables in scripts + for (const script of scripts) { + const content = script.textContent || ''; + // Look for common patterns for file URLs in JavaScript + if (content.includes('downloadURL') || content.includes('fileURL') || + content.includes('pdfURL') || content.includes('documentURL')) { + + // Extract potential URLs + const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; + for (let match of potentialUrls) { + const url = match.replace(/["']/g, ''); + // Try to resolve relative URLs + if (url.startsWith('/') || !url.includes('://')) { + if (url.startsWith('/')) { + links.add(window.location.origin + url); + } else { + // Handle relative paths more carefully + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + url); + } + } else if (url.startsWith('http')) { + links.add(url); + } + } + } + } + + // Check for links in data attributes + const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); + for (const el of elements) { + for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { + const val = el.getAttribute(attr); + if (val) { + // Try to resolve relative URLs + if (val.startsWith('/')) { + 
links.add(window.location.origin + val); + } else if (val.startsWith('http')) { + links.add(val); + } else if (!val.startsWith('javascript:') && !val.startsWith('#')) { + // Handle relative paths + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + val); + } + } + } + } + + // Look for URLs in inline event handlers + const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); + for (const el of clickableElements) { + for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { + const val = el.getAttribute(attr); + if (val) { + // Check for JavaScript URLs with window.location + if (val.includes('window.location') || val.includes('document.location')) { + const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); + if (urlMatch && urlMatch[1]) { + const url = urlMatch[1]; + if (url.startsWith('/')) { + links.add(window.location.origin + url); + } else if (url.startsWith('http')) { + links.add(url); + } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + url); + } + } + } + + // Check for direct URLs in attributes + const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; + for (let match of urlMatches) { + links.add(match.replace(/["']/g, '')); + } + + // Check for download.php and similar patterns + if (val.includes('download.php') || val.includes('getfile.php') || + val.includes('Action=downloadfile') || val.includes('viewfile.php')) { + + // Handle both onclick handlers and direct hrefs + let url = ''; + if (attr === 'href') { + url = val; + } else { + // Extract URL from JavaScript + const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); + if (jsUrlMatch) { + url = jsUrlMatch[1]; + } + } + + // Resolve URL if needed + if (url) { + if (url.startsWith('/')) { + links.add(window.location.origin + url); + } else if (url.startsWith('http')) { + links.add(url); + } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + url); + } + } + } + } + } + } + + // Find PHP/ASP file download links + const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); + for (const link of fileLinks) { + links.add(link.href); + } + + return Array.from(links); + } + """) + + for link in js_links: + hidden_links.add(link) + + # Extract links from iframes + iframes = await page.query_selector_all('iframe') + for iframe in iframes: + try: + frame = await iframe.content_frame() + if frame: + iframe_links = await frame.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')) + .map(a => a.href) + .filter(href => href.startsWith('http')); + } + """) + for link in iframe_links: + hidden_links.add(link) + except Exception as e: + logger.warning(f"Could not extract links from iframe: {e}") + + # Look for links in shadow DOM (used in modern web components) + shadow_links = await page.evaluate(""" + () => { + const links = new Set(); + + // Helper function to recursively process shadow roots + function processShadowRoot(root) { + if (!root) return; + + // Get links in this shadow root + const shadowLinks = root.querySelectorAll('a[href]'); + 
for (const link of shadowLinks) { + if (link.href && link.href.startsWith('http')) { + links.add(link.href); + } + } + + // Process nested shadow roots + const elements = root.querySelectorAll('*'); + for (const el of elements) { + if (el.shadowRoot) { + processShadowRoot(el.shadowRoot); + } + } + } + + // Find all shadow roots in the document + const elements = document.querySelectorAll('*'); + for (const el of elements) { + if (el.shadowRoot) { + processShadowRoot(el.shadowRoot); + } + } + + return Array.from(links); + } + """) + + for link in shadow_links: + hidden_links.add(link) + + # Look for download links in forms + form_links = await page.evaluate(""" + () => { + const links = new Set(); + + // Check for form actions that might be download endpoints + const forms = document.querySelectorAll('form'); + for (const form of forms) { + const action = form.action || ''; + if (action && ( + action.includes('download') || + action.includes('getfile') || + action.includes('viewfile') || + action.includes('Action=downloadfile') + )) { + // Collect input values that might be needed for the download + const inputs = {}; + const formInputs = form.querySelectorAll('input[name]'); + for (const input of formInputs) { + inputs[input.name] = input.value; + } + + // Store both the form action and any important inputs + links.add(action); + } + } + + return Array.from(links); + } + """) + + for link in form_links: + hidden_links.add(link) + + return hidden_links + + async def extract_downloadable_files(self, url, custom_ext_list): + found_files = [] + try: + # Normalize the URL to handle special cases + normalized_url = normalize_download_url(url) + + # Skip if we've already visited this URL + if normalized_url in self.visited_urls: + logger.info(f"Skipping already visited URL: {normalized_url}") + return [] + + # Mark this URL as visited + self.visited_urls.add(normalized_url) + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # First check if this is a direct download link (Action=downloadfile or fname parameter) + if is_download_link(normalized_url): + logger.info(f"Processing potential direct download link: {normalized_url}") + + # Try to extract the real download URL if needed + real_url = await self.extract_real_download_url(normalized_url) + + # Determine filename - for complex URLs this can be tricky + filename = os.path.basename(urlparse(real_url).path) + + # Handle URL-encoded filenames + if '%' in filename: + try: + filename = unquote(filename) + except Exception: + pass + + # For URLs with download parameters, try to extract filename from query + if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): + # Look for file parameter + params = parse_qs(urlparse(normalized_url).query) + + # Check common filename parameters + for param in ['file', 'filename', 'name', 'fname', 'f']: + if param in params and params[param]: + potential_filename = params[param][0] + if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: + filename = os.path.basename(potential_filename) + break + + # If still no valid filename, use domain-based fallback + if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): + domain = get_domain(real_url) + # Try to determine file type from content-type or extension hints in URL + ext = '.pdf' # Default + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: + if common_ext in normalized_url.lower(): + ext = 
common_ext + break + filename = f"file_from_{domain}{ext}" + + # Get file size + size_str = await self.get_file_size(real_url) + + # Add to found files + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': size_str, + 'metadata': {}, + 'download_url': normalized_url # Keep original URL for downloading + }) + + # For direct download links, we can return early + if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): + return found_files + + # Special handling for educational exam sites + if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in + ["exam", "test", "pastpaper", "eduexp"]): + logger.info("Using specialized handler for educational exam site") + + # Get direct links to exam files + exam_links = await self.get_edu_exam_links(url) + + for link in exam_links: + # Try to resolve any redirection + real_url = await self.extract_real_download_url(link) + filename = os.path.basename(urlparse(real_url).path) + + # If filename is URL encoded (common with Chinese/international sites) + if '%' in filename: + try: + filename = unquote(filename) + except Exception: + pass + + # If filename is empty or invalid, create a sensible one + if not filename or filename == '/': + domain = get_domain(real_url) + ext = '.pdf' # Default + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: + if common_ext in link.lower(): + ext = common_ext + break + filename = f"file_from_{domain}{ext}" + + # Get file size + size_str = await self.get_file_size(real_url) + + # Get metadata for PDFs + meta = {} + if real_url.lower().endswith('.pdf'): + try: + meta = await self.get_pdf_metadata(real_url) + except Exception: + pass + + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': size_str, + 'metadata': meta, + 'download_url': link # Store original link for downloading + }) + + # If we found exam files with the specialized method, return them + if found_files: + return found_files + + # Standard extraction method if specialized method didn't find files + response = await self.page.goto(url, timeout=30000, wait_until='networkidle') + if not response: + return [] + + # Check for captchas + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected, file extraction may be limited") + + # Scroll through the page naturally to trigger lazy loading + await self.page.evaluate(""" + (async () => { + const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const scrollStep = Math.floor(window.innerHeight / 2); + + for (let i = 0; i < height; i += scrollStep) { + window.scrollTo(0, i); + await delay(100); + } + + window.scrollTo(0, 0); + })() + """) + await self.page.wait_for_timeout(1000) + + final_url = self.page.url + if '.php' in final_url or 'download' in final_url: + real_url = await self.extract_real_download_url(final_url) + if real_url != final_url: + # Try to detect the filename from headers or URL + response = await self.page.request.head(real_url, timeout=15000) + filename = None + + # Try to get from Content-Disposition header + content_disposition = response.headers.get('Content-Disposition', '') + if 'filename=' in content_disposition: + filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) + if filename_match: + filename = filename_match.group(1) + + # If not found in headers, use URL basename + if not filename: + filename = os.path.basename(urlparse(real_url).path) + if not 
filename or filename == '/': + # Generate a name based on domain + domain = get_domain(real_url) + ext = '.pdf' # Default + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: + if common_ext in real_url.lower(): + ext = common_ext + break + filename = f"file_from_{domain}{ext}" + + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': await self.get_file_size(real_url), + 'metadata': {}, + 'download_url': final_url # Keep original URL for downloading + }) + return found_files + + await self.page.wait_for_load_state('networkidle', timeout=30000) + content = await self.page.content() + soup = BeautifulSoup(content, 'html.parser') + + default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', + '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', + '.pptx', '.odt', '.txt'] + all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) + + parsed_base = urlparse(final_url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + path_base = os.path.dirname(parsed_base.path) + + # Process all anchor tags + for a in soup.find_all('a', href=True): + href = a['href'].strip() + + if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + real_url = await self.extract_real_download_url(full_url) + if real_url and real_url != full_url: + found_files.append({ + 'url': real_url, + 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', + 'size': await self.get_file_size(real_url), + 'metadata': {}, + 'download_url': full_url # Original URL for download + }) + continue + + if any(href.lower().endswith(ext) for ext in all_exts): + file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + size_str = await self.get_file_size(file_url) + meta = {} + if file_url.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(file_url) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': size_str, + 'metadata': meta, + 'download_url': file_url # Same as URL for direct links + }) + + # Handle Google Drive links + elif ("drive.google.com" in href) or ("docs.google.com" in href): + file_id = None + for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: + match = re.search(pattern, href) + if match: + file_id = match.group(1) + break + if file_id: + # Get file info to determine type and view-only status + file_type, is_view_only = await self.get_google_drive_file_info(file_id) + + # Create a more informative filename based on info + filename = f"gdrive_{file_id}" + if file_type: + filename = f"{filename}.{file_type}" + + size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") + + found_files.append({ + 'url': href, # Use original URL + 'filename': filename, + 'size': size_str, + 'metadata': { + 'view_only': is_view_only, + 'file_type': file_type, + 'file_id': file_id + }, + 'download_url': href # Same as URL for Google Drive + }) + + # Also check for files in other elements (iframe, embed, object, etc.) 
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) + for elem in other_elements: + src = elem.get('src') or elem.get('data') + if src and any(src.lower().endswith(ext) for ext in all_exts): + file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) + size_str = await self.get_file_size(file_url) + meta = {} + if file_url.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(file_url) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': size_str, + 'metadata': meta, + 'download_url': file_url + }) + + # Check for file links in onclick attributes + onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') + for elem in onclick_elements: + onclick = await elem.get_attribute('onclick') + urls = re.findall(r'(https?://[^\'"]+)', onclick) + for url_match in urls: + if any(url_match.lower().endswith(ext) for ext in all_exts): + size_str = await self.get_file_size(url_match) + meta = {} + if url_match.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(url_match) + found_files.append({ + 'url': url_match, + 'filename': os.path.basename(url_match.split('?')[0]), + 'size': size_str, + 'metadata': meta, + 'download_url': url_match + }) + + # Also check for data-src and data-url attributes (common in lazy-loaded sites) + data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') + for elem in data_elements: + for attr in ['data-src', 'data-url', 'data-href', 'data-download']: + try: + value = await elem.get_attribute(attr) + if value and any(value.lower().endswith(ext) for ext in all_exts): + file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': await self.get_file_size(file_url), + 'metadata': {}, + 'download_url': file_url + }) + except: + pass + + # Check script tags for JSON data that might contain file URLs + script_elements = soup.find_all('script', type='application/json') + for script in script_elements: + try: + json_data = json.loads(script.string) + # Look for URL patterns in the JSON data + def extract_urls_from_json(obj, urls_found=None): + if urls_found is None: + urls_found = [] + if isinstance(obj, dict): + for k, v in obj.items(): + # Check if any key contains url-like terms + url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] + if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): + urls_found.append(v) + else: + extract_urls_from_json(v, urls_found) + elif isinstance(obj, list): + for item in obj: + extract_urls_from_json(item, urls_found) + return urls_found + + json_urls = extract_urls_from_json(json_data) + for json_url in json_urls: + if any(json_url.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': json_url, + 'filename': os.path.basename(json_url.split('?')[0]), + 'size': await self.get_file_size(json_url), + 'metadata': {}, + 'download_url': json_url + }) + except: + pass + + # Check for hidden download buttons or forms + hidden_elements = await self.page.evaluate(""" + () => { + const results = []; + + // Check for hidden forms with download actions + const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); + for (const form of forms) { + const action = 
form.getAttribute('action') || ''; + results.push({ + type: 'form', + action: action, + inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { + return {name: input.name, value: input.value}; + }) + }); + } + + // Check for hidden download links/buttons + const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { + const style = window.getComputedStyle(a); + return (style.display === 'none' || style.visibility === 'hidden') && + (a.href.includes('download') || a.href.includes('file')); + }); + + for (const link of hiddenLinks) { + results.push({ + type: 'link', + href: link.href, + text: link.innerText || link.textContent + }); + } + + return results; + } + """) + + # Process hidden elements + for elem in hidden_elements: + if elem['type'] == 'link' and 'href' in elem: + href = elem['href'] + if any(href.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': href, + 'filename': os.path.basename(href.split('?')[0]), + 'size': await self.get_file_size(href), + 'metadata': {}, + 'download_url': href + }) + + # Check for hidden links that might be in JavaScript, iframes, or dynamic content + hidden_links = await self.discover_hidden_links(self.page) + for link in hidden_links: + if any(link.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': link, + 'filename': os.path.basename(link.split('?')[0]), + 'size': await self.get_file_size(link), + 'metadata': {}, + 'download_url': link + }) + + # Deduplicate files by URL + seen_urls = set() + unique_files = [] + for f in found_files: + if f['url'] not in seen_urls: + seen_urls.add(f['url']) + unique_files.append(f) + + return unique_files + except Exception as e: + logger.error(f"Error extracting files from {url}: {e}") + traceback.print_exc() + return [] + + async def download_file(self, file_info, save_dir, referer): + file_url = file_info.get('download_url', file_info['url']) # Use download_url if available + fname = file_info['filename'] + path = os.path.join(save_dir, fname) + base, ext = os.path.splitext(fname) + counter = 1 + while os.path.exists(path): + path = os.path.join(save_dir, f"{base}_{counter}{ext}") + counter += 1 + os.makedirs(save_dir, exist_ok=True) + + # Check if we've already downloaded this file + if file_url in self.downloaded_files: + logger.info(f"File already downloaded: {file_url}") + return None + + try: + # Special handling for Google Drive files + if "drive.google.com" in file_url or "docs.google.com" in file_url: + # Check if it's marked as view-only in metadata + is_view_only = file_info.get('metadata', {}).get('view_only', False) + + # For view-only files, try our most robust approach first + if is_view_only: + logger.info(f"Attempting to download view-only file: {file_url}") + result_path = await self._force_download_viewonly(file_info, path) + if result_path: + self.downloaded_files.add(file_url) + return result_path + + # If that failed, try the regular download approach + logger.info("Primary method failed, trying fallback methods") + + # Try regular download methods + success = await self._download_from_google_drive(file_url, path) + if success: + self.downloaded_files.add(file_url) + return path + + # If all methods failed for Google Drive, try one last approach + logger.warning("All standard methods failed, attempting force download") + result_path = await self._force_download_viewonly(file_info, path) + if result_path: + self.downloaded_files.add(file_url) + return result_path if result_path else None + + # Special 
handling for complex download URLs + if 'Action=downloadfile' in file_url or 'fname=' in file_url: + logger.info(f"Using browser download approach for complex URL: {file_url}") + + # For these URLs, we'll need to navigate to the page and handle the download + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + # Set up download event listener + download_promise = page.wait_for_event("download") + + # Navigate to the URL + await page.goto(file_url, timeout=60000) + + # Wait for the download to start + try: + download = await download_promise + await download.save_as(path) + + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except Exception as e: + logger.error(f"Browser download failed: {e}") + + # If download didn't start automatically, try to find and click download buttons + download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]') + for button in download_buttons: + try: + await button.click() + try: + download = await download_promise + await download.save_as(path) + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except: + pass + except: + continue + + # If browser approach failed, try direct request as last resort + logger.info("Browser approach failed, trying direct request") + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Try with direct requests first (faster) + try: + headers = { + 'User-Agent': get_random_user_agent(), + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': referer, + 'DNT': '1' + } + + with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: + if response.status_code == 200: + # Check content type to verify it's not HTML/error page + content_type = response.headers.get('Content-Type', '') + if 'text/html' in content_type and not file_url.endswith('.html'): + logger.warning(f"Received HTML instead of expected file: {file_url}") + else: + with open(path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Verify file was downloaded correctly + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except Exception as e: + logger.warning(f"Direct download failed: {e}, trying browser approach") + + # Original code for non-Google Drive downloads using Playwright + async with self.context.new_page() as page: + headers = { + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': referer + } + + # Try to download with timeout protection + try: + response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) + if response.status == 200: + content = await response.body() + with open(path, 'wb') as f: + f.write(content) + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + else: + logger.error(f"Download failed with status {response.status}: {file_url}") + + # Try to extract error information + error_info = await response.text() + logger.debug(f"Error response: {error_info[:200]}...") + + # Check if this might be a captcha or login issue + if detect_captcha(error_info): + logger.warning("Captcha detected during download") + # For HF Spaces, we can't implement browser-based captcha solving here + # Just log the issue for now + except PlaywrightTimeoutError: + 
logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") + + # Try an alternative approach - using the browser's download manager + try: + logger.info("Trying browser download manager approach") + download_promise = page.wait_for_event("download") + await page.goto(file_url, timeout=60000) + + # Wait for download to start (with timeout) + download = await download_promise + await download.save_as(path) + + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except Exception as e: + logger.error(f"Browser download manager approach failed: {e}") + + return None + except Exception as e: + logger.error(f"Error downloading {file_url}: {e}") + return None + + # IMPROVED: Split force_download_viewonly into smaller methods + async def _force_download_viewonly(self, file_info, save_path): + """Main method to handle view-only files, now simplified""" + # Extract the file ID + file_id = self._extract_drive_file_id(file_info) + if not file_id: + logger.error("Could not extract file ID") + return None + + # Get file type information + file_type = file_info.get('metadata', {}).get('file_type', 'pdf') + base, ext = os.path.splitext(save_path) + if not ext: + save_path = f"{base}.{file_type}" + + logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") + + # Create a stealth browser for handling the download + browser = await self._create_stealth_browser() + + try: + # Set up the browser page + page = await browser.new_page() + + # Go to the file view page + logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) + await page.wait_for_load_state('networkidle') + + # Check for permission issues + content = await page.content() + if "the owner has not granted you permission to" in content: + logger.warning("Permission denied error detected") + return None + + # Wait for the page to stabilize + await page.wait_for_timeout(random.randint(3000, 7000)) + + # Create temp directory for working files + temp_dir = tempfile.mkdtemp() + + # Handle different file types + if file_type.lower() == 'pdf': + return await self._download_viewonly_pdf(page, file_id, save_path, temp_dir) + else: + return await self._download_viewonly_other(page, file_id, file_type, save_path, temp_dir) + + except Exception as e: + logger.error(f"Error during force download: {e}") + return None + finally: + await browser.close() + + def _extract_drive_file_id(self, file_info): + """Extract Google Drive file ID from file info""" + # Try to get file ID from metadata + file_id = file_info.get('metadata', {}).get('file_id') + if file_id: + return file_id + + # If not in metadata, try to extract from URL + url = file_info.get('url', '') + for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: + match = re.search(pattern, url) + if match: + return match.group(1) + + return None + + async def _create_stealth_browser(self): + """Create a stealth browser instance for handling sensitive downloads""" + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-blink-features=AutomationControlled' # Anti-detection + ] + + browser = await self.playwright.chromium.launch( + headless=True, + args=browser_args + ) + + # Use higher resolution 
for better quality + context = await browser.new_context( + viewport={'width': 1600, 'height': 1200}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + device_scale_factor=2.0, + accept_downloads=True # Critical for the download workflow + ) + + # Add anti-detection script + await context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Change plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + // Handle languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + + // Modify hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 4 + }); + } + """) + + return browser + + async def _download_viewonly_pdf(self, page, file_id, save_path, temp_dir): + """Handle downloading view-only PDF files""" + try: + # Estimate number of pages + estimated_pages = await page.evaluate(""" + () => { + // Method 1: Check page counter text + const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { + const text = el.textContent || ''; + return /\\d+\\s*\\/\\s*\\d+/.test(text); + }); + + if (pageCounters.length > 0) { + const text = pageCounters[0].textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) return parseInt(match[2]); + } + + // Method 2: Check actual page elements + const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pageElements.length > 0) return pageElements.length; + + // Method 3: Look for page thumbnails + const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); + if (thumbnails.length > 0) return thumbnails.length; + + // Fallback: conservative guess + return 50; + } + """) + + logger.info(f"Estimated {estimated_pages} pages in PDF") + + # Initial scroll to trigger lazy loading + logger.info("Initial scroll to bottom to trigger lazy loading...") + await page.keyboard.press("End") + await page.wait_for_timeout(3000) + + # Scroll page by page to ensure all pages are loaded + logger.info("Scrolling page by page...") + max_attempts = min(estimated_pages * 3, 300) + attempt = 0 + prev_blob_count = 0 + + while attempt < max_attempts: + blob_count = await page.evaluate(""" + Array.from(document.getElementsByTagName('img')) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .length + """) + + logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") + + if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): + logger.info("All pages appear to be loaded.") + break + + # Alternate between PageDown and End keys for more natural scrolling + if attempt % 3 == 0: + await page.keyboard.press("End") + else: + await page.keyboard.press("PageDown") + + # Randomized wait times + await page.wait_for_timeout(random.randint(1500, 3000)) + + # Move mouse randomly to appear more human-like + if attempt % 4 == 0: + await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) + + prev_blob_count = blob_count + attempt += 1 + + # Extra wait to ensure everything is loaded + await page.wait_for_timeout(5000) + + # Set up download event listener for the PDF + download_promise = page.wait_for_event("download") + + # Use jsPDF to generate PDF from loaded pages + 
logger.info("Generating PDF from loaded pages...") + result = await page.evaluate(r''' + (function() { + return new Promise((resolve, reject) => { + let script = document.createElement("script"); + script.onload = function () { + try { + let pdf = new jsPDF(); + let imgs = Array.from(document.getElementsByTagName("img")) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .sort((a, b) => { + const rectA = a.getBoundingClientRect(); + const rectB = b.getBoundingClientRect(); + return rectA.top - rectB.top; + }); + + console.log(`Found ${imgs.length} valid page images to add to PDF`); + + let added = 0; + for (let i = 0; i < imgs.length; i++) { + let img = imgs[i]; + let canvas = document.createElement("canvas"); + let ctx = canvas.getContext("2d"); + canvas.width = img.width; + canvas.height = img.height; + ctx.drawImage(img, 0, 0, img.width, img.height); + let imgData = canvas.toDataURL("image/jpeg", 1.0); + + if (added > 0) { + pdf.addPage(); + } + + pdf.addImage(imgData, 'JPEG', 0, 0); + added++; + } + + pdf.save("download.pdf"); + resolve({success: true, pageCount: added}); + } catch (error) { + reject({success: false, error: error.toString()}); + } + }; + + script.onerror = function() { + reject({success: false, error: "Failed to load jsPDF library"}); + }; + + script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; + document.body.appendChild(script); + }); + })(); + ''') + + if not result.get('success', False): + logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") + + # Try fallback approach - screenshot method + logger.info("Trying fallback screenshot method...") + return await self._pdf_screenshot_fallback(page, estimated_pages, save_path, temp_dir) + + logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") + + # Wait for the download and save it + download = await download_promise + await download.save_as(save_path) + + # Clean up temp directory + try: + os.rmdir(temp_dir) + except: + pass + + # Verify file exists and has content + if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: + logger.info(f"Successfully downloaded PDF to {save_path}") + return save_path + else: + logger.error(f"Generated file is too small or missing: {save_path}") + return None + + except Exception as e: + logger.error(f"Error in PDF download: {e}") + return None + + async def _pdf_screenshot_fallback(self, page, estimated_pages, save_path, temp_dir): + """Fallback method using screenshots for PDF creation""" + try: + # Navigate back to the first page + await page.evaluate(""" + () => { + // Find and click the "first page" button if available + const buttons = Array.from(document.querySelectorAll('button')); + const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); + if (firstPageBtn) firstPageBtn.click(); + } + """) + await page.wait_for_timeout(1000); + + # Create a PDF by taking screenshots of each page + screenshots = [] + current_page = 1 + max_pages = estimated_pages + + # Create a PDF using the reportlab package + while current_page <= max_pages: + screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") + + # Try to find the current page element + page_elem = await page.query_selector('.drive-viewer-paginated-page') + if page_elem: + await page_elem.screenshot(path=screenshot_path) + else: + # Fallback to full page screenshot + await page.screenshot(path=screenshot_path) + + screenshots.append(screenshot_path) + + # Try to navigate to next 
page + next_btn = await page.query_selector('button[aria-label="Next page"]') + if next_btn: + is_disabled = await next_btn.get_attribute('disabled') + if is_disabled: + logger.info(f"Reached end of document at page {current_page}") + break + + await next_btn.click() + await page.wait_for_timeout(1000) + current_page += 1 + else: + break + + # Create PDF from screenshots + if screenshots: + first_img = Image.open(screenshots[0]) + width, height = first_img.size + + c = canvas.Canvas(save_path, pagesize=(width, height)) + for screenshot in screenshots: + img = Image.open(screenshot) + c.drawImage(screenshot, 0, 0, width, height) + c.showPage() + c.save() + + # Clean up screenshots + for screenshot in screenshots: + os.remove(screenshot) + + return save_path + + return None + except Exception as e: + logger.error(f"Error in screenshot fallback: {e}") + return None + + async def _download_viewonly_other(self, page, file_id, file_type, save_path, temp_dir): + """Handle downloading non-PDF view-only files""" + try: + # Take a screenshot of the file + screenshot_path = os.path.join(temp_dir, "file.png") + await page.screenshot(path=screenshot_path) + + if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: + # For document types, try to export directly + success = await self._export_google_doc(file_id, file_type, save_path) + if success: + os.remove(screenshot_path) + return save_path + + # If export fails, fall back to screenshot + logger.warning(f"Export failed, falling back to screenshot for {file_type}") + + # For other types or if export failed, save the screenshot with appropriate extension + shutil.copy(screenshot_path, save_path) + os.remove(screenshot_path) + + return save_path if os.path.exists(save_path) else None + + except Exception as e: + logger.error(f"Error in non-PDF download: {e}") + return None + + async def _download_from_google_drive(self, url, save_path): + """Enhanced method to download from Google Drive with multiple fallback approaches""" + # Extract the file ID from different URL formats + file_id = self._extract_drive_file_id({"url": url}) + if not file_id: + logger.error(f"Could not extract file ID from URL: {url}") + return False + + # Determine file type first (important for handling different file types) + file_type, is_view_only = await self._get_google_drive_file_info(file_id) + logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") + + base, ext = os.path.splitext(save_path) + if not ext and file_type: + # Add the correct extension if missing + save_path = f"{base}.{file_type}" + + # For view-only files, use specialized approaches + if is_view_only: + # Approach 1: For PDFs, use the JS method + if file_type == 'pdf': + success = await self._download_viewonly_pdf_with_js(file_id, save_path) + if success: + return True + + # Approach 2: For Google Docs, Sheets, etc., use export API + if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: + success = await self._export_google_doc(file_id, file_type, save_path) + if success: + return True + + # Fallback to the main view-only method + result_path = await self._force_download_viewonly({ + 'url': url, + 'metadata': {'file_id': file_id, 'file_type': file_type, 'view_only': True} + }, save_path) + + return bool(result_path) + + # Try standard approaches for non-view-only files + try: + # Try direct download link first (fastest) + direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" + + # Add anti-bot headers + headers = { + 'User-Agent': 
get_random_user_agent(), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://drive.google.com/', + 'DNT': '1' + } + + # Try with streaming to handle larger files + with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: + if r.status_code == 200: + # Check if we got HTML instead of the file + content_type = r.headers.get('Content-Type', '') + if 'text/html' in content_type and not file_id.endswith('.html'): + logger.warning("Received HTML instead of file, trying with session cookies") + else: + # Looks like we got the actual file + with open(save_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Verify file exists and has content + if os.path.exists(save_path) and os.path.getsize(save_path) > 0: + logger.info("Direct download successful") + return True + + # Try browser-based approach as last resort + try: + async with self.context.new_page() as page: + # Visit the file view page first to get cookies + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) + await page.wait_for_timeout(3000) + + # Set up download event listener + download_promise = page.wait_for_event("download") + + # Try to trigger the download button click + download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') + if download_button: + await download_button.click() + + # Wait for download to start + try: + download = await download_promise + await download.save_as(save_path) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + except Exception as e: + logger.error(f"Error during browser download: {e}") + return False + else: + # Try the export download URL + await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) + + # Look for and click any download buttons or links + download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') + for elem in download_elements: + try: + await elem.click() + # Wait a bit to see if download starts + try: + download = await download_promise + await download.save_as(save_path) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + except: + pass + except: + continue + except Exception as e: + logger.error(f"Browser-based download attempt failed: {e}") + + logger.warning("All standard download methods failed") + return False + except Exception as e: + logger.error(f"Error in Google Drive download: {e}") + return False + + async def _download_viewonly_pdf_with_js(self, file_id, save_path): + """Download view-only PDF using blob images and JS""" + try: + # Create a dedicated browser instance + browser = await self._create_stealth_browser() + page = await browser.new_page() + + try: + # Navigate to the file with human-like behavior + logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) + await page.wait_for_load_state('networkidle') + + # Perform human-like interactions + await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) + await page.wait_for_timeout(random.randint(2000, 5000)) + + # Estimate the number of pages + estimated_pages = await page.evaluate(""" + () => { + // Look for page counter in the interface + const pageCounters = 
Array.from(document.querySelectorAll('*')).filter(el => { + const text = el.textContent || ''; + return /\\d+\\s*\\/\\s*\\d+/.test(text); + }); + + if (pageCounters.length > 0) { + const text = pageCounters[0].textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) return parseInt(match[2]); + } + + // If we can't find a counter, check actual pages + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pages.length > 0) return pages.length; + + // Default to a reasonable number if we can't determine + return 50; + } + """) + + logger.info(f"Estimated number of pages: {estimated_pages}") + + # Initial scroll to trigger loading + logger.info("Initial scroll to bottom to trigger lazy loading...") + await page.keyboard.press("End") + await page.wait_for_timeout(3000) + + # Scroll through document with variety to appear natural + await self._natural_scroll_through_document(page, estimated_pages) + + # Set up download event listener + download_promise = page.wait_for_event("download") + + # Use jsPDF to generate PDF from loaded pages + logger.info("Generating PDF from loaded pages...") + result = await page.evaluate(r''' + (function() { + return new Promise((resolve, reject) => { + let script = document.createElement("script"); + script.onload = function () { + try { + let pdf = new jsPDF(); + let imgs = Array.from(document.getElementsByTagName("img")) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .sort((a, b) => { + const rectA = a.getBoundingClientRect(); + const rectB = b.getBoundingClientRect(); + return rectA.top - rectB.top; + }); + + console.log(`Found ${imgs.length} valid page images to add to PDF`); + + let added = 0; + for (let i = 0; i < imgs.length; i++) { + let img = imgs[i]; + let canvas = document.createElement("canvas"); + let ctx = canvas.getContext("2d"); + canvas.width = img.width; + canvas.height = img.height; + ctx.drawImage(img, 0, 0, img.width, img.height); + let imgData = canvas.toDataURL("image/jpeg", 1.0); + + if (added > 0) { + pdf.addPage(); + } + + pdf.addImage(imgData, 'JPEG', 0, 0); + added++; + } + + pdf.save("download.pdf"); + resolve({success: true, pageCount: added}); + } catch (error) { + reject({success: false, error: error.toString()}); + } + }; + + script.onerror = function() { + reject({success: false, error: "Failed to load jsPDF library"}); + }; + + script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; + document.body.appendChild(script); + }); + })(); + ''') + + if not result.get('success'): + logger.error(f"Error in PDF generation: {result.get('error')}") + return False + + logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") + + # Wait for the download to complete and save the file + download = await download_promise + + # Save the downloaded file to the specified path + await download.save_as(save_path) + logger.info(f"Successfully saved PDF to {save_path}") + + return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 + + finally: + await browser.close() + + except Exception as e: + logger.error(f"Error in viewonly PDF download process: {e}") + return False + + async def _natural_scroll_through_document(self, page, estimated_pages): + """Scroll through document in a natural way to load all pages""" + logger.info("Scrolling through document to load all pages...") + max_attempts = min(estimated_pages * 3, 300) + attempt = 0 + prev_blob_count = 0 + consecutive_same_count = 0 + + while attempt < 
max_attempts: + # Count blob images (which are the PDF pages) + blob_count = await page.evaluate(""" + Array.from(document.getElementsByTagName('img')) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .length + """) + + logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") + + # Check if we've loaded all pages or if we're stuck + if blob_count >= estimated_pages: + logger.info(f"All {estimated_pages} pages appear to be loaded.") + break + + if blob_count == prev_blob_count: + consecutive_same_count += 1 + if consecutive_same_count >= 5 and blob_count > 0: + logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") + break + else: + consecutive_same_count = 0 + + # Mix up the scrolling approach for more human-like behavior + scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) + + if scroll_action == "PageDown": + await page.keyboard.press("PageDown") + elif scroll_action == "End": + await page.keyboard.press("End") + elif scroll_action == "ArrowDown": + # Press arrow down multiple times + for _ in range(random.randint(5, 15)): + await page.keyboard.press("ArrowDown") + await page.wait_for_timeout(random.randint(50, 150)) + else: # mouse + # Scroll using mouse wheel + current_y = random.randint(300, 700) + await page.mouse.move(x=random.randint(300, 800), y=current_y) + await page.mouse.wheel(0, random.randint(300, 800)) + + # Random wait between scrolls + await page.wait_for_timeout(random.randint(1000, 3000)) + + prev_blob_count = blob_count + attempt += 1 + + # Extra wait to ensure everything is fully loaded + await page.wait_for_timeout(5000) + + async def _export_google_doc(self, file_id, file_type, save_path): + """Export Google Docs/Sheets/Slides to downloadable formats""" + try: + # Map file types to export formats + export_urls = { + 'doc': f"https://docs.google.com/document/d/{file_id}/export?format=doc", + 'docx': f"https://docs.google.com/document/d/{file_id}/export?format=docx", + 'sheet': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", + 'xlsx': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", + 'ppt': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", + 'pptx': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", + 'pdf': f"https://docs.google.com/document/d/{file_id}/export?format=pdf" + } + + export_url = export_urls.get(file_type, f"https://docs.google.com/document/d/{file_id}/export?format=pdf") + + async with self.context.new_page() as page: + # Get cookies from the main view page first + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') + + # Now try the export + response = await page.goto(export_url, wait_until='networkidle') + + if response.status == 200: + content = await response.body() + with open(save_path, 'wb') as f: + f.write(content) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + else: + logger.warning(f"Export failed with status {response.status}") + return False + + except Exception as e: + logger.error(f"Error exporting Google Doc: {e}") + return False + + async def _get_google_drive_file_info(self, file_id): + """Get file type and view-only status from Google Drive""" + file_type = None + is_view_only = False + + try: + async with self.context.new_page() as page: + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) + + # Check if view-only + view_only_text = 
await page.query_selector('text="the owner has not granted you permission to download this file"') + is_view_only = view_only_text is not None + + # Check for Google Docs viewer + gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') + gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') + gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') + + if gdocs_viewer: + file_type = 'docx' + elif gsheets_viewer: + file_type = 'xlsx' + elif gslides_viewer: + file_type = 'pptx' + else: + # Check for PDF viewer + pdf_viewer = await page.query_selector('embed[type="application/pdf"]') + if pdf_viewer: + file_type = 'pdf' + else: + # Check for image viewer + img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') + if img_viewer: + # Get image type from src + img_src = await img_viewer.get_attribute('src') + if 'jpg' in img_src or 'jpeg' in img_src: + file_type = 'jpg' + elif 'png' in img_src: + file_type = 'png' + else: + file_type = 'jpg' # Default to jpg + else: + # Generic file type fallback + file_type = 'pdf' # Default to PDF + + # If still no type, check filename + if not file_type: + title_element = await page.query_selector('div[role="heading"]') + if title_element: + title = await title_element.text_content() + if title: + ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) + if ext_match: + file_type = ext_match.group(1).lower() + + except Exception as e: + logger.error(f"Error getting Google Drive file info: {e}") + file_type = 'pdf' # Default to PDF if we can't determine + + return file_type, is_view_only + + # IMPROVED: Enhanced sublink extraction method + async def get_sublinks(self, url, limit=10000): + """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" + links = set() + try: + logger.info(f"Fetching sublinks from: {url}") + + # Check if this is a direct download link + if is_download_link(url): + logger.info(f"URL appears to be a direct download link: {url}") + links.add(url) + return list(links)[:limit] + + # Skip if we've already visited this URL + normalized_url = normalize_download_url(url) + if normalized_url in self.visited_urls: + logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") + return list(links)[:limit] + + # Add to visited URLs + self.visited_urls.add(normalized_url) + + # Special handling for educational sites like phsms.cloud.ncnu.edu.tw + if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in + ["exam", "test", "pastpaper", "eduexp"]): + logger.info("Using specialized exam site sublink extraction") + edu_links = await self.get_edu_exam_links(url) + for link in edu_links: + links.add(link) + + # If we found a good number of links with the specialized method, return them + if len(links) > 5: + logger.info(f"Found {len(links)} sublinks with specialized method") + return list(links)[:limit] + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Standard sublink extraction for all sites + try: + await self.page.goto(url, timeout=30000, wait_until='networkidle') + except Exception as e: + logger.warning(f"Error navigating to URL for sublink extraction: {e}") + # Continue with what we have, we'll try to extract links anyway + + # Get base URL for resolving relative links + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + path_base = os.path.dirname(parsed_base.path) + + # Perform 
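+ # NOTE: the lazy-load scroll script below reads document.body.scrollHeight once,
+ # so content appended while scrolling may need the later interaction passes
+ # (dropdowns, postbacks, pagination) to surface. The next step is to perform an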
initial scrolling to load lazy content + await self.page.evaluate(""" + async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const step = Math.floor(window.innerHeight / 2); + + for (let i = 0; i < height; i += step) { + window.scrollTo(0, i); + await delay(150); + } + + window.scrollTo(0, 0); + } + """) + await self.page.wait_for_timeout(1000) + + # Check if page has ASP.NET elements which might need special handling + is_aspnet = await self.page.evaluate(''' + () => { + return document.querySelector('form#aspnetForm') !== null || + document.querySelector('input[name="__VIEWSTATE"]') !== null; + } + ''') + + if is_aspnet: + logger.info("Detected ASP.NET page, using enhanced extraction method") + + # Try to interact with ASP.NET controls that might reveal more links + # Look for dropdowns, buttons, and grid elements + dropdowns = await self.page.query_selector_all('select') + buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') + + # Try interacting with dropdowns first + for dropdown in dropdowns: + try: + # Get all options + options = await self.page.evaluate(''' + (dropdown) => { + return Array.from(dropdown.options).map(o => o.value); + } + ''', dropdown) + + # Try selecting each option + for option in options: + if option: + await dropdown.select_option(value=option) + await self.page.wait_for_timeout(1000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error interacting with dropdown: {e}") + + # Try clicking buttons (but avoid dangerous ones like "delete") + safe_buttons = [] + for button in buttons: + button_text = await button.text_content() or "" + button_value = await button.get_attribute("value") or "" + button_id = await button.get_attribute("id") or "" + combined_text = (button_text + button_value + button_id).lower() + + # Skip potentially destructive buttons + if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): + continue + + # Prioritize buttons that might show more content + if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): + safe_buttons.append(button) + + # Click the safe buttons + for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks + try: + await button.click() + await self.page.wait_for_timeout(1000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error clicking button: {e}") + + # Extract links from the initial page state + await self.extract_all_link_types(links, base_url, path_base) + + # Look specifically for links inside grid/table views which are common in ASP.NET applications + grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') + for cell in grid_cells: + try: + href = await cell.get_attribute('href') + if href: + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links.add(full_url) + except Exception as e: + logger.warning(f"Error extracting grid link: {e}") + + # Extract links from onclick attributes and javascript:__doPostBack calls + postback_links = await 
self.page.evaluate(''' + () => { + const results = []; + // Find elements with onclick containing __doPostBack + const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); + for (const el of elements) { + // Extract the postback target + const onclick = el.getAttribute('onclick') || ''; + const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); + if (match && match[1]) { + // Get the visible text to use as description + const text = el.innerText || el.textContent || 'Link'; + results.push({ + id: match[1], + text: text.trim() + }); + } + } + return results; + } + ''') + + # Try interacting with some of the postback links + for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions + try: + logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") + await self.page.evaluate(f''' + () => {{ + if (typeof __doPostBack === 'function') {{ + __doPostBack('{postback["id"]}', ''); + }} + }} + ''') + await self.page.wait_for_timeout(1500) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error with postback: {e}") + + # Look for pagination controls and try to navigate through them + pagination_elements = await self.page.query_selector_all( + 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' + ) + + # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops) + for i in range(min(5, len(pagination_elements))): + try: + # Focus on elements that look like "next page" buttons + el = pagination_elements[i] + el_text = await el.text_content() or "" + + # Only click if this looks like a pagination control + if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip(): + logger.info(f"Clicking pagination control: {el_text}") + await el.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Get new links from this page + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error clicking pagination: {e}") + + # Check for hidden links that might be revealed by JavaScript + hidden_links = await self.page.evaluate(""" + () => { + // Try to execute common JavaScript patterns that reveal hidden content + try { + // Common patterns used in websites to initially hide content + const hiddenContainers = document.querySelectorAll( + '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' + ); + + // Attempt to make them visible + hiddenContainers.forEach(el => { + el.style.display = 'block'; + el.style.visibility = 'visible'; + el.classList.remove('hidden', 'hide'); + }); + + // Return any newly visible links + return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); + } catch (e) { + return []; + } + } + """) + + # Add any newly discovered links + for href in hidden_links: + if href and not href.startswith('javascript:'): + links.add(href) + + # Find all download links + download_links = await self.page.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')) + .filter(a => { + const href = a.href.toLowerCase(); + return href.includes('download') || + href.includes('file') || + href.includes('get') || + href.includes('view.php') || + href.includes('action=') || + href.includes('fname='); + }) + .map(a => a.href); + } + """) + + for 
download_link in download_links: + links.add(download_link) + + # Also check for hidden links in JavaScript, iframes, or dynamic content + js_links = await self.discover_hidden_links(self.page) + for link in js_links: + links.add(link) + + logger.info(f"Found {len(links)} sublinks") + + # Prioritize download links + prioritized_links = [] + normal_links = [] + + for link in links: + if is_download_link(link): + prioritized_links.append(link) + else: + normal_links.append(link) + + # Return prioritized links first, then normal links, up to the limit + result = prioritized_links + normal_links + return result[:limit] + + except Exception as e: + logger.error(f"Error getting sublinks from {url}: {e}") + return list(links)[:limit] # Return what we have so far + + async def extract_all_link_types(self, links_set, base_url, path_base): + """Extract all types of links from the current page""" + # Get all tag links + a_links = await self.page.query_selector_all('a[href]') + for a in a_links: + try: + href = await a.get_attribute('href') + if href and not href.startswith('javascript:') and not href.startswith('#'): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Get iframe sources + iframes = await self.page.query_selector_all('iframe[src]') + for iframe in iframes: + try: + src = await iframe.get_attribute('src') + if src and not src.startswith('javascript:') and not src.startswith('about:'): + full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Get links from onclick attributes that reference URLs + onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') + for el in onclick_elements: + try: + onclick = await el.get_attribute('onclick') + urls = re.findall(r'(https?://[^\'"]+)', onclick) + for url in urls: + links_set.add(url) + except Exception: + pass + + # Look for URLs in data-* attributes + data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') + for el in data_elements: + for attr in ['data-url', 'data-href', 'data-src']: + try: + value = await el.get_attribute(attr) + if value and not value.startswith('javascript:'): + full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Look for special anchor links that might not have href attributes + special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') + for anchor in special_anchors: + try: + href = await anchor.get_attribute('href') + if href and not href.startswith('javascript:') and not href.startswith('#'): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Extract links from JSON data embedded in the page + script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') + for script in script_elements: + try: + script_content = await script.text_content() + if script_content: + # Look for URLs in the JSON content + urls = re.findall(r'(https?://[^\'"]+)', script_content) + for url in urls: + links_set.add(url) + except Exception: + pass + + def resolve_relative_url(self, relative_url, base_url, path_base): + 
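# NOTE: a hand-rolled resolver that covers the common cases; urllib.parse.urljoin
+ # (already imported at the top of this module) handles nested "../" and other
+ # edge cases and could serve as a drop-in, e.g.
+ # urljoin(f"{base_url}{path_base}/", relative_url)
+ # urljoin("https://example.com/a/", "../c.pdf") -> "https://example.com/c.pdf" +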
"""Properly resolve relative URLs considering multiple formats""" + if relative_url.startswith('/'): + # Absolute path relative to domain + return f"{base_url}{relative_url}" + elif relative_url.startswith('./'): + # Explicit relative path + return f"{base_url}{path_base}/{relative_url[2:]}" + elif relative_url.startswith('../'): + # Parent directory + parent_path = '/'.join(path_base.split('/')[:-1]) + return f"{base_url}{parent_path}/{relative_url[3:]}" + else: + # Regular relative path + return f"{base_url}{path_base}/{relative_url}" + + async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): + """Perform a deep search for files at the URL and its sublinks""" + import streamlit as st + + if not custom_ext_list: + custom_ext_list = [] + progress_text = st.empty() + progress_bar = st.progress(0) + file_count_text = st.empty() + + try: + # Reset the visited URLs for a fresh deep search + self.visited_urls = set() + + progress_text.text("🔍 Analyzing main page...") + # Special handling for ASP.NET pages + is_aspnet = False + try: + await self.page.goto(url, timeout=30000, wait_until='networkidle') + is_aspnet = await self.page.evaluate(''' + () => { + return document.querySelector('form#aspnetForm') !== null || + document.querySelector('input[name="__VIEWSTATE"]') !== null; + } + ''') + except Exception: + pass + + # Check if this URL is a direct download + if is_download_link(url): + progress_text.text("📥 URL appears to be a direct download. Processing...") + + # Try to extract file directly + normalized_url = normalize_download_url(url) + file_info = { + 'url': normalized_url, + 'download_url': normalized_url, + 'filename': os.path.basename(urlparse(normalized_url).path) or 'download', + 'size': 'Unknown Size', + 'metadata': {} + } + + # Add to visited URLs + self.visited_urls.add(normalized_url) + progress_bar.progress(1.0) + return [file_info] + + # Extract files from main page + progress_text.text("📄 Extracting files from main page...") + main_files = await self.extract_downloadable_files(url, custom_ext_list) + initial_count = len(main_files) + file_count_text.text(f"Found {initial_count} files on main page") + + # Get sublinks with enhanced method + progress_text.text("🔗 Getting sublinks...") + sublinks = await self.get_sublinks(url, sublink_limit) + total_links = len(sublinks) + progress_text.text(f"Found {total_links} sublinks to process") + + # Always include files from the main page, regardless of sublinks + all_files = main_files + + if not sublinks: + progress_bar.progress(1.0) + return all_files + + # Process each sublink + for i, sublink in enumerate(sublinks, 1): + progress = i / total_links + progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") + progress_bar.progress(progress) + + try: + # Check if this is a direct download link + if is_download_link(sublink): + # For download links, just add the link directly + normalized_url = normalize_download_url(sublink) + + # Skip if already visited + if normalized_url in self.visited_urls: + continue + + # Mark as visited + self.visited_urls.add(normalized_url) + + # Get file size if possible + size_str = await self.get_file_size(normalized_url) + + # Get filename, with fallback to domain-based name + filename = os.path.basename(urlparse(normalized_url).path) + if not filename or filename == '/' or '?' 
in filename: + domain = get_domain(normalized_url) + ext = '.pdf' # Default extension + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']: + if common_ext in normalized_url.lower(): + ext = common_ext + break + filename = f"file_from_{domain}{ext}" + + # Add file to results + all_files.append({ + 'url': normalized_url, + 'download_url': normalized_url, + 'filename': filename, + 'size': size_str, + 'metadata': {} + }) + file_count_text.text(f"Found {len(all_files)} total files") + continue + + # For regular links, use a longer timeout for ASP.NET pages which can be slower + sub_timeout = timeout * 2 if is_aspnet else timeout + + # Skip already visited URLs + if sublink in self.visited_urls: + continue + + # Extract files from sublink + sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) + all_files.extend(sub_files) + file_count_text.text(f"Found {len(all_files)} total files") + except Exception as e: + logger.warning(f"Error processing sublink {sublink}: {e}") + + # Deduplicate files + seen_urls = set() + unique_files = [] + for f in all_files: + if f['url'] not in seen_urls: + seen_urls.add(f['url']) + unique_files.append(f) + + final_count = len(unique_files) + progress_text.text(f"✅ Deep search complete!") + file_count_text.text(f"Found {final_count} unique files") + progress_bar.progress(1.0) + return unique_files + + except Exception as e: + logger.error(f"Deep search error: {e}") + progress_text.text(f"⚠️ Error during deep search: {str(e)}") + return [] + + finally: + await asyncio.sleep(2) + if not st.session_state.get('keep_progress', False): + progress_text.empty() + progress_bar.empty() \ No newline at end of file
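+ 
+ # Minimal usage sketch (an illustrative addition, not part of the original
+ # workflow): it assumes Playwright's Chromium has been installed via
+ # `playwright install chromium` and uses the constructor defaults defined
+ # earlier in this module. "https://example.com" is a placeholder URL.
+ if __name__ == "__main__":
+     async def _demo():
+         async with DownloadManager() as dm:
+             links = await dm.get_sublinks("https://example.com", limit=25)
+             for link in links:
+                 print(link)
+ 
+     asyncio.run(_demo())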