diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,51 +1,45 @@ import streamlit as st +st.set_page_config(page_title="Advanced File Downloader", layout="wide") + +# Core imports import os -import asyncio import subprocess -import tempfile +from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError +import asyncio import logging -import time -import json -import base64 +from urllib.parse import urlparse, urljoin, unquote import re +from pathlib import Path +from io import BytesIO import random +from bs4 import BeautifulSoup +from PyPDF2 import PdfReader import zipfile +import tempfile +import mimetypes +import requests import datetime import traceback +import base64 import shutil -import mimetypes -from pathlib import Path -from urllib.parse import urlparse, urljoin, unquote -from io import BytesIO +import json +import time from PIL import Image from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas - -# Advanced imports -from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError -from bs4 import BeautifulSoup -from PyPDF2 import PdfReader import google_auth_oauthlib.flow import googleapiclient.discovery import google.auth.transport.requests import googleapiclient.http -import requests -import celery -from celery import Celery -import splash -import pyppeteer -import mitmproxy -from mitmproxy import http -# Configure page and logging -st.set_page_config(page_title="Advanced File Downloader", layout="wide") -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +# -------------------- Logging Setup -------------------- +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) logger = logging.getLogger(__name__) -# Initialize Celery for distributed task processing -celery_app = Celery('file_downloader', broker='redis://localhost:6379/0') - -# Configure Google OAuth +# -------------------- Google OAuth Config -------------------- GOOGLE_OAUTH_CONFIG = { "web": { "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com", @@ -58,7 +52,8 @@ GOOGLE_OAUTH_CONFIG = { } } -# -------------------- User Agent Settings -------------------- +# -------------------- Stealth and UA Settings -------------------- +# Extended user agent list for better variety USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15', @@ -67,18 +62,30 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54', 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0' ] -# -------------------- Proxy Management -------------------- -PROXY_POOL = [] -CURRENT_PROXY_INDEX = 0 +# Stealth browser settings +STEALTH_SETTINGS = { + # Hardware features to modify/disable + "hardware_concurrency": 
4, + "device_memory": 8, + # Browser features to enable/disable + "webgl_vendor": "Google Inc. (Intel)", + "webgl_renderer": "Intel Iris OpenGL Engine", + "languages": ["en-US", "en"], + "disable_webrtc": True, + # Additional timing randomization + "navigator_platform": "Win32", + "touch_support": False +} -# -------------------- Network Interception Configuration -------------------- -NETWORK_INTERCEPTOR_CONFIG = { - "enabled": False, - "intercept_types": ["xhr", "fetch", "document", "media"], - "save_intercepted": True, - "intercept_folder": "./intercepted_data" +# Proxy rotation configuration (if using multiple proxies) +PROXY_ROTATION_CONFIG = { + "enabled": False, # Set to True to enable rotation + "rotation_interval": 10, # Rotate every 10 requests + "proxies": [] # Will be populated from the UI if needed } # -------------------- Utility Functions -------------------- @@ -108,6 +115,16 @@ def get_file_extension(url, default='.pdf'): return default return ext +def humanize_file_size(size_bytes): + """Format file size in human-readable format""" + if size_bytes < 1024: + return f"{size_bytes} bytes" + for unit in ['KB', 'MB', 'GB', 'TB']: + size_bytes /= 1024.0 + if size_bytes < 1024.0: + return f"{size_bytes:.1f} {unit}" + return f"{size_bytes:.1f} PB" + def get_domain(url): """Extract domain from URL""" parsed = urlparse(url) @@ -117,6 +134,15 @@ def is_valid_file_url(url, extensions): """Check if URL is a valid file URL based on extension""" return any(url.lower().endswith(ext) for ext in extensions) +def detect_captcha(html_content): + """Detect common captcha patterns in HTML content""" + captcha_patterns = [ + 'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile', + 'challenge', 'solve the following', 'verify you are human' + ] + html_lower = html_content.lower() + return any(pattern in html_lower for pattern in captcha_patterns) + # -------------------- Google Drive Functions -------------------- def get_google_auth_url(): client_config = GOOGLE_OAUTH_CONFIG["web"] @@ -167,507 +193,394 @@ def create_drive_folder(drive_service, name): folder = drive_service.files().create(body=folder_metadata, fields='id').execute() return folder.get('id') -# -------------------- Setup Functions -------------------- -def setup_dependencies(): - """Install required system dependencies""" +# -------------------- Playwright Setup -------------------- +def install_playwright_dependencies(): try: - # Check if browsers are already installed instead of installing them - if os.path.exists(os.path.join(os.environ.get('PLAYWRIGHT_BROWSERS_PATH', ''), 'chromium-1045')): - logger.info("Playwright browsers already installed, skipping installation") - installed = True - else: - # Only try to install browsers if they don't exist already - try: - subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], - check=True, env=os.environ) - installed = True - except subprocess.CalledProcessError as e: - logger.warning(f"Could not install browsers: {e}") - installed = False + # Set environment variable for Playwright browsers path + os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright") - # Skip system dependency installation in container environment - if os.path.exists('/.dockerenv'): - return installed - - # System packages installation + # Install system dependencies subprocess.run(['apt-get', 'update', '-y'], check=True) packages = [ 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1', - 
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0', - 'redis-server', 'python3-dev', 'build-essential' + 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0' ] subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True) - # Install Python packages - only if not in Docker - if not os.path.exists('/.dockerenv'): - subprocess.run(['pip', 'install', 'playwright', 'pyppeteer', 'splash', 'celery[redis]', 'mitmproxy'], check=True) + # Install Playwright and dependencies + subprocess.run(['pip', 'install', 'playwright'], check=True) + subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True) - st.success("Dependencies installed successfully!") - return True + st.success("Playwright dependencies installed successfully!") except Exception as e: - st.error(f"Error installing dependencies: {e}") + st.error(f"Error installing Playwright dependencies: {e}") st.info("You may need to manually install dependencies. Check console for details.") - logger.error(f"Setup error: {e}") + logger.error(f"Playwright setup error: {e}") traceback.print_exc() - return False - -def check_services(): - """Check if required services are running""" - try: - # Check Redis for Celery - redis_running = False - try: - redis_running = subprocess.run(['redis-cli', 'ping'], - capture_output=True, - text=True).stdout.strip() == 'PONG' - except Exception: - pass - - if not redis_running: - # Try to start Redis as a non-root user - try: - if os.path.exists('/etc/redis/redis.conf'): - # Custom Redis config for non-root - subprocess.run(['redis-server', '/etc/redis/redis.conf'], - check=False, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - else: - # Fallback to basic Redis without custom config - subprocess.run(['redis-server', '--daemonize', 'yes'], - check=False, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - except Exception as e: - logger.warning(f"Could not start Redis: {e}") - st.warning("Redis service could not be started. 
Celery tasks will not work properly.") - - # Create directories for intercepted data - os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True) - - return True - except Exception as e: - logger.error(f"Service check error: {e}") - return False -# -------------------- Network Interception Classes -------------------- -class NetworkInterceptor: - """Class to intercept network traffic using mitmproxy""" - - def __init__(self, intercept_types=None, save_path=None): - self.intercept_types = intercept_types or ["xhr", "fetch", "document"] - self.save_path = save_path or "./intercepted_data" - os.makedirs(self.save_path, exist_ok=True) - self.captured_data = [] - - def intercept_request(self, flow): - """Process intercepted requests""" - try: - url = flow.request.url - method = flow.request.method - content_type = flow.request.headers.get("Content-Type", "") - - # Log the request - self.captured_data.append({ - "type": "request", - "url": url, - "method": method, - "headers": dict(flow.request.headers), - "timestamp": time.time() - }) - - logger.info(f"Intercepted {method} request to {url}") - except Exception as e: - logger.error(f"Error intercepting request: {e}") - - def intercept_response(self, flow): - """Process intercepted responses""" - try: - url = flow.request.url - status_code = flow.response.status_code - content_type = flow.response.headers.get("Content-Type", "") - - # Only process responses of interest based on content type - if any(t in content_type.lower() for t in ["application/pdf", "application/msword", - "application/vnd.openxmlformats", - "application/zip"]): - # Save the file - filename = os.path.basename(urlparse(url).path) - if not filename or filename == '/': - filename = f"file_{int(time.time())}" - - # Try to add extension based on content type - if "pdf" in content_type: - filename += ".pdf" - elif "msword" in content_type: - filename += ".doc" - elif "openxmlformats" in content_type and "wordprocessingml" in content_type: - filename += ".docx" - elif "zip" in content_type: - filename += ".zip" - - file_path = os.path.join(self.save_path, filename) - with open(file_path, "wb") as f: - f.write(flow.response.content) - - logger.info(f"Saved intercepted file: {file_path}") - - # Record metadata about the captured file - self.captured_data.append({ - "type": "file", - "url": url, - "content_type": content_type, - "size": len(flow.response.content), - "path": file_path, - "timestamp": time.time() - }) - except Exception as e: - logger.error(f"Error intercepting response: {e}") - - def get_captured_files(self): - """Return list of captured files""" - return [item for item in self.captured_data if item["type"] == "file"] - -# -------------------- Browser Automation Classes -------------------- -class MultiEngineBrowser: - """Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)""" - - def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True): - self.engine = engine +# -------------------- Download Manager Class -------------------- +class DownloadManager: + def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): self.use_proxy = use_proxy self.proxy = proxy - self.stealth = stealth + self.query = query + self.num_results = num_results + self.playwright = None self.browser = None self.context = None self.page = None - - async def setup(self): - """Initialize browser based on selected engine""" - if self.engine == "playwright": - return await 
self.setup_playwright() - elif self.engine == "pyppeteer": - return await self.setup_pyppeteer() - elif self.engine == "splash": - return await self.setup_splash() - else: - raise ValueError(f"Unsupported browser engine: {self.engine}") - - async def setup_playwright(self): - """Setup Playwright browser""" - from playwright.async_api import async_playwright - + self.use_stealth = use_stealth + self.proxy_rotation = proxy_rotation + self.request_count = 0 + self.captcha_detected = False + self.download_timeout = 300 # 5 minutes timeout for downloads + + async def __aenter__(self): self.playwright = await async_playwright().start() + + # Prepare browser args with stealth settings browser_args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', + '--disable-gpu', + '--no-zygote', + '--single-process', '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process', + '--disable-features=IsolateOrigins', + '--disable-site-isolation-trials' ] - if self.stealth: + # Add stealth-specific args + if self.use_stealth: browser_args.extend([ '--disable-blink-features=AutomationControlled', - '--disable-features=IsolateOrigins' + '--disable-features=IsolateOrigins,site-per-process', + '--disable-webgl', + '--disable-webrtc' ]) - launch_options = { + # Setup browser options + opts = { "headless": True, "args": browser_args } + # Configure proxy if specified if self.use_proxy and self.proxy: - launch_options["proxy"] = {"server": self.proxy} + opts["proxy"] = {"server": self.proxy} - self.browser = await self.playwright.chromium.launch(**launch_options) + # Launch browser with options + self.browser = await self.playwright.chromium.launch(**opts) - context_options = { - "viewport": {"width": 1920, "height": 1080}, + # Setup browser context with enhanced settings + context_opts = { "user_agent": get_random_user_agent(), - "bypass_csp": True, + "viewport": {"width": 1920, "height": 1080}, + "device_scale_factor": 1, + "has_touch": False, + "is_mobile": False, "ignore_https_errors": True, "accept_downloads": True } - self.context = await self.browser.new_context(**context_options) - - # Apply stealth features - if self.stealth: + # Apply stealth-specific settings to the context + if self.use_stealth: + # Apply JS-injection for enhanced stealth + context_opts["bypass_csp"] = True + self.context = await self.browser.new_context(**context_opts) + + # Execute stealth JS to avoid detection await self.context.add_init_script(""" - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, }); - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - window.chrome = { runtime: {} }; - """) - - self.page = await self.context.new_page() - return self.page - - async def setup_pyppeteer(self): - """Setup Pyppeteer browser""" - from pyppeteer import launch - - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - ] - - if self.stealth: - browser_args.extend([ - '--disable-blink-features=AutomationControlled', - '--disable-features=IsolateOrigins' - ]) - - launch_options = { - "headless": True, - "args": browser_args, - "ignoreHTTPSErrors": True, - "userDataDir": tempfile.mkdtemp() - } - - if self.use_proxy and self.proxy: - browser_args.append(f'--proxy-server={self.proxy}') - - self.browser = 
await launch(launch_options) - self.page = await self.browser.newPage() - - # Set user agent - await self.page.setUserAgent(get_random_user_agent()) - - # Set viewport - await self.page.setViewport({"width": 1920, "height": 1080}) - - # Apply stealth features - if self.stealth: - await self.page.evaluateOnNewDocument(""" - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) + + // Change navigator properties + const newProto = navigator.__proto__; + delete newProto.webdriver; + + // Overwrite the plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) }); - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - window.chrome = { runtime: {} }; - """) - - return self.page - - async def setup_splash(self): - """Setup Splash browser through API""" - # Splash is typically used via HTTP API - # We'll use requests for this - self.splash_url = "http://localhost:8050/render.html" - return None # No actual page object for Splash - - async def goto(self, url, wait_until=None, timeout=30000): - """Navigate to a URL""" - if self.engine == "playwright": - return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout) - elif self.engine == "pyppeteer": - return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout) - elif self.engine == "splash": - # Use Splash HTTP API - params = { - "url": url, - "wait": min(timeout/1000, 30), # Splash uses seconds - "timeout": min(timeout/1000, 60), - "resource_timeout": min(timeout/1000, 30), - "html": 1, - "png": 0, - "render_all": 1 + + // Handle languages more naturally + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + + // Modify hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 4 + }); + + // Modify deviceMemory + Object.defineProperty(navigator, 'deviceMemory', { + get: () => 8 + }); + + // WebGL modifications + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) { + return 'Intel Inc.'; + } + if (parameter === 37446) { + return 'Intel Iris OpenGL Engine'; + } + return getParameter.apply(this, arguments); + }; } - - if self.use_proxy and self.proxy: - params["proxy"] = self.proxy - - headers = {"User-Agent": get_random_user_agent()} - response = requests.get(self.splash_url, params=params, headers=headers) - self.last_html = response.text - return response - - async def content(self): - """Get page content""" - if self.engine == "playwright": - return await self.page.content() - elif self.engine == "pyppeteer": - return await self.page.content() - elif self.engine == "splash": - return self.last_html - - async def close(self): - """Close browser""" - if self.engine == "playwright": - if self.browser: - await self.browser.close() - if self.playwright: - await self.playwright.stop() - elif self.engine == "pyppeteer": - if self.browser: - await self.browser.close() - # No cleanup needed for Splash as it's stateless - -# -------------------- Download Manager Class -------------------- -class DownloadManager: - def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True): - self.browser_engine = browser_engine 
- self.use_proxy = use_proxy - self.proxy = proxy - self.query = query - self.num_results = num_results - self.use_stealth = use_stealth - self.browser = None - self.network_interceptor = None + """) + else: + # Regular context without stealth + self.context = await self.browser.new_context(**context_opts) - # Configure network interception if enabled - if NETWORK_INTERCEPTOR_CONFIG["enabled"]: - self.network_interceptor = NetworkInterceptor( - intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"], - save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"] - ) - - async def __aenter__(self): - # Initialize multi-engine browser - self.browser = MultiEngineBrowser( - engine=self.browser_engine, - use_proxy=self.use_proxy, - proxy=self.proxy, - stealth=self.use_stealth - ) - self.page = await self.browser.setup() + # Create page with enhanced headers + self.page = await self.context.new_page() + await self.page.set_extra_http_headers({ + 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Cache-Control': 'max-age=0', + 'DNT': '1', # Do Not Track + 'Referer': 'https://www.google.com/', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-User': '?1', + 'Upgrade-Insecure-Requests': '1' + }) - # Set headers for better stealth - if self.browser_engine == "playwright": - await self.page.set_extra_http_headers({ - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', - 'Referer': 'https://www.google.com/', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'cross-site', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1' - }) + # Add delay for mouse movements to simulate human behavior + if self.use_stealth: + await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) + await self.page.wait_for_timeout(random.randint(200, 500)) return self async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.browser.close() + if self.browser: + await self.browser.close() + if self.playwright: + await self.playwright.stop() + + async def rotate_proxy_if_needed(self): + """Rotate proxy if proxy rotation is enabled and threshold is reached""" + if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: + self.request_count += 1 + if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: + # Get next proxy from the pool + next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) + PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list + + # Close existing context and create new one with the new proxy + if self.context: + await self.context.close() + + # Create new context with the new proxy + context_opts = { + "user_agent": get_random_user_agent(), + "proxy": {"server": next_proxy}, + "accept_downloads": True + } + self.context = await self.browser.new_context(**context_opts) + self.page = await self.context.new_page() + + # Reset counter + self.request_count = 0 + logger.info(f"Rotated to new proxy: {next_proxy}") + + async def handle_captcha(self, page): + """Detect and handle captchas if possible""" + # Check for common captcha patterns + content = await page.content() + if detect_captcha(content): + self.captcha_detected = True + logger.warning("Captcha detected on page") + + # Strategies for handling captchas: + # 1. 
For simple captchas, try to extract the image and solve it + captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') + if captcha_img: + logger.info("Found captcha image, attempting to capture") + + # Take screenshot of the captcha + captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") + await captcha_img.screenshot(path=captcha_path) + + # In a real implementation, you would send this to a captcha solving service + # For now, just log the detection + logger.info(f"Captcha image saved to {captcha_path}") + + # For demonstration, we'll notify the user but not actually solve it + return False + + # 2. For reCAPTCHA, special handling would be required + recaptcha = await page.query_selector('iframe[src*="recaptcha"]') + if recaptcha: + logger.warning("reCAPTCHA detected, would require external solving service") + return False + + # 3. Try to perform human-like actions that might bypass simple bot checks + await self.perform_human_actions(page) + + # Check if captcha is still present + content = await page.content() + if detect_captcha(content): + logger.warning("Captcha still present after human-like actions") + return False + else: + logger.info("Captcha appears to be resolved") + return True + + return True # No captcha detected + + async def perform_human_actions(self, page): + """Perform human-like actions on the page to possibly bypass simple bot checks""" + try: + # 1. Slowly scroll down the page + for i in range(3): + await page.evaluate(f"window.scrollTo(0, {i * 300})") + await page.wait_for_timeout(random.randint(300, 700)) + + # 2. Random mouse movements + for _ in range(3): + x = random.randint(100, 800) + y = random.randint(100, 600) + await page.mouse.move(x=x, y=y) + await page.wait_for_timeout(random.randint(200, 500)) + + # 3. Click on a non-essential part of the page + try: + await page.click("body", position={"x": 50, "y": 50}) + except: + pass + + # 4. 
Wait a bit before continuing + await page.wait_for_timeout(1000) + + except Exception as e: + logger.warning(f"Error during human-like actions: {e}") - async def search_web(self, search_engine="bing"): - """Search web using specified search engine""" + async def search_bing(self): urls = [] try: - if search_engine == "bing": - search_url = f"https://www.bing.com/search?q={self.query}" - elif search_engine == "google": - search_url = f"https://www.google.com/search?q={self.query}" - else: - raise ValueError(f"Unsupported search engine: {search_engine}") - - await self.browser.goto(search_url, timeout=30000) - - if self.browser_engine == "playwright": - if search_engine == "bing": - links = await self.page.query_selector_all("li.b_algo h2 a") - for link in links[:self.num_results]: - href = await link.get_attribute('href') - if href: - urls.append(href) - elif search_engine == "google": - links = await self.page.query_selector_all("div.g a[href^='http']") - for link in links[:self.num_results]: - href = await link.get_attribute('href') - if href: - urls.append(href) - elif self.browser_engine == "pyppeteer": - if search_engine == "bing": - links = await self.page.querySelectorAll("li.b_algo h2 a") - for link in links[:self.num_results]: - href = await self.page.evaluate('el => el.getAttribute("href")', link) - if href: - urls.append(href) - elif search_engine == "google": - links = await self.page.querySelectorAll("div.g a[href^='http']") - for link in links[:self.num_results]: - href = await self.page.evaluate('el => el.getAttribute("href")', link) - if href: - urls.append(href) - elif self.browser_engine == "splash": - # Parse the HTML with BeautifulSoup - soup = BeautifulSoup(self.browser.last_html, 'html.parser') - if search_engine == "bing": - links = soup.select("li.b_algo h2 a") - for link in links[:self.num_results]: - href = link.get("href") - if href: - urls.append(href) - elif search_engine == "google": - links = soup.select("div.g a[href^='http']") - for link in links[:self.num_results]: - href = link.get("href") - if href: - urls.append(href) + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + search_url = f"https://www.bing.com/search?q={self.query}" + await self.page.goto(search_url, timeout=30000) + await self.page.wait_for_load_state('networkidle') + + # Check for captchas + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected during search, results may be limited") + + # More natural scrolling behavior + for i in range(3): + await self.page.evaluate(f"window.scrollTo(0, {i * 400})") + await self.page.wait_for_timeout(random.randint(300, 800)) + + # Extract search results + links = await self.page.query_selector_all("li.b_algo h2 a") + for link in links[:self.num_results]: + href = await link.get_attribute('href') + if href: + urls.append(href) + + # If we didn't find enough results, try an alternative selector + if len(urls) < self.num_results: + alt_links = await self.page.query_selector_all(".b_caption a") + for link in alt_links: + href = await link.get_attribute('href') + if href and href not in urls: + urls.append(href) + if len(urls) >= self.num_results: + break return urls except Exception as e: - logger.error(f"Error searching web: {e}") + logger.error(f"Error searching Bing: {e}") return [] async def get_file_size(self, url): try: - headers = {'User-Agent': get_random_user_agent()} - response = requests.head(url, headers=headers, timeout=15) - length = response.headers.get('Content-Length', None) - if length: - return 
sizeof_fmt(int(length)) - else: - return "Unknown Size" - except Exception: + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + response = await page.request.head(url, timeout=15000) + length = response.headers.get('Content-Length', None) + if length: + return sizeof_fmt(int(length)) + else: + return "Unknown Size" + except Exception as e: + logger.warning(f"Error getting file size: {e}") return "Unknown Size" async def get_pdf_metadata(self, url): try: - headers = {'User-Agent': get_random_user_agent()} - response = requests.get(url, headers=headers, timeout=15, stream=True) - if response.status_code == 200: - content = BytesIO(response.content) - reader = PdfReader(content) - return { - 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', - 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', - 'Pages': len(reader.pages), - } - else: - return {} - except Exception: + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + resp = await page.request.get(url, timeout=15000) + if resp.ok: + content = await resp.body() + pdf = BytesIO(content) + reader = PdfReader(pdf) + return { + 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', + 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', + 'Pages': len(reader.pages), + } + else: + return {} + except Exception as e: + logger.warning(f"Error reading PDF metadata: {e}") return {} async def extract_real_download_url(self, url): try: - headers = {'User-Agent': get_random_user_agent()} - response = requests.head(url, headers=headers, timeout=15, allow_redirects=True) - return response.url + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + response = await page.goto(url, wait_until='networkidle', timeout=30000) + if response and response.headers.get('location'): + return response.headers['location'] + return page.url except Exception as e: logger.error(f"Error extracting real download URL: {e}") return url + # IMPROVED: Enhanced exam links extraction method async def get_edu_exam_links(self, url): """Specialized method for educational exam websites that follows a common pattern.""" try: logger.info(f"Fetching exam links from {url}") links = set() - # First try with direct requests for speed - headers = {"User-Agent": get_random_user_agent()} + # First try with direct requests for speed (but with proper headers) + headers = { + "User-Agent": get_random_user_agent(), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Referer": "https://www.google.com/", + "DNT": "1" + } + try: response = requests.get(url, headers=headers, timeout=30) if response.status_code == 200: - # Parse with BeautifulSoup for efficiency + # Parse with BeautifulSoup first for efficiency soup = BeautifulSoup(response.text, "html.parser") parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" @@ -696,63 +609,239 @@ class DownloadManager: "view", "open", "get", "solution", "answer" ] - # Check URL and text patterns - if any(pattern in full_url.lower() for pattern in url_patterns) or \ - any(pattern in link_text for pattern in text_patterns) or \ - any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + # Check URL for patterns + if any(pattern in full_url.lower() for pattern in url_patterns): + links.add(full_url) 
+ continue + + # Check link text for patterns + if any(pattern in link_text for pattern in text_patterns): + links.add(full_url) + continue + + # Check for common file extensions + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): links.add(full_url) except Exception as e: logger.warning(f"Request-based extraction failed: {e}") - # Use browser-based approach if needed - if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url: - logger.info("Using browser for enhanced link extraction") - - # Navigate to the page - await self.browser.goto(url, timeout=45000) - - # Get page content and parse with BeautifulSoup - content = await self.browser.content() - soup = BeautifulSoup(content, "html.parser") - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - - # Process all links on the page - for a in soup.find_all("a", href=True): - href = a["href"] - full_url = urljoin(url, href) - link_text = a.get_text().lower() - - # Apply the same filtering criteria - url_patterns = [ - "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", - "/test/", "/download/", "/files/", "/assignments/", - "paper_", "question_", "exam_", "test_", "past_", - "assignment_", "sample_", "study_material", "notes_", - "/resource/", "/subject/", "/course/", "/material/" - ] - - text_patterns = [ - "exam", "paper", "test", "question", "past", "download", - "assignment", "sample", "study", "material", "notes", - "subject", "course", "resource", "pdf", "document", - "view", "open", "get", "solution", "answer" - ] - - # Check URL and text patterns - if any(pattern in full_url.lower() for pattern in url_patterns) or \ - any(pattern in link_text for pattern in text_patterns) or \ - any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(full_url) + # Browser-based approach for more thorough extraction or if initial approach was inadequate + try: + # Check if we need to proceed with browser-based extraction + if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url: + logger.info("Using browser for enhanced link extraction") + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Navigate to the page with more natural timing + await self.page.goto(url, timeout=45000, wait_until='networkidle') + await self.page.wait_for_timeout(random.randint(1000, 2000)) + + # Handle captchas if present + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected, extraction may be limited") + + # Get base URL for resolving relative links + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + + # Perform natural scrolling to trigger lazy-loaded content + page_height = await self.page.evaluate("document.body.scrollHeight") + viewport_height = await self.page.evaluate("window.innerHeight") + + for scroll_pos in range(0, page_height, viewport_height // 2): + await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") + await self.page.wait_for_timeout(random.randint(300, 800)) + + # Scroll back to top + await self.page.evaluate("window.scrollTo(0, 0)") + await self.page.wait_for_timeout(500) + + # Extract all links with Playwright (better than just anchor tags) + all_links = await self.page.evaluate(""" + () => { + const results = []; + + // Get all anchor tags + const anchors = document.querySelectorAll('a[href]'); + for (const a of anchors) { + if (a.href) { + results.push({ + href: a.href, + text: 
a.innerText || a.textContent || '', + isButton: a.classList.contains('btn') || a.role === 'button' + }); + } + } + + // Get buttons that might contain links + const buttons = document.querySelectorAll('button'); + for (const btn of buttons) { + const onclick = btn.getAttribute('onclick') || ''; + if (onclick.includes('window.location') || onclick.includes('download')) { + results.push({ + href: '#button', + text: btn.innerText || btn.textContent || '', + isButton: true, + onclick: onclick + }); + } + } + + return results; + } + """) + + # Process the extracted links + for link_info in all_links: + href = link_info.get('href', '') + text = link_info.get('text', '').lower() + + if href and href != '#button': + # Check URL patterns + url_patterns = [ + "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", + "/test/", "/download/", "/files/", "/assignments/", + "paper_", "question_", "exam_", "test_", "past_", + "assignment_", "sample_", "study_material", "notes_" + ] + + # Check text patterns + text_patterns = [ + "exam", "paper", "test", "question", "past", "download", + "assignment", "sample", "study", "material", "notes", + "pdf", "document", "view", "open", "solution" + ] + + if any(pattern in href.lower() for pattern in url_patterns) or \ + any(pattern in text for pattern in text_patterns) or \ + any(href.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(href) + + # Check for ASP.NET specific elements that might contain exam links + grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') + for grid in grid_elements: + grid_links = await grid.query_selector_all('a[href]') + for a in grid_links: + href = await a.get_attribute('href') + text = await a.text_content() + + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + links.add(full_url) + + # Try clicking pagination controls to reveal more content + pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') + for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons + try: + # Check if this is a numeric pagination button (more likely to be useful) + button_text = await button.text_content() + if button_text and button_text.strip().isdigit(): + logger.info(f"Clicking pagination button: {button_text}") + await button.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Extract links from this page + new_page_links = await self.page.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); + } + """) + + for href in new_page_links: + if href and not href.startswith('javascript:'): + if any(pattern in href.lower() for pattern in url_patterns) or \ + any(href.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(href) + except Exception as e: + logger.warning(f"Error clicking pagination button: {e}") + + # Try clicking any controls that might reveal more exam links (more focused approach) + show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') + for button in show_buttons: + button_text = (await button.text_content() or "").lower() + button_value = (await button.get_attribute("value") or "").lower() + button_id = (await button.get_attribute("id") or "").lower() + + # Look for buttons that seem likely to 
reveal file lists + promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", + "download", "resource", "material", "browse", "file"] + + if any(term in button_text or term in button_value or term in button_id + for term in promising_terms): + try: + logger.info(f"Clicking button: {button_text or button_value}") + await button.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Get any new links that appeared + new_links = await self.page.query_selector_all('a[href]') + for a in new_links: + href = await a.get_attribute('href') + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + + # Focus on file extensions and patterns + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ + any(pattern in full_url.lower() for pattern in url_patterns): + links.add(full_url) + except Exception as e: + logger.warning(f"Error clicking button: {e}") + + # Special handling for ASP.NET PostBack links + try: + # Find and interact with ASP.NET __doPostBack elements + postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') + for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks + try: + onclick = await element.get_attribute('onclick') + if onclick and '__doPostBack' in onclick: + element_text = await element.text_content() + + # Only interact with elements that seem likely to contain exam links + promising_terms = ["show", "view", "list", "exam", "paper", "test", + "download", "resource", "material"] + + if any(term in element_text.lower() for term in promising_terms): + logger.info(f"Clicking ASP.NET postback element: {element_text}") + + # Click the element + await element.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Extract any new links + new_links = await self.page.query_selector_all('a[href]') + for a in new_links: + href = await a.get_attribute('href') + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(full_url) + except Exception as e: + logger.warning(f"Error interacting with postback element: {e}") + except Exception as e: + logger.warning(f"Error during postback handling: {e}") + + except Exception as e: + logger.error(f"Browser-based extraction failed: {e}") - # Filter to likely exam documents + # Filter links to likely contain exam documents filtered_links = [] for link in links: - # Common file extensions + # Common file extensions for exam documents if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): filtered_links.append(link) continue - + # Common paths for exam documents if any(pattern in link.lower() for pattern in [ "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", @@ -771,9 +860,12 @@ class DownloadManager: async def extract_downloadable_files(self, url, custom_ext_list): found_files = [] try: + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + # Special handling for educational exam sites if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): + ["exam", "test", "pastpaper", "eduexp"]): logger.info("Using specialized handler for educational exam site") # Get 
direct links to exam files @@ -812,54 +904,102 @@ class DownloadManager: 'url': real_url, 'filename': filename, 'size': size_str, - 'metadata': meta, - 'source_url': url # Add source URL for better tracking + 'metadata': meta }) # If we found exam files with the specialized method, return them if found_files: return found_files - # Standard extraction method for all pages - await self.browser.goto(url, timeout=30000) + # Standard extraction method if specialized method didn't find files + response = await self.page.goto(url, timeout=30000, wait_until='networkidle') + if not response: + return [] - # Get page content - content = await self.browser.content() + # Check for captchas + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected, file extraction may be limited") + + # Scroll through the page naturally to trigger lazy loading + await self.page.evaluate(""" + (async () => { + const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const scrollStep = Math.floor(window.innerHeight / 2); + + for (let i = 0; i < height; i += scrollStep) { + window.scrollTo(0, i); + await delay(100); + } + + window.scrollTo(0, 0); + })() + """) + await self.page.wait_for_timeout(1000) + + final_url = self.page.url + if '.php' in final_url or 'download' in final_url: + real_url = await self.extract_real_download_url(final_url) + if real_url != final_url: + # Try to detect the filename from headers or URL + response = await self.page.request.head(real_url, timeout=15000) + filename = None + + # Try to get from Content-Disposition header + content_disposition = response.headers.get('Content-Disposition', '') + if 'filename=' in content_disposition: + filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) + if filename_match: + filename = filename_match.group(1) + + # If not found in headers, use URL basename + if not filename: + filename = os.path.basename(urlparse(real_url).path) + if not filename or filename == '/': + # Generate a name based on domain + domain = get_domain(real_url) + ext = get_file_extension(real_url, '.pdf') + filename = f"file_from_{domain}{ext}" + + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': await self.get_file_size(real_url), + 'metadata': {} + }) + return found_files + + await self.page.wait_for_load_state('networkidle', timeout=30000) + content = await self.page.content() soup = BeautifulSoup(content, 'html.parser') - # Define file extensions to look for default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', - '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', - '.pptx', '.odt', '.txt'] + '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', + '.pptx', '.odt', '.txt'] all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) - # Get base URL for resolving relative links - parsed_base = urlparse(url) + parsed_base = urlparse(final_url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" path_base = os.path.dirname(parsed_base.path) - # Process all anchor tags for file links + # Process all anchor tags for a in soup.find_all('a', href=True): href = a['href'].strip() - # Handle PHP and download links separately if '.php' in href.lower() or 'download' in href.lower(): - full_url = href if href.startswith('http') else urljoin(base_url, href) + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) real_url = await 
self.extract_real_download_url(full_url) if real_url and real_url != full_url: - filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file' found_files.append({ 'url': real_url, - 'filename': filename, + 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', 'size': await self.get_file_size(real_url), - 'metadata': {}, - 'source_url': url + 'metadata': {} }) continue - - # Check for direct file extensions + if any(href.lower().endswith(ext) for ext in all_exts): - file_url = href if href.startswith('http') else urljoin(base_url, href) + file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) size_str = await self.get_file_size(file_url) meta = {} if file_url.lower().endswith('.pdf'): @@ -868,10 +1008,9 @@ class DownloadManager: 'url': file_url, 'filename': os.path.basename(file_url.split('?')[0]), 'size': size_str, - 'metadata': meta, - 'source_url': url + 'metadata': meta }) - + # Handle Google Drive links elif ("drive.google.com" in href) or ("docs.google.com" in href): file_id = None @@ -880,131 +1019,299 @@ class DownloadManager: if match: file_id = match.group(1) break - if file_id: - # Determine if it's a view-only file - is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")) + # Get file info to determine type and view-only status + file_type, is_view_only = await self.get_google_drive_file_info(file_id) + # Create a more informative filename based on info filename = f"gdrive_{file_id}" - ext = get_file_extension(href, '.pdf') - if ext != '.': - filename += ext + if file_type: + filename = f"{filename}.{file_type}" + + size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") found_files.append({ - 'url': href, + 'url': href, # Use original URL 'filename': filename, - 'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"), + 'size': size_str, 'metadata': { 'view_only': is_view_only, + 'file_type': file_type, 'file_id': file_id - }, - 'source_url': url + } }) - # Check for embedded content (iframe, embed, object) - for elem_tag in ['iframe', 'embed', 'object', 'source']: - for elem in soup.find_all(elem_tag): - src = elem.get('src') or elem.get('data') - if src and any(src.lower().endswith(ext) for ext in all_exts): - file_url = src if src.startswith('http') else urljoin(base_url, src) + # Also check for files in other elements (iframe, embed, object, etc.) 
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) + for elem in other_elements: + src = elem.get('src') or elem.get('data') + if src and any(src.lower().endswith(ext) for ext in all_exts): + file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) + size_str = await self.get_file_size(file_url) + meta = {} + if file_url.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(file_url) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': size_str, + 'metadata': meta + }) + + # Check for file links in onclick attributes + onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') + for elem in onclick_elements: + onclick = await elem.get_attribute('onclick') + urls = re.findall(r'(https?://[^\'"]+)', onclick) + for url_match in urls: + if any(url_match.lower().endswith(ext) for ext in all_exts): + size_str = await self.get_file_size(url_match) + meta = {} + if url_match.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(url_match) found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': await self.get_file_size(file_url), - 'metadata': {}, - 'source_url': url + 'url': url_match, + 'filename': os.path.basename(url_match.split('?')[0]), + 'size': size_str, + 'metadata': meta }) - # Deduplicate files + # Also check for data-src and data-url attributes (common in lazy-loaded sites) + data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') + for elem in data_elements: + for attr in ['data-src', 'data-url', 'data-href', 'data-download']: + try: + value = await elem.get_attribute(attr) + if value and any(value.lower().endswith(ext) for ext in all_exts): + file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': await self.get_file_size(file_url), + 'metadata': {} + }) + except: + pass + + # Check script tags for JSON data that might contain file URLs + script_elements = soup.find_all('script', type='application/json') + for script in script_elements: + try: + json_data = json.loads(script.string) + # Look for URL patterns in the JSON data + def extract_urls_from_json(obj, urls_found=None): + if urls_found is None: + urls_found = [] + if isinstance(obj, dict): + for k, v in obj.items(): + # Check if any key contains url-like terms + url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] + if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): + urls_found.append(v) + else: + extract_urls_from_json(v, urls_found) + elif isinstance(obj, list): + for item in obj: + extract_urls_from_json(item, urls_found) + return urls_found + + json_urls = extract_urls_from_json(json_data) + for json_url in json_urls: + if any(json_url.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': json_url, + 'filename': os.path.basename(json_url.split('?')[0]), + 'size': await self.get_file_size(json_url), + 'metadata': {} + }) + except: + pass + + # Check for hidden download buttons or forms + hidden_elements = await self.page.evaluate(""" + () => { + const results = []; + + // Check for hidden forms with download actions + const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); + 
for (const form of forms) { + const action = form.getAttribute('action') || ''; + results.push({ + type: 'form', + action: action, + inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { + return {name: input.name, value: input.value}; + }) + }); + } + + // Check for hidden download links/buttons + const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { + const style = window.getComputedStyle(a); + return (style.display === 'none' || style.visibility === 'hidden') && + (a.href.includes('download') || a.href.includes('file')); + }); + + for (const link of hiddenLinks) { + results.push({ + type: 'link', + href: link.href, + text: link.innerText || link.textContent + }); + } + + return results; + } + """) + + # Process hidden elements + for elem in hidden_elements: + if elem['type'] == 'link' and 'href' in elem: + href = elem['href'] + if any(href.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': href, + 'filename': os.path.basename(href.split('?')[0]), + 'size': await self.get_file_size(href), + 'metadata': {} + }) + + # Deduplicate files by URL seen_urls = set() unique_files = [] for f in found_files: if f['url'] not in seen_urls: seen_urls.add(f['url']) unique_files.append(f) - - return unique_files + return unique_files except Exception as e: logger.error(f"Error extracting files from {url}: {e}") + traceback.print_exc() return [] - async def download_file(self, file_info, save_dir, referer=None): - """Download a file and provide a direct download link""" + async def download_file(self, file_info, save_dir, referer): file_url = file_info['url'] fname = file_info['filename'] - referer = referer or file_info.get('source_url', 'https://www.google.com') - - # Create unique filename to avoid overwriting path = os.path.join(save_dir, fname) base, ext = os.path.splitext(fname) counter = 1 while os.path.exists(path): path = os.path.join(save_dir, f"{base}_{counter}{ext}") counter += 1 - os.makedirs(save_dir, exist_ok=True) try: # Special handling for Google Drive files if "drive.google.com" in file_url or "docs.google.com" in file_url: - # For view-only Google Drive files, use specialized method + # Check if it's marked as view-only in metadata is_view_only = file_info.get('metadata', {}).get('view_only', False) + + # For view-only files, try our most robust approach first if is_view_only: - result_path = await self.download_viewonly_google_drive(file_info, path) + logger.info(f"Attempting to download view-only file: {file_url}") + result_path = await self.force_download_viewonly(file_info, path) if result_path: return result_path + + # If that failed, try the regular download approach + logger.info("Primary method failed, trying fallback methods") - # Try standard Google Drive download - file_id = None - for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: - match = re.search(pattern, file_url) - if match: - file_id = match.group(1) - break + # Try regular download methods + success = await self.download_from_google_drive(file_url, path) + if success: + return path - if file_id: - # Try direct download - download_url = f"https://drive.google.com/uc?id={file_id}&export=download" - headers = { - 'User-Agent': get_random_user_agent(), - 'Referer': referer - } + # If all methods failed for Google Drive, try one last approach + logger.warning("All standard methods failed, attempting force download") + result_path = await self.force_download_viewonly(file_info, path) + return result_path if result_path else None 
+ + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Try with direct requests first (faster) + try: + headers = { + 'User-Agent': get_random_user_agent(), + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': referer, + 'DNT': '1' + } + + with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: + if response.status_code == 200: + # Check content type to verify it's not HTML/error page + content_type = response.headers.get('Content-Type', '') + if 'text/html' in content_type and not file_url.endswith('.html'): + logger.warning(f"Received HTML instead of expected file: {file_url}") + else: + with open(path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Verify file was downloaded correctly + if os.path.exists(path) and os.path.getsize(path) > 0: + return path + except Exception as e: + logger.warning(f"Direct download failed: {e}, trying browser approach") - with requests.get(download_url, headers=headers, stream=True) as r: - r.raise_for_status() + # Original code for non-Google Drive downloads using Playwright + async with self.context.new_page() as page: + headers = { + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': referer + } + + # Try to download with timeout protection + try: + response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) + if response.status == 200: + content = await response.body() with open(path, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) + f.write(content) + return path + else: + logger.error(f"Download failed with status {response.status}: {file_url}") + + # Try to extract error information + error_info = await response.text() + logger.debug(f"Error response: {error_info[:200]}...") + + # Check if this might be a captcha or login issue + if detect_captcha(error_info): + logger.warning("Captcha detected during download") + # For HF Spaces, we can't implement browser-based captcha solving here + # Just log the issue for now + except PlaywrightTimeoutError: + logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") + + # Try an alternative approach - using the browser's download manager + try: + logger.info("Trying browser download manager approach") + download_promise = page.wait_for_event("download") + await page.goto(file_url, timeout=60000) + + # Wait for download to start (with timeout) + download = await download_promise + await download.save_as(path) if os.path.exists(path) and os.path.getsize(path) > 0: return path - - # Standard file download - headers = { - 'User-Agent': get_random_user_agent(), - 'Referer': referer, - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br' - } - - with requests.get(file_url, headers=headers, stream=True) as r: - r.raise_for_status() - with open(path, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - - if os.path.exists(path) and os.path.getsize(path) > 0: - return path - else: - return None + except Exception as e: + logger.error(f"Browser download manager approach failed: {e}") + return None except Exception as e: logger.error(f"Error downloading {file_url}: {e}") return None - async def download_viewonly_google_drive(self, file_info, save_path): - """Download view-only Google Drive documents""" + # IMPROVED: Enhanced view-only document download method + async def force_download_viewonly(self, file_info, save_path): + """Completely rewritten method 
to handle view-only files reliably, especially multi-page PDFs""" try: # Extract file ID file_id = file_info.get('metadata', {}).get('file_id') @@ -1020,350 +1327,1353 @@ class DownloadManager: logger.error("Could not extract file ID") return None - # Determine file type - file_type = get_file_extension(file_info['url'], '.pdf').lstrip('.') - - # Ensure appropriate extension on save path + file_type = file_info.get('metadata', {}).get('file_type', 'pdf') base, ext = os.path.splitext(save_path) if not ext: save_path = f"{base}.{file_type}" - logger.info(f"Downloading view-only Google Drive file: {file_id}") - - # Create a dedicated browser session - if self.browser_engine == "playwright": - from playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch( - headless=True, - args=[ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-site-isolation-trials', - '--disable-blink-features=AutomationControlled' - ] - ) + logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") + + # Create a dedicated browser instance with better resolution and stealth + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-blink-features=AutomationControlled' # Anti-detection + ] + + browser = await self.playwright.chromium.launch( + headless=True, + args=browser_args + ) + + # Use higher resolution for better quality + context = await browser.new_context( + viewport={'width': 1600, 'height': 1200}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + device_scale_factor=2.0, + accept_downloads=True # Critical for the download workflow + ) + + # Add anti-detection script + await context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); - # Create context with options for better handling - context = await browser.new_context( - viewport={'width': 1600, 'height': 1200}, - user_agent=get_random_user_agent(), - accept_downloads=True, - ignore_https_errors=True - ) + // Change plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + // Handle languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + + // Modify hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 4 + }); + } + """) + + page = await context.new_page() + + try: + # Go to the file view page + logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) + await page.wait_for_load_state('networkidle') + + # Check for any barriers or permissions issues + content = await page.content() + if "the owner has not granted you permission to" in content: + logger.warning("Permission denied error detected") + + # Randomized wait to appear more human-like + await page.wait_for_timeout(random.randint(3000, 7000)) + + # Create temp directory + temp_dir = tempfile.mkdtemp() + + # Special handling for PDFs + if file_type.lower() == 'pdf': + # Use the 
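+                    # Drive's view-only viewer renders each PDF page lazily as a "blob:" image
+                    # element only once it has been scrolled into view. The loop below keeps
+                    # scrolling and counting those blob images until the count stops growing,
+                    # then a client-side jsPDF pass stitches the captured images back into a
+                    # single PDF (this relies on the jsPDF CDN script being reachable from the page).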
improved scrolling and detection approach + + # Perform some natural mouse movements and scrolling + await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400)) + await page.wait_for_timeout(random.randint(500, 1000)) - # Add stealth script - await context.add_init_script(""" - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) + # Estimate number of pages + estimated_pages = await page.evaluate(""" + () => { + // Method 1: Check page counter text + const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { + const text = el.textContent || ''; + return /\\d+\\s*\\/\\s*\\d+/.test(text); }); - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - window.chrome = { runtime: {} }; + + if (pageCounters.length > 0) { + const text = pageCounters[0].textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) return parseInt(match[2]); + } + + // Method 2: Check actual page elements + const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pageElements.length > 0) return pageElements.length; + + // Method 3: Look for page thumbnails + const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); + if (thumbnails.length > 0) return thumbnails.length; + + // Fallback: conservative guess + return 50; + } """) - page = await context.new_page() + logger.info(f"Estimated {estimated_pages} pages in PDF") - try: - # Visit the file - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) - await page.wait_for_load_state('networkidle') + # Initial scroll to trigger lazy loading + logger.info("Initial scroll to bottom to trigger lazy loading...") + await page.keyboard.press("End") + await page.wait_for_timeout(3000) + + # Scroll page by page to ensure all pages are loaded + logger.info("Scrolling page by page...") + max_attempts = min(estimated_pages * 3, 300) + attempt = 0 + prev_blob_count = 0 + + while attempt < max_attempts: + blob_count = await page.evaluate(""" + Array.from(document.getElementsByTagName('img')) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .length + """) - # Wait for content to load - await page.wait_for_timeout(5000) + logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") - # Create temporary directory for processing - temp_dir = tempfile.mkdtemp() + if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): + logger.info("All pages appear to be loaded.") + break - # For PDF handling - if file_type == 'pdf': - # Create directory for screenshots - screenshots_dir = os.path.join(temp_dir, "screenshots") - os.makedirs(screenshots_dir, exist_ok=True) + # Alternate between PageDown and End keys for more natural scrolling + if attempt % 3 == 0: + await page.keyboard.press("End") + else: + await page.keyboard.press("PageDown") - # Get page count - total_pages = await page.evaluate(""" - () => { - // Look for page counters in the interface - const pageCounters = document.querySelectorAll('*'); - for (const el of pageCounters) { - const text = el.textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) { - return parseInt(match[2]); + # Randomized wait times + await page.wait_for_timeout(random.randint(1500, 3000)) + + # Move mouse randomly to appear more human-like + if attempt 
% 4 == 0: + await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) + + prev_blob_count = blob_count + attempt += 1 + + # Extra wait to ensure everything is loaded + await page.wait_for_timeout(5000) + + # Set up download event listener for the PDF + download_promise = page.wait_for_event("download") + + # Use jsPDF to generate PDF from loaded pages + logger.info("Generating PDF from loaded pages...") + result = await page.evaluate(r''' + (function() { + return new Promise((resolve, reject) => { + let script = document.createElement("script"); + script.onload = function () { + try { + let pdf = new jsPDF(); + let imgs = Array.from(document.getElementsByTagName("img")) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .sort((a, b) => { + const rectA = a.getBoundingClientRect(); + const rectB = b.getBoundingClientRect(); + return rectA.top - rectB.top; + }); + + console.log(`Found ${imgs.length} valid page images to add to PDF`); + + let added = 0; + for (let i = 0; i < imgs.length; i++) { + let img = imgs[i]; + let canvas = document.createElement("canvas"); + let ctx = canvas.getContext("2d"); + canvas.width = img.width; + canvas.height = img.height; + ctx.drawImage(img, 0, 0, img.width, img.height); + let imgData = canvas.toDataURL("image/jpeg", 1.0); + + if (added > 0) { + pdf.addPage(); + } + + pdf.addImage(imgData, 'JPEG', 0, 0); + added++; } + + pdf.save("download.pdf"); + resolve({success: true, pageCount: added}); + } catch (error) { + reject({success: false, error: error.toString()}); } - - // Look for paginated pages - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pages.length > 0) return pages.length; - - // Default if we can't determine - return 20; - } - """) + }; + + script.onerror = function() { + reject({success: false, error: "Failed to load jsPDF library"}); + }; + + script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; + document.body.appendChild(script); + }); + })(); + ''') + + if not result.get('success', False): + logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") + + # Try fallback approach - screenshot method + logger.info("Trying fallback screenshot method...") + + # Navigate back to the first page + await page.evaluate(""" + () => { + // Find and click the "first page" button if available + const buttons = Array.from(document.querySelectorAll('button')); + const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); + if (firstPageBtn) firstPageBtn.click(); + } + """) + await page.wait_for_timeout(1000); + + # Create a PDF by taking screenshots of each page + screenshots = [] + current_page = 1 + max_pages = estimated_pages + + # Create a PDF using the reportlab package + while current_page <= max_pages: + screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") + + # Try to find the current page element + page_elem = await page.query_selector('.drive-viewer-paginated-page') + if page_elem: + await page_elem.screenshot(path=screenshot_path) + else: + # Fallback to full page screenshot + await page.screenshot(path=screenshot_path) + + screenshots.append(screenshot_path) + + # Try to navigate to next page + next_btn = await page.query_selector('button[aria-label="Next page"]') + if next_btn: + is_disabled = await next_btn.get_attribute('disabled') + if is_disabled: + logger.info(f"Reached end of document at page {current_page}") + break + + await next_btn.click() + await 
page.wait_for_timeout(1000) + current_page += 1 + else: + break + + # Create PDF from screenshots + if screenshots: + first_img = Image.open(screenshots[0]) + width, height = first_img.size - logger.info(f"PDF has approximately {total_pages} pages") + c = canvas.Canvas(save_path, pagesize=(width, height)) + for screenshot in screenshots: + img = Image.open(screenshot) + c.drawImage(screenshot, 0, 0, width, height) + c.showPage() + c.save() - # Take screenshots of each page - screenshots = [] + # Clean up screenshots + for screenshot in screenshots: + os.remove(screenshot) - # First try with the page element method - for i in range(min(total_pages, 100)): # Limit to 100 pages for safety - try: - # Navigate to specific page - if i > 0: - await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()") - await page.wait_for_timeout(500) + return save_path + + return None + + logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") + + # Wait for the download and save it + download = await download_promise + await download.save_as(save_path) + + # Clean up temp directory + try: + os.rmdir(temp_dir) + except: + pass + + else: + # Non-PDF file handling + screenshot_path = os.path.join(temp_dir, "file.png") + await page.screenshot(path=screenshot_path) + + if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: + # For document types, try to export directly + await self.export_google_doc(file_id, file_type, save_path) + else: + # For other types, save the screenshot with appropriate extension + shutil.copy(screenshot_path, save_path) + + os.remove(screenshot_path) + + # Close browser + await browser.close() + + # Verify file exists and has content + if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: + logger.info(f"Successfully downloaded file to {save_path}") + return save_path + else: + logger.error(f"Generated file is too small or missing: {save_path}") + return None + + except Exception as e: + logger.error(f"Error during force download: {e}") + if browser: + await browser.close() + return None + + except Exception as e: + logger.error(f"Force download preparation failed: {e}") + return None + + async def download_from_google_drive(self, url, save_path): + """Enhanced method to download from Google Drive with multiple fallback approaches""" + # Extract the file ID from different URL formats + file_id = None + url_patterns = [ + r'drive\.google\.com/file/d/([^/]+)', + r'drive\.google\.com/open\?id=([^&]+)', + r'docs\.google\.com/\w+/d/([^/]+)', + r'id=([^&]+)', + r'drive\.google\.com/uc\?id=([^&]+)', + ] + + for pattern in url_patterns: + match = re.search(pattern, url) + if match: + file_id = match.group(1) + break + + if not file_id: + logger.error(f"Could not extract file ID from URL: {url}") + return False + + # Determine file type first (important for handling different file types) + file_type, is_view_only = await self.get_google_drive_file_info(file_id) + logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") + + base, ext = os.path.splitext(save_path) + if not ext and file_type: + # Add the correct extension if missing + save_path = f"{base}.{file_type}" + + # For view-only files, use specialized approaches + if is_view_only: + # Approach 1: For PDFs, use the JS method + if file_type == 'pdf': + success = await self.download_viewonly_pdf_with_js(file_id, save_path) + if success: + return True + + # Approach 2: For Google Docs, Sheets, etc., use export API + if file_type in 
['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: + success = await self.export_google_doc(file_id, file_type, save_path) + if success: + return True + + # Approach 3: Try the direct screenshot method for any view-only file + success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type) + if success: + return True + + # Try standard approaches for non-view-only files + try: + # Try direct download link first (fastest) + direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" + + # Add anti-bot headers + headers = { + 'User-Agent': get_random_user_agent(), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://drive.google.com/', + 'DNT': '1' + } + + # Try with streaming to handle larger files + with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: + if r.status_code == 200: + # Check if we got HTML instead of the file + content_type = r.headers.get('Content-Type', '') + if 'text/html' in content_type and not file_id.endswith('.html'): + logger.warning("Received HTML instead of file, trying with session cookies") + else: + # Looks like we got the actual file + with open(save_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Verify file exists and has content + if os.path.exists(save_path) and os.path.getsize(save_path) > 0: + logger.info("Direct download successful") + return True + + # Try with requests and session cookies + session = requests.Session() + session.headers.update({'User-Agent': get_random_user_agent()}) + + # Visit the page first to get cookies + session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30) + + # Try download + url = f"https://drive.google.com/uc?id={file_id}&export=download" + response = session.get(url, stream=True, timeout=30) + + # Check for confirmation token + confirmation_token = None + for k, v in response.cookies.items(): + if k.startswith('download_warning'): + confirmation_token = v + break + + # Use confirmation token if found + if confirmation_token: + url = f"{url}&confirm={confirmation_token}" + response = session.get(url, stream=True, timeout=60) + + # Check if we're getting HTML instead of the file + content_type = response.headers.get('Content-Type', '') + if 'text/html' in content_type: + logger.warning("Received HTML instead of file - likely download restriction") + else: + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=1024*1024): + if chunk: + f.write(chunk) + + if os.path.exists(save_path) and os.path.getsize(save_path) > 0: + with open(save_path, 'rb') as f: + content = f.read(100) + if b'' not in content: + logger.info("Successfully downloaded with requests session") + return True + except Exception as e: + logger.warning(f"Requests session download failed: {e}") + + # Try browser-based approach as last resort + try: + async with self.context.new_page() as page: + # Visit the file view page first to get cookies + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) + await page.wait_for_timeout(3000) + + # Set up download event listener + download_promise = page.wait_for_event("download") + + # Try to trigger the download button click + download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') + if download_button: + await download_button.click() + + # Wait for download to start + try: + 
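+                        # The "download" listener was registered before the button click above,
+                        # so awaiting it yields a Playwright Download object as soon as the
+                        # browser starts receiving the file; save_as() then moves the file from
+                        # Playwright's temporary download location to save_path.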
download = await download_promise + await download.save_as(save_path) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + except Exception as e: + logger.error(f"Error during browser download: {e}") + return False + else: + # Try the export download URL + await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) + + # Look for and click any download buttons or links + download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') + for elem in download_elements: + try: + await elem.click() + # Wait a bit to see if download starts + try: + download = await download_promise + await download.save_as(save_path) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + except: + pass + except: + continue + except Exception as e: + logger.error(f"Browser-based download attempt failed: {e}") + + logger.warning("All standard download methods failed") + return False + + async def download_viewonly_pdf_with_js(self, file_id, save_path): + """Download view-only PDF using the enhanced blob image caching technique""" + try: + # Create a dedicated browser instance with stealth capabilities + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + '--disable-blink-features=AutomationControlled' # Anti-detection + ] + + browser = await self.playwright.chromium.launch( + headless=True, + args=browser_args + ) + + # Setup stealth context + context = await browser.new_context( + viewport={'width': 1600, 'height': 1200}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + accept_downloads=True, # Critical for handling the download event + ignore_https_errors=True + ) + + # Add stealth script + await context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Change plugins and languages to appear more human + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + } + """) + + page = await context.new_page() + + try: + # Step 1: Navigate to the file with human-like behavior + logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) + await page.wait_for_load_state('networkidle') + + # Perform human-like interactions + await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) + await page.wait_for_timeout(random.randint(2000, 5000)) + + # Step 2: Estimate the number of pages + estimated_pages = await page.evaluate(""" + () => { + // Look for page counter in the interface + const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { + const text = el.textContent || ''; + return /\\d+\\s*\\/\\s*\\d+/.test(text); + }); + + if (pageCounters.length > 0) { + const text = pageCounters[0].textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) return parseInt(match[2]); + } + + // If we can't find a counter, check actual pages + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pages.length > 0) return pages.length; + + // Default to a reasonable number if 
we can't determine + return 50; + } + """) + + logger.info(f"Estimated number of pages: {estimated_pages}") + + # Step 3: Initial scroll to trigger loading + logger.info("Initial scroll to bottom to trigger lazy loading...") + await page.keyboard.press("End") + await page.wait_for_timeout(3000) + + # Step 4: Wait for all pages to load with better feedback and randomization + logger.info("Scrolling through document to load all pages...") + max_attempts = min(estimated_pages * 3, 300) + attempt = 0 + prev_blob_count = 0 + consecutive_same_count = 0 + + while attempt < max_attempts: + # Count blob images (which are the PDF pages) + blob_count = await page.evaluate(""" + Array.from(document.getElementsByTagName('img')) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .length + """) + + logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") + + # Check if we've loaded all pages or if we're stuck + if blob_count >= estimated_pages: + logger.info(f"All {estimated_pages} pages appear to be loaded.") + break + + if blob_count == prev_blob_count: + consecutive_same_count += 1 + if consecutive_same_count >= 5 and blob_count > 0: + logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") + break + else: + consecutive_same_count = 0 + + # Mix up the scrolling approach for more human-like behavior + scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) + + if scroll_action == "PageDown": + await page.keyboard.press("PageDown") + elif scroll_action == "End": + await page.keyboard.press("End") + elif scroll_action == "ArrowDown": + # Press arrow down multiple times + for _ in range(random.randint(5, 15)): + await page.keyboard.press("ArrowDown") + await page.wait_for_timeout(random.randint(50, 150)) + else: # mouse + # Scroll using mouse wheel + current_y = random.randint(300, 700) + await page.mouse.move(x=random.randint(300, 800), y=current_y) + await page.mouse.wheel(0, random.randint(300, 800)) + + # Random wait between scrolls + await page.wait_for_timeout(random.randint(1000, 3000)) + + prev_blob_count = blob_count + attempt += 1 + + # Extra wait to ensure everything is fully loaded + await page.wait_for_timeout(5000) + + # Step 5: Set up a download event listener + download_promise = page.wait_for_event("download") + + # Step 6: Inject the jsPDF script to generate PDF + logger.info("Generating PDF from loaded pages...") + result = await page.evaluate(r''' + (function() { + return new Promise((resolve, reject) => { + let script = document.createElement("script"); + script.onload = function () { + try { + let pdf = new jsPDF(); + let imgs = document.getElementsByTagName("img"); + let validImages = []; - # Wait for the page to render - await page.wait_for_timeout(500) + // First collect all valid blob images + for (let i = 0; i < imgs.length; i++) { + let img = imgs[i]; + if (!/^blob:/.test(img.src)) continue; + if (img.width < 100 || img.height < 100) continue; + validImages.push(img); + } - # Take screenshot - screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") + // Sort by position in the document + validImages.sort((a, b) => { + const rectA = a.getBoundingClientRect(); + const rectB = b.getBoundingClientRect(); + return rectA.top - rectB.top; + }); - # Try to find the page element - page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})') - if page_element: - await page_element.screenshot(path=screenshot_path) - else: - # 
Fallback to viewport screenshot - await page.screenshot(path=screenshot_path) + console.log(`Found ${validImages.length} valid page images to add to PDF`); - screenshots.append(screenshot_path) + let added = 0; + // Process each image as a page + for (let i = 0; i < validImages.length; i++) { + let img = validImages[i]; + let canvas = document.createElement("canvas"); + let ctx = canvas.getContext("2d"); + canvas.width = img.width; + canvas.height = img.height; + ctx.drawImage(img, 0, 0, img.width, img.height); + let imgData = canvas.toDataURL("image/jpeg", 1.0); + + if (added > 0) { + pdf.addPage(); + } + + pdf.addImage(imgData, 'JPEG', 0, 0); + added++; + } - # Check if we should continue to next page - if i < total_pages - 1: - next_button = await page.query_selector('button[aria-label="Next page"]') - if next_button: - # Check if button is disabled - is_disabled = await next_button.get_attribute('disabled') - if is_disabled: - logger.info(f"Reached last page at page {i+1}") - break - - # Click next page - await next_button.click() - await page.wait_for_timeout(1000) - else: - logger.info("Next page button not found") - break - except Exception as e: - logger.error(f"Error capturing page {i+1}: {e}") - continue + pdf.save("download.pdf"); + resolve({success: true, pageCount: added}); + } catch (error) { + reject({success: false, error: error.toString()}); + } + }; - # Create PDF from screenshots - if screenshots: - # Get dimensions from first screenshot - first_img = Image.open(screenshots[0]) - width, height = first_img.size - - # Create PDF - c = canvas.Canvas(save_path, pagesize=(width, height)) - for screenshot in screenshots: - c.drawImage(screenshot, 0, 0, width, height) - c.showPage() - c.save() - - # Clean up screenshots - for screenshot in screenshots: - os.remove(screenshot) - - # Clean up temp directory - shutil.rmtree(temp_dir, ignore_errors=True) + script.onerror = function() { + reject({success: false, error: "Failed to load jsPDF library"}); + }; + + // Use a reliable CDN + script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; + document.body.appendChild(script); + }); + })(); + ''') + + if not result.get('success'): + logger.error(f"Error in PDF generation: {result.get('error')}") + return False + + logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") + + # Step 7: Wait for the download to complete and save the file + download = await download_promise + + # Step 8: Save the downloaded file to the specified path + await download.save_as(save_path) + logger.info(f"Successfully saved PDF to {save_path}") + + return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 + + finally: + await browser.close() + + except Exception as e: + logger.error(f"Error in viewonly PDF download process: {e}") + return False + + async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): + """Download any view-only file by taking screenshots""" + try: + async with self.context.new_page() as page: + # Set high-resolution viewport + await page.set_viewport_size({"width": 1600, "height": 1200}) + + # Navigate to the file + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) + + # Make sure the file is loaded + await page.wait_for_load_state('networkidle') + await page.wait_for_timeout(3000) # Extra time for rendering + + # Create directory for screenshots if multiple pages + base_dir = os.path.dirname(save_path) + base_name = 
os.path.splitext(os.path.basename(save_path))[0] + screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") + os.makedirs(screenshots_dir, exist_ok=True) + + # Check if it's a multi-page document + is_multi_page = await page.evaluate(""" + () => { + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + return pages.length > 1; + } + """) + + if is_multi_page and file_type == 'pdf': + # For multi-page PDFs, take screenshots of each page + page_count = await page.evaluate(""" + async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + const container = document.querySelector('.drive-viewer-paginated-scrollable'); + + if (!container || pages.length === 0) return 0; + + // Scroll through to make sure all pages are loaded + const scrollHeight = container.scrollHeight; + const viewportHeight = container.clientHeight; + const scrollStep = viewportHeight; + + for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { + container.scrollTo(0, scrollPos); + await delay(300); + } + + // Scroll back to top + container.scrollTo(0, 0); + await delay(300); + + return pages.length; + } + """) + + logger.info(f"Found {page_count} pages in document") + + # Take screenshots of each page + screenshots = [] + for i in range(page_count): + # Scroll to page + await page.evaluate(f""" + async () => {{ + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pages.length <= {i}) return false; - return save_path + pages[{i}].scrollIntoView(); + await delay(500); + return true; + }} + """) + + # Take screenshot + screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") + await page.screenshot(path=screenshot_path, clip={ + 'x': 0, + 'y': 0, + 'width': 1600, + 'height': 1200 + }) + screenshots.append(screenshot_path) + + # Combine screenshots into PDF + c = canvas.Canvas(save_path) + for screenshot in screenshots: + img = Image.open(screenshot) + width, height = img.size + + # Add page to PDF + c.setPageSize((width, height)) + c.drawImage(screenshot, 0, 0, width, height) + c.showPage() + + c.save() + + # Clean up screenshots + for screenshot in screenshots: + os.remove(screenshot) + os.rmdir(screenshots_dir) + + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + else: + # For single-page or non-PDF files, just take one screenshot + screenshot_path = os.path.join(screenshots_dir, "screenshot.png") + await page.screenshot(path=screenshot_path, fullPage=True) + + # Convert to requested format if needed + if file_type == 'pdf': + # Create PDF from screenshot + img = Image.open(screenshot_path) + width, height = img.size + + c = canvas.Canvas(save_path, pagesize=(width, height)) + c.drawImage(screenshot_path, 0, 0, width, height) + c.save() + else: + # Just copy the screenshot to the destination with proper extension + shutil.copy(screenshot_path, save_path) + + # Clean up + os.remove(screenshot_path) + os.rmdir(screenshots_dir) + + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + + except Exception as e: + logger.error(f"Error taking screenshots: {e}") + return False + + async def export_google_doc(self, file_id, file_type, save_path): + """Export Google Docs/Sheets/Slides to downloadable formats""" + try: + # Map file types to export formats + export_formats = { + 'doc': 
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx + 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx + 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx + 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'pdf': 'application/pdf', + } + + export_format = export_formats.get(file_type, 'application/pdf') + export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" + + if 'sheet' in file_type or 'xlsx' in file_type: + export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" + elif 'ppt' in file_type or 'presentation' in file_type: + export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" + elif file_type == 'pdf': + export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" + + async with self.context.new_page() as page: + # Get cookies from the main view page first + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') + + # Now try the export + response = await page.goto(export_url, wait_until='networkidle') + + if response.status == 200: + content = await response.body() + with open(save_path, 'wb') as f: + f.write(content) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + else: + logger.warning(f"Export failed with status {response.status}") + return False + + except Exception as e: + logger.error(f"Error exporting Google Doc: {e}") + return False + + async def get_google_drive_file_info(self, file_id): + """Get file type and view-only status from Google Drive""" + file_type = None + is_view_only = False + + try: + async with self.context.new_page() as page: + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) + + # Check if view-only + view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') + is_view_only = view_only_text is not None + + # Check for Google Docs viewer + gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') + gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') + gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') + + if gdocs_viewer: + file_type = 'docx' + elif gsheets_viewer: + file_type = 'xlsx' + elif gslides_viewer: + file_type = 'pptx' + else: + # Check for PDF viewer + pdf_viewer = await page.query_selector('embed[type="application/pdf"]') + if pdf_viewer: + file_type = 'pdf' + else: + # Check for image viewer + img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') + if img_viewer: + # Get image type from src + img_src = await img_viewer.get_attribute('src') + if 'jpg' in img_src or 'jpeg' in img_src: + file_type = 'jpg' + elif 'png' in img_src: + file_type = 'png' else: - logger.error("No screenshots captured") + file_type = 'jpg' # Default to jpg else: - # For non-PDF files, just take a screenshot - screenshot_path = os.path.join(temp_dir, "file.png") - await page.screenshot(path=screenshot_path) - - # Copy to destination - shutil.copy(screenshot_path, save_path) - - # Clean up - os.remove(screenshot_path) - shutil.rmtree(temp_dir, ignore_errors=True) - - return save_path - finally: - await 
browser.close() - elif self.browser_engine == "pyppeteer": - # Similar implementation for Pyppeteer - pass - - return None + # Generic file type fallback + file_type = 'pdf' # Default to PDF + + # If still no type, check filename + if not file_type: + title_element = await page.query_selector('div[role="heading"]') + if title_element: + title = await title_element.text_content() + if title: + ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) + if ext_match: + file_type = ext_match.group(1).lower() + except Exception as e: - logger.error(f"Error downloading view-only file: {e}") - return None + logger.error(f"Error getting Google Drive file info: {e}") + file_type = 'pdf' # Default to PDF if we can't determine + + return file_type, is_view_only + # IMPROVED: Enhanced sublink extraction method async def get_sublinks(self, url, limit=10000): - """Extract all sublinks from a website""" + """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" links = set() try: - logger.info(f"Extracting sublinks from {url}") + logger.info(f"Fetching sublinks from: {url}") - # Special handling for educational sites + # Special handling for educational sites like phsms.cloud.ncnu.edu.tw if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): + ["exam", "test", "pastpaper", "eduexp"]): + logger.info("Using specialized exam site sublink extraction") edu_links = await self.get_edu_exam_links(url) for link in edu_links: links.add(link) + # If we found a good number of links with the specialized method, return them if len(links) > 5: logger.info(f"Found {len(links)} sublinks with specialized method") return list(links)[:limit] - # Standard link extraction for all sites - await self.browser.goto(url, timeout=30000) + # Rotate proxy if needed + await self.rotate_proxy_if_needed() - # Get page content - content = await self.browser.content() - soup = BeautifulSoup(content, 'html.parser') + # Standard sublink extraction for all sites + await self.page.goto(url, timeout=30000, wait_until='networkidle') # Get base URL for resolving relative links parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + path_base = os.path.dirname(parsed_base.path) - # Extract all links from the page - for a in soup.find_all('a', href=True): - href = a['href'] - if href and not href.startswith('javascript:') and not href.startswith('#'): - # Resolve relative URLs - if href.startswith('/'): - full_url = f"{base_url}{href}" - elif href.startswith('http'): - full_url = href - else: - full_url = urljoin(url, href) + # Perform initial scrolling to load lazy content + await self.page.evaluate(""" + async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const step = Math.floor(window.innerHeight / 2); + + for (let i = 0; i < height; i += step) { + window.scrollTo(0, i); + await delay(150); + } - links.add(full_url) + window.scrollTo(0, 0); + } + """) + await self.page.wait_for_timeout(1000) - # Extract iframe sources - for iframe in soup.find_all('iframe', src=True): - src = iframe['src'] - if src and not src.startswith('javascript:') and not src.startswith('about:'): - full_url = src if src.startswith('http') else urljoin(url, src) - links.add(full_url) + # Check if page has ASP.NET elements which might need special handling + is_aspnet = await self.page.evaluate(''' + () => { + return 
document.querySelector('form#aspnetForm') !== null || + document.querySelector('input[name="__VIEWSTATE"]') !== null; + } + ''') - return list(links)[:limit] - except Exception as e: - logger.error(f"Error extracting sublinks: {e}") - return list(links)[:limit] - - @celery_app.task - def download_file_task(file_info, save_dir, referer=None): - """Celery task for downloading files asynchronously""" - # This function runs in a separate worker process - file_url = file_info['url'] - fname = file_info['filename'] - referer = referer or file_info.get('source_url', 'https://www.google.com') - - # Create unique filename - path = os.path.join(save_dir, fname) - base, ext = os.path.splitext(fname) - counter = 1 - while os.path.exists(path): - path = os.path.join(save_dir, f"{base}_{counter}{ext}") - counter += 1 - - os.makedirs(save_dir, exist_ok=True) - - try: - # Handle Google Drive files - if "drive.google.com" in file_url or "docs.google.com" in file_url: - # Extract file ID - file_id = None - for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: - match = re.search(pattern, file_url) - if match: - file_id = match.group(1) - break + if is_aspnet: + logger.info("Detected ASP.NET page, using enhanced extraction method") + + # Try to interact with ASP.NET controls that might reveal more links + # Look for dropdowns, buttons, and grid elements + dropdowns = await self.page.query_selector_all('select') + buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') + + # Try interacting with dropdowns first + for dropdown in dropdowns: + try: + # Get all options + options = await self.page.evaluate(''' + (dropdown) => { + return Array.from(dropdown.options).map(o => o.value); + } + ''', dropdown) + + # Try selecting each option + for option in options: + if option: + await dropdown.select_option(value=option) + await self.page.wait_for_timeout(1000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error interacting with dropdown: {e}") + + # Try clicking buttons (but avoid dangerous ones like "delete") + safe_buttons = [] + for button in buttons: + button_text = await button.text_content() or "" + button_value = await button.get_attribute("value") or "" + button_id = await button.get_attribute("id") or "" + combined_text = (button_text + button_value + button_id).lower() + + # Skip potentially destructive buttons + if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): + continue + + # Prioritize buttons that might show more content + if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): + safe_buttons.append(button) - if file_id: - # Try direct download - download_url = f"https://drive.google.com/uc?id={file_id}&export=download" - headers = { - 'User-Agent': get_random_user_agent(), - 'Referer': referer + # Click the safe buttons + for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks + try: + await button.click() + await self.page.wait_for_timeout(1000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error clicking button: {e}") + + # Extract links from the initial page state + await 
self.extract_all_link_types(links, base_url, path_base) + + # Look specifically for links inside grid/table views which are common in ASP.NET applications + grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') + for cell in grid_cells: + try: + href = await cell.get_attribute('href') + if href: + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links.add(full_url) + except Exception as e: + logger.warning(f"Error extracting grid link: {e}") + + # Extract links from onclick attributes and javascript:__doPostBack calls + postback_links = await self.page.evaluate(''' + () => { + const results = []; + // Find elements with onclick containing __doPostBack + const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); + for (const el of elements) { + // Extract the postback target + const onclick = el.getAttribute('onclick') || ''; + const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); + if (match && match[1]) { + // Get the visible text to use as description + const text = el.innerText || el.textContent || 'Link'; + results.push({ + id: match[1], + text: text.trim() + }); + } } + return results; + } + ''') + + # Try interacting with some of the postback links + for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions + try: + logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") + await self.page.evaluate(f''' + () => {{ + if (typeof __doPostBack === 'function') {{ + __doPostBack('{postback["id"]}', ''); + }} + }} + ''') + await self.page.wait_for_timeout(1500) + await self.page.wait_for_load_state('networkidle', timeout=5000) - with requests.get(download_url, headers=headers, stream=True) as r: - if r.status_code == 200: - with open(path, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - - # Check if this is HTML (common for Google Drive restrictions) - with open(path, 'rb') as f: - content_start = f.read(100).decode('utf-8', errors='ignore') - if '" == el_text.strip() or "→" == el_text.strip(): + logger.info(f"Clicking pagination control: {el_text}") + await el.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Get new links from this page + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error clicking pagination: {e}") + + # Check for hidden links that might be revealed by JavaScript + hidden_links = await self.page.evaluate(""" + () => { + // Try to execute common JavaScript patterns that reveal hidden content + try { + // Common patterns used in websites to initially hide content + const hiddenContainers = document.querySelectorAll( + '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' + ); + + // Attempt to make them visible + hiddenContainers.forEach(el => { + el.style.display = 'block'; + el.style.visibility = 'visible'; + el.classList.remove('hidden', 'hide'); + }); + + // Return any newly visible links + return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); + } catch (e) { + return []; + } + } + """) + + # Add any newly discovered links + for href in hidden_links: + if href and not href.startswith('javascript:'): + links.add(href) + + logger.info(f"Found {len(links)} sublinks") + return list(links)[:limit] + except Exception as e: - return {'status': 'error', 'message': str(e)} + logger.error(f"Error getting 
sublinks from {url}: {e}") + return list(links)[:limit] # Return what we have so far + + async def extract_all_link_types(self, links_set, base_url, path_base): + """Extract all types of links from the current page""" + # Get all tag links + a_links = await self.page.query_selector_all('a[href]') + for a in a_links: + try: + href = await a.get_attribute('href') + if href and not href.startswith('javascript:') and not href.startswith('#'): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Get iframe sources + iframes = await self.page.query_selector_all('iframe[src]') + for iframe in iframes: + try: + src = await iframe.get_attribute('src') + if src and not src.startswith('javascript:') and not src.startswith('about:'): + full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Get links from onclick attributes that reference URLs + onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') + for el in onclick_elements: + try: + onclick = await el.get_attribute('onclick') + urls = re.findall(r'(https?://[^\'"]+)', onclick) + for url in urls: + links_set.add(url) + except Exception: + pass + + # Look for URLs in data-* attributes + data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') + for el in data_elements: + for attr in ['data-url', 'data-href', 'data-src']: + try: + value = await el.get_attribute(attr) + if value and not value.startswith('javascript:'): + full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Look for special anchor links that might not have href attributes + special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') + for anchor in special_anchors: + try: + href = await anchor.get_attribute('href') + if href and not href.startswith('javascript:') and not href.startswith('#'): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Extract links from JSON data embedded in the page + script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') + for script in script_elements: + try: + script_content = await script.text_content() + if script_content: + # Look for URLs in the JSON content + urls = re.findall(r'(https?://[^\'"]+)', script_content) + for url in urls: + links_set.add(url) + except Exception: + pass + + def resolve_relative_url(self, relative_url, base_url, path_base): + """Properly resolve relative URLs considering multiple formats""" + if relative_url.startswith('/'): + # Absolute path relative to domain + return f"{base_url}{relative_url}" + elif relative_url.startswith('./'): + # Explicit relative path + return f"{base_url}{path_base}/{relative_url[2:]}" + elif relative_url.startswith('../'): + # Parent directory + parent_path = '/'.join(path_base.split('/')[:-1]) + return f"{base_url}{parent_path}/{relative_url[3:]}" + else: + # Regular relative path + return f"{base_url}{path_base}/{relative_url}" async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): - """Perform deep search for files on a website and 
its subpages""" if not custom_ext_list: custom_ext_list = [] - - # Create progress indicators progress_text = st.empty() progress_bar = st.progress(0) file_count_text = st.empty() try: progress_text.text("Analyzing main page...") + # Special handling for ASP.NET pages + is_aspnet = False + try: + await self.page.goto(url, timeout=30000, wait_until='networkidle') + is_aspnet = await self.page.evaluate(''' + () => { + return document.querySelector('form#aspnetForm') !== null || + document.querySelector('input[name="__VIEWSTATE"]') !== null; + } + ''') + except Exception: + pass - # Extract files from main page first + # Extract files from main page main_files = await self.extract_downloadable_files(url, custom_ext_list) initial_count = len(main_files) file_count_text.text(f"Found {initial_count} files on main page") - # Get sublinks + # Get sublinks with enhanced method progress_text.text("Getting sublinks...") sublinks = await self.get_sublinks(url, sublink_limit) total_links = len(sublinks) progress_text.text(f"Found {total_links} sublinks to process") - # Initialize all_files with main_files to ensure they're included - all_files = main_files.copy() + # Always include files from the main page, regardless of sublinks + all_files = main_files + + if not sublinks: + progress_bar.progress(1.0) + return all_files # Process each sublink for i, sublink in enumerate(sublinks, 1): - progress = i / max(total_links, 1) # Avoid division by zero + progress = i / total_links progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") progress_bar.progress(progress) try: + # Use a longer timeout for ASP.NET pages which can be slower + sub_timeout = timeout * 2 if is_aspnet else timeout + # Extract files from sublink sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) all_files.extend(sub_files) @@ -1379,13 +2689,12 @@ class DownloadManager: seen_urls.add(f['url']) unique_files.append(f) - # Complete progress + final_count = len(unique_files) progress_text.text(f"Deep search complete!") - file_count_text.text(f"Found {len(unique_files)} unique files") + file_count_text.text(f"Found {final_count} unique files") progress_bar.progress(1.0) - return unique_files - + except Exception as e: logger.error(f"Deep search error: {e}") progress_text.text(f"Error during deep search: {str(e)}") @@ -1401,7 +2710,12 @@ class DownloadManager: def main(): st.title("Advanced File Downloader") - # Initialize session state + # Initialize playwright if needed + if "playwright_installed" not in st.session_state: + with st.spinner("Setting up browser automation. This may take a minute..."): + install_playwright_dependencies() + st.session_state.playwright_installed = True + if "initialized" not in st.session_state: st.session_state.initialized = True st.session_state.discovered_files = [] @@ -1411,44 +2725,17 @@ def main(): st.session_state.do_deep_search = False st.session_state.deep_search_url = None st.session_state.search_results = [] - st.session_state.download_urls = {} # For direct download links - - # Install dependencies if needed - if "dependencies_installed" not in st.session_state: - with st.spinner("Setting up dependencies. 
This may take a minute..."): - st.session_state.dependencies_installed = setup_dependencies() - check_services() - - # Sidebar options + with st.sidebar: - mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Single File"], key="mode_select") - - with st.expander("Search Options", expanded=True): - search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine") - browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine") - custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", - help="Enter extensions like .csv, .txt") - max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks") - sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout") - - with st.expander("Advanced Options", expanded=False): - use_proxy = st.checkbox("Use Proxy", key="use_proxy") + mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select") + with st.expander("Advanced Options", expanded=True): + custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt") + max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page") + sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink") + use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox") proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input") - use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth", - help="Makes browser harder to detect as automated") - enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"], - key="enable_intercept", - help="Intercept network traffic to find additional files") - if enable_network_intercept: - NETWORK_INTERCEPTOR_CONFIG["enabled"] = True - intercept_types = st.multiselect("Intercept Types", - ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"], - default=["xhr", "fetch", "document", "media"], - key="intercept_types") - NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types - else: - NETWORK_INTERCEPTOR_CONFIG["enabled"] = False - + use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox") + with st.expander("Google Drive Integration", expanded=False): if st.button("Start Google Sign-In", key="google_signin_btn"): auth_url = get_google_auth_url() @@ -1458,75 +2745,97 @@ def main(): creds, msg = exchange_code_for_credentials(auth_code) st.session_state.google_creds = creds st.write(msg) - - # Main content area + + with st.expander("Advanced Browser Settings", expanded=False): + # Captcha handling options + st.write("**Captcha Handling**") + captcha_option = st.radio( + "Captcha Detection:", + ["Auto-detect only", "Manual solve (shows captcha)"], + index=0, + key="captcha_option" + ) + + # Proxy rotation settings + st.write("**Proxy Rotation**") + enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation") + if enable_rotation: + PROXY_ROTATION_CONFIG["enabled"] = True + proxy_list = st.text_area( + "Proxy 
List (one per line)", + placeholder="http://proxy1:port\nhttp://proxy2:port", + key="proxy_list" + ) + if proxy_list: + PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()] + rotation_interval = st.slider( + "Rotation Interval (# of requests)", + min_value=1, + max_value=50, + value=10, + key="rotation_interval" + ) + PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval + if mode == "Manual URL": st.header("Manual URL Mode") - url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input") - + url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input") col1, col2 = st.columns([3, 1]) with col1: if st.button("Deep Search", use_container_width=True, key="deep_search_btn"): if url: - # Process custom extensions custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] + valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] + if custom_ext_list != valid_ext_list: + st.warning("Invalid extensions ignored. Use format like '.csv'.") - with st.spinner("Searching for files..."): - async def run_deep_search(): + @st.cache_resource + def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val): + async def _run(): async with DownloadManager( - browser_engine=browser_engine, - use_proxy=use_proxy, - proxy=proxy, - use_stealth=use_stealth + use_proxy=use_proxy_val, + proxy=proxy_val, + use_stealth=use_stealth_val ) as dm: - files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout) + files = await dm.deep_search(url, ext_list, max_links, timeout_val) return files - - # Run the search - files = asyncio.run(run_deep_search()) - - if files: - st.session_state.discovered_files = files - st.session_state.current_url = url - st.success(f"Found {len(files)} files!") - else: - st.warning("No files found.") - - # Display and process discovered files + return asyncio.run(_run()) + + with st.spinner("Searching for files..."): + files = run_deep_search(url, valid_ext_list, max_sublinks, + sublink_timeout, use_proxy, proxy, use_stealth) + + if files: + st.session_state.discovered_files = files + st.session_state.current_url = url + st.success(f"Found {len(files)} files!") + else: + st.warning("No files found.") + if st.session_state.discovered_files: files = st.session_state.discovered_files - - # Select/deselect buttons - col1, col2 = st.columns([1, 1]) + col1, col2 = st.columns([1, 4]) with col1: if st.button("Select All", key="select_all_btn"): st.session_state.selected_files = list(range(len(files))) - with col2: if st.button("Clear Selection", key="clear_selection_btn"): st.session_state.selected_files = [] - # Display file list with metadata + # Create a formatted display of files with metadata file_options = [] for i, file in enumerate(files): filename = file['filename'] size = file['size'] meta = file.get('metadata', {}) - # Format display info + # Format display string with relevant metadata if meta and 'Pages' in meta: file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages" else: file_info = f"{filename} ({size})" - + file_options.append((i, file_info)) - - # Generate direct download URL for this file - if i not in st.session_state.download_urls: - # Generate a unique key for this file - file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode() - st.session_state.download_urls[i] = file_key - # File selection multiselect 
            selected_indices = st.multiselect(
                "Select files to download",
                options=[i for i, _ in file_options],
@@ -1537,341 +2846,215 @@ def main():
            st.session_state.selected_files = selected_indices
-            # Display individual files with direct download links
-            if files:
-                st.subheader("Available Files")
-                for i, file in enumerate(files):
-                    with st.expander(f"{i+1}. {file['filename']} ({file['size']})"):
-                        st.write(f"Source: {file.get('source_url', 'Unknown')}")
-                        st.write(f"URL: {file['url']}")
-
-                        # Download button for this specific file
-                        if st.button(f"Download this file", key=f"download_single_{i}"):
-                            with st.spinner(f"Downloading {file['filename']}..."):
-                                # Create downloads directory
-                                download_dir = "./downloads"
-                                os.makedirs(download_dir, exist_ok=True)
-
-                                # Download the file
-                                async def download_single():
-                                    async with DownloadManager(
-                                        browser_engine=browser_engine,
-                                        use_proxy=use_proxy,
-                                        proxy=proxy,
-                                        use_stealth=use_stealth
-                                    ) as dm:
-                                        return await dm.download_file(file, download_dir)
-
-                                file_path = asyncio.run(download_single())
-
-                                if file_path:
-                                    # Create a download link
-                                    with open(file_path, "rb") as f:
-                                        file_bytes = f.read()
-
-                                    file_name = os.path.basename(file_path)
-                                    mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
-
-                                    st.download_button(
-                                        label=f"Download {file_name}",
-                                        data=file_bytes,
-                                        file_name=file_name,
-                                        mime=mime_type,
-                                        key=f"download_btn_{i}"
-                                    )
-
-                                    st.success(f"Downloaded successfully to {file_path}")
-                                else:
-                                    st.error(f"Failed to download {file['filename']}")
-
-            # Batch download options
            if selected_indices:
-                st.subheader("Batch Download Options")
-
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
                with col2:
                    create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
                with col3:
-                    delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
+                    delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
                with col4:
                    upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
-                if st.button("Download Selected Files", key="batch_download_btn"):
-                    with st.spinner(f"Downloading {len(selected_indices)} files..."):
-                        if not os.path.exists(download_dir):
-                            os.makedirs(download_dir)
-
-                        # Start download process
+                if st.button("Download Selected", key="download_btn"):
+                    if not os.path.exists(download_dir):
+                        os.makedirs(download_dir)
+
+                    async def download_files():
                        downloaded_paths = []
                        progress_bar = st.progress(0)
                        status_text = st.empty()
-                        async def download_batch():
-                            async with DownloadManager(
-                                browser_engine=browser_engine,
-                                use_proxy=use_proxy,
-                                proxy=proxy,
-                                use_stealth=use_stealth
-                            ) as dm:
-                                paths = []
-                                for i, idx in enumerate(selected_indices):
-                                    file_info = files[idx]
-                                    progress = (i + 1) / len(selected_indices)
-                                    status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
-                                    progress_bar.progress(progress)
-
-                                    path = await dm.download_file(file_info, download_dir)
-                                    if path:
-                                        paths.append(path)
+                        async with DownloadManager(
+                            use_proxy=use_proxy,
+                            proxy=proxy,
+                            use_stealth=use_stealth
+                        ) as dm:
+                            for i, idx in enumerate(selected_indices):
+                                progress = (i + 1) / len(selected_indices)
+                                file_info = files[idx]
+                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
+                                progress_bar.progress(progress)
-                            return paths
-
-                        downloaded_paths = asyncio.run(download_batch())
-                        status_text.empty()
-                        progress_bar.empty()
+                                path = await dm.download_file(file_info, download_dir, url)
+                                if path:
+                                    downloaded_paths.append(path)
+
+                        status_text.empty()
+                        progress_bar.empty()
+                        return downloaded_paths
+
+                    with st.spinner("Downloading files..."):
+                        downloaded = asyncio.run(download_files())
+
+                    if downloaded:
+                        st.success(f"Successfully downloaded {len(downloaded)} files")
-                        if downloaded_paths:
-                            st.success(f"Successfully downloaded {len(downloaded_paths)} files")
+                        if create_zip:
+                            zip_path = create_zip_file(downloaded, download_dir)
+                            st.success(f"Created ZIP file: {zip_path}")
-                            if create_zip:
-                                zip_path = create_zip_file(downloaded_paths, download_dir)
-                                st.success(f"Created ZIP file: {zip_path}")
+                            # Provide download link for the zip file
+                            with open(zip_path, "rb") as f:
+                                zip_data = f.read()
+
+                            st.download_button(
+                                label="Download ZIP",
+                                data=zip_data,
+                                file_name=os.path.basename(zip_path),
+                                mime="application/zip",
+                                key="download_zip_btn"
+                            )
+
+                            # Upload to Google Drive if requested
+                            if upload_to_drive and st.session_state.google_creds:
+                                drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
+                                folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
+                                drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
+                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
+                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
+                                else:
+                                    st.error(drive_id)
+
+                            # Delete original files if requested
+                            if delete_after:
+                                for path in downloaded:
+                                    try:
+                                        os.remove(path)
+                                    except Exception as e:
+                                        st.warning(f"Could not delete {path}: {e}")
+                                st.info("Deleted original files after ZIP creation")
+                        else:
+                            # Provide individual file downloads
+                            st.write("Download files individually:")
+                            for path in downloaded:
+                                with open(path, "rb") as f:
+                                    file_data = f.read()
-                                # Provide download link for the zip file
-                                with open(zip_path, "rb") as f:
-                                    zip_data = f.read()
+                                file_name = os.path.basename(path)
+                                mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
                                st.download_button(
-                                    label="Download ZIP",
-                                    data=zip_data,
-                                    file_name=os.path.basename(zip_path),
-                                    mime="application/zip",
-                                    key="download_zip_btn"
+                                    label=f"Download {file_name}",
+                                    data=file_data,
+                                    file_name=file_name,
+                                    mime=mime_type,
+                                    key=f"download_file_{path}"
                                )
-
-                                # Upload to Google Drive if requested
-                                if upload_to_drive and st.session_state.google_creds:
-                                    with st.spinner("Uploading to Google Drive..."):
-                                        drive_service = googleapiclient.discovery.build(
-                                            "drive", "v3", credentials=st.session_state.google_creds
-                                        )
-                                        folder_id = create_drive_folder(
-                                            drive_service, f"Downloads_{get_domain(url)}"
-                                        )
-                                        drive_id = google_drive_upload(
-                                            zip_path, st.session_state.google_creds, folder_id
-                                        )
-
-                                        if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                            st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                        else:
-                                            st.error(drive_id)
-
-                                # Delete original files if requested
-                                if delete_after:
-                                    for path in downloaded_paths:
-                                        try:
-                                            os.remove(path)
-                                        except Exception as e:
-                                            st.warning(f"Could not delete {path}: {e}")
-                                    st.info("Deleted original files after ZIP creation")
-
-    elif mode == "Web Search":
-        st.header("Web Search Mode")
-
-        # Search query input
-        query = st.text_input("Enter search query", placeholder="example file type:pdf", key="search_query")
-        num_results = st.slider("Number of results", 1, 50, 10, key="num_results")
+
+    elif mode == "Bing Search":
+        st.header("Bing Search Mode")
+        query = st.text_input("Enter search query", key="search_query_input")
+        num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
-        if st.button("Search", key="web_search_btn"):
+        if st.button("Search", key="search_btn"):
            if query:
-                with st.spinner("Searching the web..."):
-                    async def run_search():
-                        async with DownloadManager(
-                            browser_engine=browser_engine,
-                            use_proxy=use_proxy,
-                            proxy=proxy,
-                            query=query,
-                            num_results=num_results,
-                            use_stealth=use_stealth
-                        ) as dm:
-                            urls = await dm.search_web(search_engine)
-                            return urls
-
-                    urls = asyncio.run(run_search())
-
-                    if urls:
-                        st.session_state.search_results = urls
-                        st.success(f"Found {len(urls)} results!")
-
-                        # Display search results with deep search option
-                        for i, url in enumerate(urls, 1):
-                            with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
-                                st.write(f"URL: {url}")
-                                if st.button(f"Search for files", key=f"search_result_{i}"):
-                                    st.session_state.deep_search_url = url
-                                    st.session_state.do_deep_search = True
-                    else:
-                        st.warning("No search results found.")
-
-    # Handle deep search of a result if requested
-    if st.session_state.do_deep_search and st.session_state.deep_search_url:
-        url = st.session_state.deep_search_url
-        st.info(f"Searching for files on: {url}")
-
-        # Reset the search flag to avoid re-running
-        st.session_state.do_deep_search = False
-
-        # Process custom extensions
-        custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
-
-        with st.spinner("Searching for files..."):
-            async def deep_search_result():
+                async def run_search():
                    async with DownloadManager(
-                        browser_engine=browser_engine,
-                        use_proxy=use_proxy,
-                        proxy=proxy,
+                        use_proxy=use_proxy,
+                        proxy=proxy,
+                        query=query,
+                        num_results=num_results,
                        use_stealth=use_stealth
                    ) as dm:
-                        return await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
+                        with st.spinner("Searching..."):
+                            urls = await dm.search_bing()
+                            if urls:
+                                st.session_state.search_results = urls
+                                st.success(f"Found {len(urls)} results!")
+
+                                # Create expanders for each result
+                                for i, url in enumerate(urls, 1):
+                                    with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
+                                        if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
+                                            st.session_state.deep_search_url = url
+                                            st.session_state.do_deep_search = True
+                            else:
+                                st.warning("No search results found.")
+
+                asyncio.run(run_search())
+
+        # Handle deep search based on search results
+        if st.session_state.do_deep_search and st.session_state.deep_search_url:
+            url = st.session_state.deep_search_url
+            st.info(f"Deep searching: {url}")
+
+            # Reset the flag to avoid re-running
+            st.session_state.do_deep_search = False
-            files = asyncio.run(deep_search_result())
+            # Set up custom extensions
+            custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+            valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+
+            @st.cache_resource
+            def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
+                async def _run():
+                    async with DownloadManager(
+                        use_proxy=use_proxy_val,
+                        proxy=proxy_val,
+                        use_stealth=use_stealth_val
+                    ) as dm:
+                        files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                        return files
+                return asyncio.run(_run())
+
+            with st.spinner("Searching for files..."):
+                files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                        sublink_timeout, use_proxy, proxy, use_stealth)
            if files:
                st.session_state.discovered_files = files
                st.session_state.current_url = url
                st.success(f"Found {len(files)} files!")
            else:
-                st.warning("No files found on this page.")
-
-    elif mode == "Single File":
-        st.header("Single File Download")
-
-        # View-only Google Drive download
-        with st.expander("Download View-Only Google Drive Document", expanded=True):
-            st.write("Download protected/view-only Google Drive documents")
-
-            file_id = st.text_input(
-                "Google Drive File ID",
-                placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
-                key="drive_file_id"
-            )
-
-            if st.button("Download Document", key="drive_download_btn") and file_id:
-                with st.spinner("Downloading view-only document... (this may take a minute)"):
-                    # Create download directory
-                    download_dir = "./downloads"
-                    os.makedirs(download_dir, exist_ok=True)
-
-                    # Set output path
-                    output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
-
-                    # Download the file
-                    async def download_drive_file():
-                        async with DownloadManager(
-                            browser_engine=browser_engine,
-                            use_proxy=use_proxy,
-                            proxy=proxy,
-                            use_stealth=use_stealth
-                        ) as dm:
-                            file_info = {
-                                'url': f"https://drive.google.com/file/d/{file_id}/view",
-                                'filename': f"gdrive_{file_id}.pdf",
-                                'metadata': {'file_id': file_id, 'view_only': True}
-                            }
-                            return await dm.download_viewonly_google_drive(file_info, output_path)
-
-                    result_path = asyncio.run(download_drive_file())
-
-                    if result_path:
-                        st.success("Document downloaded successfully!")
-
-                        # Provide download link
-                        with open(result_path, "rb") as f:
-                            file_bytes = f.read()
-
-                        st.download_button(
-                            label="Download PDF",
-                            data=file_bytes,
-                            file_name=os.path.basename(result_path),
-                            mime="application/pdf",
-                            key="drive_pdf_download"
-                        )
-                    else:
-                        st.error("Failed to download the document. Please check the file ID and try again.")
+                st.warning("No files found.")
+
+    # Add a special section for direct Google Drive file download
+    st.markdown("---")
+    with st.expander("Download View-Only Google Drive Document", expanded=False):
+        st.write("Download protected/view-only Google Drive documents - just enter the file ID")
+        file_id = st.text_input("Google Drive File ID",
+                                placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku",
+                                help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')")
-        # Direct URL download
-        with st.expander("Download from Direct URL", expanded=True):
-            st.write("Download a file from a direct URL")
-
-            file_url = st.text_input(
-                "File URL",
-                placeholder="https://example.com/file.pdf",
-                key="direct_url"
-            )
+        if st.button("Download Document") and file_id:
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)
+            output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
-            file_name = st.text_input(
-                "Save as (optional)",
-                placeholder="Leave blank to use original filename",
-                key="save_filename"
-            )
-
-            if st.button("Download File", key="direct_download_btn") and file_url:
-                with st.spinner("Downloading file..."):
-                    # Create download directory
-                    download_dir = "./downloads"
-                    os.makedirs(download_dir, exist_ok=True)
-
-                    # Determine filename
-                    if not file_name:
-                        file_name = os.path.basename(urlparse(file_url).path)
-                        if not file_name or file_name == '/':
-                            file_name = f"downloaded_file_{int(time.time())}{get_file_extension(file_url)}"
-
-                    # Create file info
-                    file_info = {
-                        'url': file_url,
-                        'filename': file_name,
-                        'metadata': {}
-                    }
-
-                    # Download the file
-                    async def download_direct_file():
-                        async with DownloadManager(
-                            browser_engine=browser_engine,
-                            use_proxy=use_proxy,
-                            proxy=proxy,
-                            use_stealth=use_stealth
-                        ) as dm:
-                            return await dm.download_file(file_info, download_dir)
+            with st.spinner("Downloading view-only document... (this may take a minute)"):
+                async def download_viewonly():
+                    async with DownloadManager(use_stealth=use_stealth) as dm:
+                        file_info = {
+                            'url': f"https://drive.google.com/file/d/{file_id}/view",
+                            'filename': f"gdrive_{file_id}.pdf",
+                            'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True}
+                        }
+                        result_path = await dm.force_download_viewonly(file_info, output_path)
+                        return result_path
+
+                result = asyncio.run(download_viewonly())
+
+                if result:
+                    st.success("Document downloaded successfully!")
-                    file_path = asyncio.run(download_direct_file())
+                    # Provide download button
+                    with open(result, "rb") as f:
+                        file_bytes = f.read()
-                    if file_path:
-                        st.success(f"File downloaded successfully to {file_path}")
-
-                        # Provide download link
-                        with open(file_path, "rb") as f:
-                            file_bytes = f.read()
-
-                        mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
-
-                        st.download_button(
-                            label=f"Download {os.path.basename(file_path)}",
-                            data=file_bytes,
-                            file_name=os.path.basename(file_path),
-                            mime=mime_type,
-                            key="direct_file_download"
-                        )
-                    else:
-                        st.error("Failed to download the file. Please check the URL and try again.")
-
-    # Footer
-    st.markdown("---")
-    st.markdown("Created by [Euler314](https://github.com/euler314) | Enhanced with advanced scraping technologies")
+                    st.download_button(
+                        label="Download PDF",
+                        data=file_bytes,
+                        file_name=f"gdrive_{file_id}.pdf",
+                        mime="application/pdf"
+                    )
+                else:
+                    st.error("Failed to download the document. Please check the file ID and try again.")
+
+    # Add footer with attribution
+    st.markdown('---')
+    st.markdown('Created by [Euler314](https://github.com/euler314)')
-# Run the app
 if __name__ == "__main__":
     main()
\ No newline at end of file