diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,45 +1,51 @@ import streamlit as st -st.set_page_config(page_title="Advanced File Downloader", layout="wide") - -# Core imports import os -import subprocess -from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError import asyncio +import subprocess +import tempfile import logging -from urllib.parse import urlparse, urljoin, unquote +import time +import json +import base64 import re -from pathlib import Path -from io import BytesIO import random -from bs4 import BeautifulSoup -from PyPDF2 import PdfReader import zipfile -import tempfile -import mimetypes -import requests import datetime import traceback -import base64 import shutil -import json -import time +import mimetypes +from pathlib import Path +from urllib.parse import urlparse, urljoin, unquote +from io import BytesIO from PIL import Image from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas + +# Advanced imports +from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError +from bs4 import BeautifulSoup +from PyPDF2 import PdfReader import google_auth_oauthlib.flow import googleapiclient.discovery import google.auth.transport.requests import googleapiclient.http +import requests +import celery +from celery import Celery +import splash +import pyppeteer +import mitmproxy +from mitmproxy import http -# -------------------- Logging Setup -------------------- -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) +# Configure page and logging +st.set_page_config(page_title="Advanced File Downloader", layout="wide") +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) -# -------------------- Google OAuth Config -------------------- +# Initialize Celery for distributed task processing +celery_app = Celery('file_downloader', broker='redis://localhost:6379/0') + +# Configure Google OAuth GOOGLE_OAUTH_CONFIG = { "web": { "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com", @@ -52,8 +58,7 @@ GOOGLE_OAUTH_CONFIG = { } } -# -------------------- Stealth and UA Settings -------------------- -# Extended user agent list for better variety +# -------------------- User Agent Settings -------------------- USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15', @@ -62,30 +67,18 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54', 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0' ] -# Stealth browser settings -STEALTH_SETTINGS = { - # Hardware features to modify/disable - "hardware_concurrency": 4, - "device_memory": 8, - # Browser features to enable/disable - "webgl_vendor": "Google Inc. 
(Intel)", - "webgl_renderer": "Intel Iris OpenGL Engine", - "languages": ["en-US", "en"], - "disable_webrtc": True, - # Additional timing randomization - "navigator_platform": "Win32", - "touch_support": False -} +# -------------------- Proxy Management -------------------- +PROXY_POOL = [] +CURRENT_PROXY_INDEX = 0 -# Proxy rotation configuration (if using multiple proxies) -PROXY_ROTATION_CONFIG = { - "enabled": False, # Set to True to enable rotation - "rotation_interval": 10, # Rotate every 10 requests - "proxies": [] # Will be populated from the UI if needed +# -------------------- Network Interception Configuration -------------------- +NETWORK_INTERCEPTOR_CONFIG = { + "enabled": False, + "intercept_types": ["xhr", "fetch", "document", "media"], + "save_intercepted": True, + "intercept_folder": "./intercepted_data" } # -------------------- Utility Functions -------------------- @@ -115,16 +108,6 @@ def get_file_extension(url, default='.pdf'): return default return ext -def humanize_file_size(size_bytes): - """Format file size in human-readable format""" - if size_bytes < 1024: - return f"{size_bytes} bytes" - for unit in ['KB', 'MB', 'GB', 'TB']: - size_bytes /= 1024.0 - if size_bytes < 1024.0: - return f"{size_bytes:.1f} {unit}" - return f"{size_bytes:.1f} PB" - def get_domain(url): """Extract domain from URL""" parsed = urlparse(url) @@ -134,15 +117,6 @@ def is_valid_file_url(url, extensions): """Check if URL is a valid file URL based on extension""" return any(url.lower().endswith(ext) for ext in extensions) -def detect_captcha(html_content): - """Detect common captcha patterns in HTML content""" - captcha_patterns = [ - 'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile', - 'challenge', 'solve the following', 'verify you are human' - ] - html_lower = html_content.lower() - return any(pattern in html_lower for pattern in captcha_patterns) - # -------------------- Google Drive Functions -------------------- def get_google_auth_url(): client_config = GOOGLE_OAUTH_CONFIG["web"] @@ -193,394 +167,470 @@ def create_drive_folder(drive_service, name): folder = drive_service.files().create(body=folder_metadata, fields='id').execute() return folder.get('id') -# -------------------- Playwright Setup -------------------- -def install_playwright_dependencies(): +# -------------------- Setup Functions -------------------- +def setup_dependencies(): + """Install required system dependencies""" try: - # Set environment variable for Playwright browsers path - os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright") - # Install system dependencies subprocess.run(['apt-get', 'update', '-y'], check=True) packages = [ 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1', - 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0' + 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0', + 'redis-server', 'python3-dev', 'build-essential' ] subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True) - # Install Playwright and dependencies - subprocess.run(['pip', 'install', 'playwright'], check=True) + # Install Python packages + subprocess.run(['pip', 'install', 'playwright', 'pyppeteer', 'splash', 'celery[redis]', 'mitmproxy'], check=True) + + # Install browsers subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True) + subprocess.run(['python3', '-m', 'pyppeteer', 'install'], check=True) - st.success("Playwright dependencies installed 
successfully!") + st.success("Dependencies installed successfully!") + return True except Exception as e: - st.error(f"Error installing Playwright dependencies: {e}") + st.error(f"Error installing dependencies: {e}") st.info("You may need to manually install dependencies. Check console for details.") - logger.error(f"Playwright setup error: {e}") + logger.error(f"Setup error: {e}") traceback.print_exc() + return False -# -------------------- Download Manager Class -------------------- -class DownloadManager: - def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): +def check_services(): + """Check if required services are running""" + try: + # Check Redis for Celery + redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG' + if not redis_running: + # Try to start Redis + subprocess.run(['service', 'redis-server', 'start'], check=True) + + # Create directories for intercepted data + os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True) + + return True + except Exception as e: + logger.error(f"Service check error: {e}") + return False + +# -------------------- Network Interception Classes -------------------- +class NetworkInterceptor: + """Class to intercept network traffic using mitmproxy""" + + def __init__(self, intercept_types=None, save_path=None): + self.intercept_types = intercept_types or ["xhr", "fetch", "document"] + self.save_path = save_path or "./intercepted_data" + os.makedirs(self.save_path, exist_ok=True) + self.captured_data = [] + + def intercept_request(self, flow): + """Process intercepted requests""" + try: + url = flow.request.url + method = flow.request.method + content_type = flow.request.headers.get("Content-Type", "") + + # Log the request + self.captured_data.append({ + "type": "request", + "url": url, + "method": method, + "headers": dict(flow.request.headers), + "timestamp": time.time() + }) + + logger.info(f"Intercepted {method} request to {url}") + except Exception as e: + logger.error(f"Error intercepting request: {e}") + + def intercept_response(self, flow): + """Process intercepted responses""" + try: + url = flow.request.url + status_code = flow.response.status_code + content_type = flow.response.headers.get("Content-Type", "") + + # Only process responses of interest based on content type + if any(t in content_type.lower() for t in ["application/pdf", "application/msword", + "application/vnd.openxmlformats", + "application/zip"]): + # Save the file + filename = os.path.basename(urlparse(url).path) + if not filename or filename == '/': + filename = f"file_{int(time.time())}" + + # Try to add extension based on content type + if "pdf" in content_type: + filename += ".pdf" + elif "msword" in content_type: + filename += ".doc" + elif "openxmlformats" in content_type and "wordprocessingml" in content_type: + filename += ".docx" + elif "zip" in content_type: + filename += ".zip" + + file_path = os.path.join(self.save_path, filename) + with open(file_path, "wb") as f: + f.write(flow.response.content) + + logger.info(f"Saved intercepted file: {file_path}") + + # Record metadata about the captured file + self.captured_data.append({ + "type": "file", + "url": url, + "content_type": content_type, + "size": len(flow.response.content), + "path": file_path, + "timestamp": time.time() + }) + except Exception as e: + logger.error(f"Error intercepting response: {e}") + + def get_captured_files(self): + """Return list of captured files""" 
+ return [item for item in self.captured_data if item["type"] == "file"] + +# -------------------- Browser Automation Classes -------------------- +class MultiEngineBrowser: + """Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)""" + + def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True): + self.engine = engine self.use_proxy = use_proxy self.proxy = proxy - self.query = query - self.num_results = num_results - self.playwright = None + self.stealth = stealth self.browser = None self.context = None self.page = None - self.use_stealth = use_stealth - self.proxy_rotation = proxy_rotation - self.request_count = 0 - self.captcha_detected = False - self.download_timeout = 300 # 5 minutes timeout for downloads - - async def __aenter__(self): - self.playwright = await async_playwright().start() + + async def setup(self): + """Initialize browser based on selected engine""" + if self.engine == "playwright": + return await self.setup_playwright() + elif self.engine == "pyppeteer": + return await self.setup_pyppeteer() + elif self.engine == "splash": + return await self.setup_splash() + else: + raise ValueError(f"Unsupported browser engine: {self.engine}") + + async def setup_playwright(self): + """Setup Playwright browser""" + from playwright.async_api import async_playwright - # Prepare browser args with stealth settings + self.playwright = await async_playwright().start() browser_args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', - '--disable-gpu', - '--no-zygote', - '--single-process', '--disable-web-security', - '--disable-features=IsolateOrigins', - '--disable-site-isolation-trials' + '--disable-features=IsolateOrigins,site-per-process', ] - # Add stealth-specific args - if self.use_stealth: + if self.stealth: browser_args.extend([ '--disable-blink-features=AutomationControlled', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-webgl', - '--disable-webrtc' + '--disable-features=IsolateOrigins' ]) - # Setup browser options - opts = { + launch_options = { "headless": True, "args": browser_args } - # Configure proxy if specified if self.use_proxy and self.proxy: - opts["proxy"] = {"server": self.proxy} + launch_options["proxy"] = {"server": self.proxy} - # Launch browser with options - self.browser = await self.playwright.chromium.launch(**opts) + self.browser = await self.playwright.chromium.launch(**launch_options) - # Setup browser context with enhanced settings - context_opts = { - "user_agent": get_random_user_agent(), + context_options = { "viewport": {"width": 1920, "height": 1080}, - "device_scale_factor": 1, - "has_touch": False, - "is_mobile": False, + "user_agent": get_random_user_agent(), + "bypass_csp": True, "ignore_https_errors": True, "accept_downloads": True } - # Apply stealth-specific settings to the context - if self.use_stealth: - # Apply JS-injection for enhanced stealth - context_opts["bypass_csp"] = True - self.context = await self.browser.new_context(**context_opts) - - # Execute stealth JS to avoid detection + self.context = await self.browser.new_context(**context_options) + + # Apply stealth features + if self.stealth: await self.context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change navigator properties - const newProto = navigator.__proto__; - delete newProto.webdriver; - - // Overwrite the plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - 
lengthComputable: true, - loaded: 100, - total: 100 - })) + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) }); - - // Handle languages more naturally - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - - // Modify hardware concurrency - Object.defineProperty(navigator, 'hardwareConcurrency', { - get: () => 4 - }); - - // Modify deviceMemory - Object.defineProperty(navigator, 'deviceMemory', { - get: () => 8 - }); - - // WebGL modifications - const getParameter = WebGLRenderingContext.prototype.getParameter; - WebGLRenderingContext.prototype.getParameter = function(parameter) { - if (parameter === 37445) { - return 'Intel Inc.'; - } - if (parameter === 37446) { - return 'Intel Iris OpenGL Engine'; - } - return getParameter.apply(this, arguments); - }; - } + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + window.chrome = { runtime: {} }; """) - else: - # Regular context without stealth - self.context = await self.browser.new_context(**context_opts) - # Create page with enhanced headers self.page = await self.context.new_page() - await self.page.set_extra_http_headers({ - 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Cache-Control': 'max-age=0', - 'DNT': '1', # Do Not Track - 'Referer': 'https://www.google.com/', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'cross-site', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1' - }) + return self.page + + async def setup_pyppeteer(self): + """Setup Pyppeteer browser""" + from pyppeteer import launch - # Add delay for mouse movements to simulate human behavior - if self.use_stealth: - await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) - await self.page.wait_for_timeout(random.randint(200, 500)) + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + ] - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - if self.browser: - await self.browser.close() - if self.playwright: - await self.playwright.stop() - - async def rotate_proxy_if_needed(self): - """Rotate proxy if proxy rotation is enabled and threshold is reached""" - if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: - self.request_count += 1 - if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: - # Get next proxy from the pool - next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) - PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list - - # Close existing context and create new one with the new proxy - if self.context: - await self.context.close() - - # Create new context with the new proxy - context_opts = { - "user_agent": get_random_user_agent(), - "proxy": {"server": next_proxy}, - "accept_downloads": True - } - self.context = await self.browser.new_context(**context_opts) - self.page = await self.context.new_page() - - # Reset counter - self.request_count = 0 - logger.info(f"Rotated to new proxy: {next_proxy}") - - async def handle_captcha(self, page): - """Detect and handle captchas if possible""" - # Check for common captcha patterns - content = await page.content() - if detect_captcha(content): - 
self.captcha_detected = True - logger.warning("Captcha detected on page") - - # Strategies for handling captchas: - # 1. For simple captchas, try to extract the image and solve it - captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') - if captcha_img: - logger.info("Found captcha image, attempting to capture") - - # Take screenshot of the captcha - captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") - await captcha_img.screenshot(path=captcha_path) - - # In a real implementation, you would send this to a captcha solving service - # For now, just log the detection - logger.info(f"Captcha image saved to {captcha_path}") - - # For demonstration, we'll notify the user but not actually solve it - return False - - # 2. For reCAPTCHA, special handling would be required - recaptcha = await page.query_selector('iframe[src*="recaptcha"]') - if recaptcha: - logger.warning("reCAPTCHA detected, would require external solving service") - return False + if self.stealth: + browser_args.extend([ + '--disable-blink-features=AutomationControlled', + '--disable-features=IsolateOrigins' + ]) + + launch_options = { + "headless": True, + "args": browser_args, + "ignoreHTTPSErrors": True, + "userDataDir": tempfile.mkdtemp() + } + + if self.use_proxy and self.proxy: + browser_args.append(f'--proxy-server={self.proxy}') + + self.browser = await launch(launch_options) + self.page = await self.browser.newPage() + + # Set user agent + await self.page.setUserAgent(get_random_user_agent()) + + # Set viewport + await self.page.setViewport({"width": 1920, "height": 1080}) + + # Apply stealth features + if self.stealth: + await self.page.evaluateOnNewDocument(""" + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) + }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + window.chrome = { runtime: {} }; + """) + + return self.page + + async def setup_splash(self): + """Setup Splash browser through API""" + # Splash is typically used via HTTP API + # We'll use requests for this + self.splash_url = "http://localhost:8050/render.html" + return None # No actual page object for Splash + + async def goto(self, url, wait_until=None, timeout=30000): + """Navigate to a URL""" + if self.engine == "playwright": + return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout) + elif self.engine == "pyppeteer": + return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout) + elif self.engine == "splash": + # Use Splash HTTP API + params = { + "url": url, + "wait": min(timeout/1000, 30), # Splash uses seconds + "timeout": min(timeout/1000, 60), + "resource_timeout": min(timeout/1000, 30), + "html": 1, + "png": 0, + "render_all": 1 + } - # 3. 
Try to perform human-like actions that might bypass simple bot checks - await self.perform_human_actions(page) + if self.use_proxy and self.proxy: + params["proxy"] = self.proxy - # Check if captcha is still present - content = await page.content() - if detect_captcha(content): - logger.warning("Captcha still present after human-like actions") - return False - else: - logger.info("Captcha appears to be resolved") - return True - - return True # No captcha detected + headers = {"User-Agent": get_random_user_agent()} + response = requests.get(self.splash_url, params=params, headers=headers) + self.last_html = response.text + return response + + async def content(self): + """Get page content""" + if self.engine == "playwright": + return await self.page.content() + elif self.engine == "pyppeteer": + return await self.page.content() + elif self.engine == "splash": + return self.last_html + + async def close(self): + """Close browser""" + if self.engine == "playwright": + if self.browser: + await self.browser.close() + if self.playwright: + await self.playwright.stop() + elif self.engine == "pyppeteer": + if self.browser: + await self.browser.close() + # No cleanup needed for Splash as it's stateless - async def perform_human_actions(self, page): - """Perform human-like actions on the page to possibly bypass simple bot checks""" - try: - # 1. Slowly scroll down the page - for i in range(3): - await page.evaluate(f"window.scrollTo(0, {i * 300})") - await page.wait_for_timeout(random.randint(300, 700)) - - # 2. Random mouse movements - for _ in range(3): - x = random.randint(100, 800) - y = random.randint(100, 600) - await page.mouse.move(x=x, y=y) - await page.wait_for_timeout(random.randint(200, 500)) - - # 3. Click on a non-essential part of the page - try: - await page.click("body", position={"x": 50, "y": 50}) - except: - pass - - # 4. 
Wait a bit before continuing - await page.wait_for_timeout(1000) - - except Exception as e: - logger.warning(f"Error during human-like actions: {e}") +# -------------------- Download Manager Class -------------------- +class DownloadManager: + def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True): + self.browser_engine = browser_engine + self.use_proxy = use_proxy + self.proxy = proxy + self.query = query + self.num_results = num_results + self.use_stealth = use_stealth + self.browser = None + self.network_interceptor = None + + # Configure network interception if enabled + if NETWORK_INTERCEPTOR_CONFIG["enabled"]: + self.network_interceptor = NetworkInterceptor( + intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"], + save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"] + ) - async def search_bing(self): + async def __aenter__(self): + # Initialize multi-engine browser + self.browser = MultiEngineBrowser( + engine=self.browser_engine, + use_proxy=self.use_proxy, + proxy=self.proxy, + stealth=self.use_stealth + ) + self.page = await self.browser.setup() + + # Set headers for better stealth + if self.browser_engine == "playwright": + await self.page.set_extra_http_headers({ + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Referer': 'https://www.google.com/', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-User': '?1', + 'Upgrade-Insecure-Requests': '1' + }) + + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.browser.close() + + async def search_web(self, search_engine="bing"): + """Search web using specified search engine""" urls = [] try: - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - search_url = f"https://www.bing.com/search?q={self.query}" - await self.page.goto(search_url, timeout=30000) - await self.page.wait_for_load_state('networkidle') - - # Check for captchas - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected during search, results may be limited") - - # More natural scrolling behavior - for i in range(3): - await self.page.evaluate(f"window.scrollTo(0, {i * 400})") - await self.page.wait_for_timeout(random.randint(300, 800)) - - # Extract search results - links = await self.page.query_selector_all("li.b_algo h2 a") - for link in links[:self.num_results]: - href = await link.get_attribute('href') - if href: - urls.append(href) - - # If we didn't find enough results, try an alternative selector - if len(urls) < self.num_results: - alt_links = await self.page.query_selector_all(".b_caption a") - for link in alt_links: - href = await link.get_attribute('href') - if href and href not in urls: - urls.append(href) - if len(urls) >= self.num_results: - break + if search_engine == "bing": + search_url = f"https://www.bing.com/search?q={self.query}" + elif search_engine == "google": + search_url = f"https://www.google.com/search?q={self.query}" + else: + raise ValueError(f"Unsupported search engine: {search_engine}") + + await self.browser.goto(search_url, timeout=30000) + + if self.browser_engine == "playwright": + if search_engine == "bing": + links = await self.page.query_selector_all("li.b_algo h2 a") + for link in links[:self.num_results]: + href = await link.get_attribute('href') + if href: + urls.append(href) + elif search_engine == "google": + links = await self.page.query_selector_all("div.g 
a[href^='http']") + for link in links[:self.num_results]: + href = await link.get_attribute('href') + if href: + urls.append(href) + elif self.browser_engine == "pyppeteer": + if search_engine == "bing": + links = await self.page.querySelectorAll("li.b_algo h2 a") + for link in links[:self.num_results]: + href = await self.page.evaluate('el => el.getAttribute("href")', link) + if href: + urls.append(href) + elif search_engine == "google": + links = await self.page.querySelectorAll("div.g a[href^='http']") + for link in links[:self.num_results]: + href = await self.page.evaluate('el => el.getAttribute("href")', link) + if href: + urls.append(href) + elif self.browser_engine == "splash": + # Parse the HTML with BeautifulSoup + soup = BeautifulSoup(self.browser.last_html, 'html.parser') + if search_engine == "bing": + links = soup.select("li.b_algo h2 a") + for link in links[:self.num_results]: + href = link.get("href") + if href: + urls.append(href) + elif search_engine == "google": + links = soup.select("div.g a[href^='http']") + for link in links[:self.num_results]: + href = link.get("href") + if href: + urls.append(href) return urls except Exception as e: - logger.error(f"Error searching Bing: {e}") + logger.error(f"Error searching web: {e}") return [] async def get_file_size(self, url): try: - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - response = await page.request.head(url, timeout=15000) - length = response.headers.get('Content-Length', None) - if length: - return sizeof_fmt(int(length)) - else: - return "Unknown Size" - except Exception as e: - logger.warning(f"Error getting file size: {e}") + headers = {'User-Agent': get_random_user_agent()} + response = requests.head(url, headers=headers, timeout=15) + length = response.headers.get('Content-Length', None) + if length: + return sizeof_fmt(int(length)) + else: + return "Unknown Size" + except Exception: return "Unknown Size" async def get_pdf_metadata(self, url): try: - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - resp = await page.request.get(url, timeout=15000) - if resp.ok: - content = await resp.body() - pdf = BytesIO(content) - reader = PdfReader(pdf) - return { - 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', - 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', - 'Pages': len(reader.pages), - } - else: - return {} - except Exception as e: - logger.warning(f"Error reading PDF metadata: {e}") + headers = {'User-Agent': get_random_user_agent()} + response = requests.get(url, headers=headers, timeout=15, stream=True) + if response.status_code == 200: + content = BytesIO(response.content) + reader = PdfReader(content) + return { + 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', + 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', + 'Pages': len(reader.pages), + } + else: + return {} + except Exception: return {} async def extract_real_download_url(self, url): try: - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - response = await page.goto(url, wait_until='networkidle', timeout=30000) - if response and response.headers.get('location'): - return response.headers['location'] - return page.url + headers = {'User-Agent': get_random_user_agent()} + response = requests.head(url, headers=headers, timeout=15, allow_redirects=True) + return response.url except Exception as e: logger.error(f"Error extracting 
real download URL: {e}") return url - # IMPROVED: Enhanced exam links extraction method async def get_edu_exam_links(self, url): """Specialized method for educational exam websites that follows a common pattern.""" try: logger.info(f"Fetching exam links from {url}") links = set() - # First try with direct requests for speed (but with proper headers) - headers = { - "User-Agent": get_random_user_agent(), - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Referer": "https://www.google.com/", - "DNT": "1" - } - + # First try with direct requests for speed + headers = {"User-Agent": get_random_user_agent()} try: response = requests.get(url, headers=headers, timeout=30) if response.status_code == 200: - # Parse with BeautifulSoup first for efficiency + # Parse with BeautifulSoup for efficiency soup = BeautifulSoup(response.text, "html.parser") parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" @@ -609,239 +659,63 @@ class DownloadManager: "view", "open", "get", "solution", "answer" ] - # Check URL for patterns - if any(pattern in full_url.lower() for pattern in url_patterns): - links.add(full_url) - continue - - # Check link text for patterns - if any(pattern in link_text for pattern in text_patterns): - links.add(full_url) - continue - - # Check for common file extensions - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + # Check URL and text patterns + if any(pattern in full_url.lower() for pattern in url_patterns) or \ + any(pattern in link_text for pattern in text_patterns) or \ + any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): links.add(full_url) except Exception as e: logger.warning(f"Request-based extraction failed: {e}") - # Browser-based approach for more thorough extraction or if initial approach was inadequate - try: - # Check if we need to proceed with browser-based extraction - if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url: - logger.info("Using browser for enhanced link extraction") - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Navigate to the page with more natural timing - await self.page.goto(url, timeout=45000, wait_until='networkidle') - await self.page.wait_for_timeout(random.randint(1000, 2000)) - - # Handle captchas if present - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected, extraction may be limited") - - # Get base URL for resolving relative links - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - - # Perform natural scrolling to trigger lazy-loaded content - page_height = await self.page.evaluate("document.body.scrollHeight") - viewport_height = await self.page.evaluate("window.innerHeight") - - for scroll_pos in range(0, page_height, viewport_height // 2): - await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") - await self.page.wait_for_timeout(random.randint(300, 800)) - - # Scroll back to top - await self.page.evaluate("window.scrollTo(0, 0)") - await self.page.wait_for_timeout(500) - - # Extract all links with Playwright (better than just anchor tags) - all_links = await self.page.evaluate(""" - () => { - const results = []; - - // Get all anchor tags - const anchors = document.querySelectorAll('a[href]'); - for (const a of anchors) { - if (a.href) { - results.push({ - href: a.href, - text: 
a.innerText || a.textContent || '', - isButton: a.classList.contains('btn') || a.role === 'button' - }); - } - } - - // Get buttons that might contain links - const buttons = document.querySelectorAll('button'); - for (const btn of buttons) { - const onclick = btn.getAttribute('onclick') || ''; - if (onclick.includes('window.location') || onclick.includes('download')) { - results.push({ - href: '#button', - text: btn.innerText || btn.textContent || '', - isButton: true, - onclick: onclick - }); - } - } - - return results; - } - """) - - # Process the extracted links - for link_info in all_links: - href = link_info.get('href', '') - text = link_info.get('text', '').lower() - - if href and href != '#button': - # Check URL patterns - url_patterns = [ - "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", - "/test/", "/download/", "/files/", "/assignments/", - "paper_", "question_", "exam_", "test_", "past_", - "assignment_", "sample_", "study_material", "notes_" - ] - - # Check text patterns - text_patterns = [ - "exam", "paper", "test", "question", "past", "download", - "assignment", "sample", "study", "material", "notes", - "pdf", "document", "view", "open", "solution" - ] - - if any(pattern in href.lower() for pattern in url_patterns) or \ - any(pattern in text for pattern in text_patterns) or \ - any(href.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(href) - - # Check for ASP.NET specific elements that might contain exam links - grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') - for grid in grid_elements: - grid_links = await grid.query_selector_all('a[href]') - for a in grid_links: - href = await a.get_attribute('href') - text = await a.text_content() - - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - links.add(full_url) - - # Try clicking pagination controls to reveal more content - pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') - for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons - try: - # Check if this is a numeric pagination button (more likely to be useful) - button_text = await button.text_content() - if button_text and button_text.strip().isdigit(): - logger.info(f"Clicking pagination button: {button_text}") - await button.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Extract links from this page - new_page_links = await self.page.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); - } - """) - - for href in new_page_links: - if href and not href.startswith('javascript:'): - if any(pattern in href.lower() for pattern in url_patterns) or \ - any(href.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(href) - except Exception as e: - logger.warning(f"Error clicking pagination button: {e}") - - # Try clicking any controls that might reveal more exam links (more focused approach) - show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') - for button in show_buttons: - button_text = (await button.text_content() or "").lower() - button_value = (await button.get_attribute("value") or "").lower() - button_id = (await button.get_attribute("id") or "").lower() - - # Look for buttons that seem likely to 
reveal file lists - promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", - "download", "resource", "material", "browse", "file"] - - if any(term in button_text or term in button_value or term in button_id - for term in promising_terms): - try: - logger.info(f"Clicking button: {button_text or button_value}") - await button.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Get any new links that appeared - new_links = await self.page.query_selector_all('a[href]') - for a in new_links: - href = await a.get_attribute('href') - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - - # Focus on file extensions and patterns - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ - any(pattern in full_url.lower() for pattern in url_patterns): - links.add(full_url) - except Exception as e: - logger.warning(f"Error clicking button: {e}") - - # Special handling for ASP.NET PostBack links - try: - # Find and interact with ASP.NET __doPostBack elements - postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') - for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks - try: - onclick = await element.get_attribute('onclick') - if onclick and '__doPostBack' in onclick: - element_text = await element.text_content() - - # Only interact with elements that seem likely to contain exam links - promising_terms = ["show", "view", "list", "exam", "paper", "test", - "download", "resource", "material"] - - if any(term in element_text.lower() for term in promising_terms): - logger.info(f"Clicking ASP.NET postback element: {element_text}") - - # Click the element - await element.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Extract any new links - new_links = await self.page.query_selector_all('a[href]') - for a in new_links: - href = await a.get_attribute('href') - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(full_url) - except Exception as e: - logger.warning(f"Error interacting with postback element: {e}") - except Exception as e: - logger.warning(f"Error during postback handling: {e}") - - except Exception as e: - logger.error(f"Browser-based extraction failed: {e}") + # Use browser-based approach if needed + if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url: + logger.info("Using browser for enhanced link extraction") + + # Navigate to the page + await self.browser.goto(url, timeout=45000) + + # Get page content and parse with BeautifulSoup + content = await self.browser.content() + soup = BeautifulSoup(content, "html.parser") + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + + # Process all links on the page + for a in soup.find_all("a", href=True): + href = a["href"] + full_url = urljoin(url, href) + link_text = a.get_text().lower() + + # Apply the same filtering criteria + url_patterns = [ + "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", + "/test/", "/download/", "/files/", "/assignments/", + "paper_", "question_", "exam_", "test_", "past_", + "assignment_", "sample_", "study_material", "notes_", + "/resource/", "/subject/", "/course/", "/material/" + ] + + text_patterns 
= [ + "exam", "paper", "test", "question", "past", "download", + "assignment", "sample", "study", "material", "notes", + "subject", "course", "resource", "pdf", "document", + "view", "open", "get", "solution", "answer" + ] + + # Check URL and text patterns + if any(pattern in full_url.lower() for pattern in url_patterns) or \ + any(pattern in link_text for pattern in text_patterns) or \ + any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(full_url) - # Filter links to likely contain exam documents + # Filter to likely exam documents filtered_links = [] for link in links: - # Common file extensions for exam documents + # Common file extensions if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): filtered_links.append(link) continue - + # Common paths for exam documents if any(pattern in link.lower() for pattern in [ "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", @@ -860,12 +734,9 @@ class DownloadManager: async def extract_downloadable_files(self, url, custom_ext_list): found_files = [] try: - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - # Special handling for educational exam sites if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): + ["exam", "test", "pastpaper", "eduexp"]): logger.info("Using specialized handler for educational exam site") # Get direct links to exam files @@ -904,102 +775,54 @@ class DownloadManager: 'url': real_url, 'filename': filename, 'size': size_str, - 'metadata': meta + 'metadata': meta, + 'source_url': url # Add source URL for better tracking }) # If we found exam files with the specialized method, return them if found_files: return found_files - # Standard extraction method if specialized method didn't find files - response = await self.page.goto(url, timeout=30000, wait_until='networkidle') - if not response: - return [] + # Standard extraction method for all pages + await self.browser.goto(url, timeout=30000) - # Check for captchas - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected, file extraction may be limited") - - # Scroll through the page naturally to trigger lazy loading - await self.page.evaluate(""" - (async () => { - const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); - const height = document.body.scrollHeight; - const scrollStep = Math.floor(window.innerHeight / 2); - - for (let i = 0; i < height; i += scrollStep) { - window.scrollTo(0, i); - await delay(100); - } - - window.scrollTo(0, 0); - })() - """) - await self.page.wait_for_timeout(1000) - - final_url = self.page.url - if '.php' in final_url or 'download' in final_url: - real_url = await self.extract_real_download_url(final_url) - if real_url != final_url: - # Try to detect the filename from headers or URL - response = await self.page.request.head(real_url, timeout=15000) - filename = None - - # Try to get from Content-Disposition header - content_disposition = response.headers.get('Content-Disposition', '') - if 'filename=' in content_disposition: - filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) - if filename_match: - filename = filename_match.group(1) - - # If not found in headers, use URL basename - if not filename: - filename = os.path.basename(urlparse(real_url).path) - if not filename or filename == '/': - # Generate a name based on domain - domain = get_domain(real_url) - 
ext = get_file_extension(real_url, '.pdf') - filename = f"file_from_{domain}{ext}" - - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': await self.get_file_size(real_url), - 'metadata': {} - }) - return found_files - - await self.page.wait_for_load_state('networkidle', timeout=30000) - content = await self.page.content() + # Get page content + content = await self.browser.content() soup = BeautifulSoup(content, 'html.parser') + # Define file extensions to look for default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', - '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', - '.pptx', '.odt', '.txt'] + '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', + '.pptx', '.odt', '.txt'] all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) - parsed_base = urlparse(final_url) + # Get base URL for resolving relative links + parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" path_base = os.path.dirname(parsed_base.path) - # Process all anchor tags + # Process all anchor tags for file links for a in soup.find_all('a', href=True): href = a['href'].strip() + # Handle PHP and download links separately if '.php' in href.lower() or 'download' in href.lower(): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + full_url = href if href.startswith('http') else urljoin(base_url, href) real_url = await self.extract_real_download_url(full_url) if real_url and real_url != full_url: + filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file' found_files.append({ 'url': real_url, - 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', + 'filename': filename, 'size': await self.get_file_size(real_url), - 'metadata': {} + 'metadata': {}, + 'source_url': url }) continue - + + # Check for direct file extensions if any(href.lower().endswith(ext) for ext in all_exts): - file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + file_url = href if href.startswith('http') else urljoin(base_url, href) size_str = await self.get_file_size(file_url) meta = {} if file_url.lower().endswith('.pdf'): @@ -1008,9 +831,10 @@ class DownloadManager: 'url': file_url, 'filename': os.path.basename(file_url.split('?')[0]), 'size': size_str, - 'metadata': meta + 'metadata': meta, + 'source_url': url }) - + # Handle Google Drive links elif ("drive.google.com" in href) or ("docs.google.com" in href): file_id = None @@ -1019,299 +843,131 @@ class DownloadManager: if match: file_id = match.group(1) break + if file_id: - # Get file info to determine type and view-only status - file_type, is_view_only = await self.get_google_drive_file_info(file_id) + # Determine if it's a view-only file + is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")) - # Create a more informative filename based on info filename = f"gdrive_{file_id}" - if file_type: - filename = f"{filename}.{file_type}" - - size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") + ext = get_file_extension(href, '.pdf') + if ext != '.': + filename += ext found_files.append({ - 'url': href, # Use original URL + 'url': href, 'filename': filename, - 'size': size_str, + 'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"), 
'metadata': { 'view_only': is_view_only, - 'file_type': file_type, 'file_id': file_id - } + }, + 'source_url': url }) - # Also check for files in other elements (iframe, embed, object, etc.) - other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) - for elem in other_elements: - src = elem.get('src') or elem.get('data') - if src and any(src.lower().endswith(ext) for ext in all_exts): - file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) - size_str = await self.get_file_size(file_url) - meta = {} - if file_url.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(file_url) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': size_str, - 'metadata': meta - }) - - # Check for file links in onclick attributes - onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') - for elem in onclick_elements: - onclick = await elem.get_attribute('onclick') - urls = re.findall(r'(https?://[^\'"]+)', onclick) - for url_match in urls: - if any(url_match.lower().endswith(ext) for ext in all_exts): - size_str = await self.get_file_size(url_match) - meta = {} - if url_match.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(url_match) - found_files.append({ - 'url': url_match, - 'filename': os.path.basename(url_match.split('?')[0]), - 'size': size_str, - 'metadata': meta - }) - - # Also check for data-src and data-url attributes (common in lazy-loaded sites) - data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') - for elem in data_elements: - for attr in ['data-src', 'data-url', 'data-href', 'data-download']: - try: - value = await elem.get_attribute(attr) - if value and any(value.lower().endswith(ext) for ext in all_exts): - file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': await self.get_file_size(file_url), - 'metadata': {} - }) - except: - pass - - # Check script tags for JSON data that might contain file URLs - script_elements = soup.find_all('script', type='application/json') - for script in script_elements: - try: - json_data = json.loads(script.string) - # Look for URL patterns in the JSON data - def extract_urls_from_json(obj, urls_found=None): - if urls_found is None: - urls_found = [] - if isinstance(obj, dict): - for k, v in obj.items(): - # Check if any key contains url-like terms - url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] - if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): - urls_found.append(v) - else: - extract_urls_from_json(v, urls_found) - elif isinstance(obj, list): - for item in obj: - extract_urls_from_json(item, urls_found) - return urls_found - - json_urls = extract_urls_from_json(json_data) - for json_url in json_urls: - if any(json_url.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': json_url, - 'filename': os.path.basename(json_url.split('?')[0]), - 'size': await self.get_file_size(json_url), - 'metadata': {} - }) - except: - pass - - # Check for hidden download buttons or forms - hidden_elements = await self.page.evaluate(""" - () => { - const results = []; - - // Check for hidden forms with download actions - const forms = document.querySelectorAll('form[action*="download"], 
form[action*="file"]'); - for (const form of forms) { - const action = form.getAttribute('action') || ''; - results.push({ - type: 'form', - action: action, - inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { - return {name: input.name, value: input.value}; - }) - }); - } - - // Check for hidden download links/buttons - const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { - const style = window.getComputedStyle(a); - return (style.display === 'none' || style.visibility === 'hidden') && - (a.href.includes('download') || a.href.includes('file')); - }); - - for (const link of hiddenLinks) { - results.push({ - type: 'link', - href: link.href, - text: link.innerText || link.textContent - }); - } - - return results; - } - """) - - # Process hidden elements - for elem in hidden_elements: - if elem['type'] == 'link' and 'href' in elem: - href = elem['href'] - if any(href.lower().endswith(ext) for ext in all_exts): + # Check for embedded content (iframe, embed, object) + for elem_tag in ['iframe', 'embed', 'object', 'source']: + for elem in soup.find_all(elem_tag): + src = elem.get('src') or elem.get('data') + if src and any(src.lower().endswith(ext) for ext in all_exts): + file_url = src if src.startswith('http') else urljoin(base_url, src) found_files.append({ - 'url': href, - 'filename': os.path.basename(href.split('?')[0]), - 'size': await self.get_file_size(href), - 'metadata': {} + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': await self.get_file_size(file_url), + 'metadata': {}, + 'source_url': url }) - # Deduplicate files by URL + # Deduplicate files seen_urls = set() unique_files = [] for f in found_files: if f['url'] not in seen_urls: seen_urls.add(f['url']) unique_files.append(f) - + return unique_files + except Exception as e: logger.error(f"Error extracting files from {url}: {e}") - traceback.print_exc() return [] - async def download_file(self, file_info, save_dir, referer): + async def download_file(self, file_info, save_dir, referer=None): + """Download a file and provide a direct download link""" file_url = file_info['url'] fname = file_info['filename'] + referer = referer or file_info.get('source_url', 'https://www.google.com') + + # Create unique filename to avoid overwriting path = os.path.join(save_dir, fname) base, ext = os.path.splitext(fname) counter = 1 while os.path.exists(path): path = os.path.join(save_dir, f"{base}_{counter}{ext}") counter += 1 + os.makedirs(save_dir, exist_ok=True) try: # Special handling for Google Drive files if "drive.google.com" in file_url or "docs.google.com" in file_url: - # Check if it's marked as view-only in metadata + # For view-only Google Drive files, use specialized method is_view_only = file_info.get('metadata', {}).get('view_only', False) - - # For view-only files, try our most robust approach first if is_view_only: - logger.info(f"Attempting to download view-only file: {file_url}") - result_path = await self.force_download_viewonly(file_info, path) + result_path = await self.download_viewonly_google_drive(file_info, path) if result_path: return result_path - - # If that failed, try the regular download approach - logger.info("Primary method failed, trying fallback methods") - # Try regular download methods - success = await self.download_from_google_drive(file_url, path) - if success: - return path - - # If all methods failed for Google Drive, try one last approach - logger.warning("All standard methods failed, attempting force download") - 
result_path = await self.force_download_viewonly(file_info, path) - return result_path if result_path else None - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Try with direct requests first (faster) - try: - headers = { - 'User-Agent': get_random_user_agent(), - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Referer': referer, - 'DNT': '1' - } + # Try standard Google Drive download + file_id = None + for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: + match = re.search(pattern, file_url) + if match: + file_id = match.group(1) + break - with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: - if response.status_code == 200: - # Check content type to verify it's not HTML/error page - content_type = response.headers.get('Content-Type', '') - if 'text/html' in content_type and not file_url.endswith('.html'): - logger.warning(f"Received HTML instead of expected file: {file_url}") - else: - with open(path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # Verify file was downloaded correctly - if os.path.exists(path) and os.path.getsize(path) > 0: - return path - except Exception as e: - logger.warning(f"Direct download failed: {e}, trying browser approach") + if file_id: + # Try direct download + download_url = f"https://drive.google.com/uc?id={file_id}&export=download" + headers = { + 'User-Agent': get_random_user_agent(), + 'Referer': referer + } - # Original code for non-Google Drive downloads using Playwright - async with self.context.new_page() as page: - headers = { - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Referer': referer - } - - # Try to download with timeout protection - try: - response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) - if response.status == 200: - content = await response.body() + with requests.get(download_url, headers=headers, stream=True) as r: + r.raise_for_status() with open(path, 'wb') as f: - f.write(content) - return path - else: - logger.error(f"Download failed with status {response.status}: {file_url}") - - # Try to extract error information - error_info = await response.text() - logger.debug(f"Error response: {error_info[:200]}...") - - # Check if this might be a captcha or login issue - if detect_captcha(error_info): - logger.warning("Captcha detected during download") - # For HF Spaces, we can't implement browser-based captcha solving here - # Just log the issue for now - except PlaywrightTimeoutError: - logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") - - # Try an alternative approach - using the browser's download manager - try: - logger.info("Trying browser download manager approach") - download_promise = page.wait_for_event("download") - await page.goto(file_url, timeout=60000) - - # Wait for download to start (with timeout) - download = await download_promise - await download.save_as(path) + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) if os.path.exists(path) and os.path.getsize(path) > 0: return path - except Exception as e: - logger.error(f"Browser download manager approach failed: {e}") - + + # Standard file download + headers = { + 'User-Agent': get_random_user_agent(), + 'Referer': referer, + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br' + } + + with requests.get(file_url, headers=headers, stream=True) as r: + r.raise_for_status() + with open(path, 'wb') as f: + for chunk in 
r.iter_content(chunk_size=8192): + f.write(chunk) + + if os.path.exists(path) and os.path.getsize(path) > 0: + return path + else: return None + except Exception as e: logger.error(f"Error downloading {file_url}: {e}") return None - # IMPROVED: Enhanced view-only document download method - async def force_download_viewonly(self, file_info, save_path): - """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs""" + async def download_viewonly_google_drive(self, file_info, save_path): + """Download view-only Google Drive documents""" try: # Extract file ID file_id = file_info.get('metadata', {}).get('file_id') @@ -1327,1353 +983,350 @@ class DownloadManager: logger.error("Could not extract file ID") return None - file_type = file_info.get('metadata', {}).get('file_type', 'pdf') + # Determine file type + file_type = get_file_extension(file_info['url'], '.pdf').lstrip('.') + + # Ensure appropriate extension on save path base, ext = os.path.splitext(save_path) if not ext: save_path = f"{base}.{file_type}" - logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") - - # Create a dedicated browser instance with better resolution and stealth - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-site-isolation-trials', - '--disable-blink-features=AutomationControlled' # Anti-detection - ] - - browser = await self.playwright.chromium.launch( - headless=True, - args=browser_args - ) - - # Use higher resolution for better quality - context = await browser.new_context( - viewport={'width': 1600, 'height': 1200}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - device_scale_factor=2.0, - accept_downloads=True # Critical for the download workflow - ) - - # Add anti-detection script - await context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - lengthComputable: true, - loaded: 100, - total: 100 - })) - }); - - // Handle languages - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - - // Modify hardware concurrency - Object.defineProperty(navigator, 'hardwareConcurrency', { - get: () => 4 - }); - } - """) - - page = await context.new_page() - - try: - # Go to the file view page - logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) - await page.wait_for_load_state('networkidle') - - # Check for any barriers or permissions issues - content = await page.content() - if "the owner has not granted you permission to" in content: - logger.warning("Permission denied error detected") - - # Randomized wait to appear more human-like - await page.wait_for_timeout(random.randint(3000, 7000)) - - # Create temp directory - temp_dir = tempfile.mkdtemp() - - # Special handling for PDFs - if file_type.lower() == 'pdf': - # Use the improved scrolling and detection approach + logger.info(f"Downloading view-only Google Drive file: {file_id}") + + # Create a dedicated browser session + if self.browser_engine == "playwright": + from playwright.async_api import async_playwright + + async with async_playwright() as 
p: + browser = await p.chromium.launch( + headless=True, + args=[ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-blink-features=AutomationControlled' + ] + ) - # Perform some natural mouse movements and scrolling - await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400)) - await page.wait_for_timeout(random.randint(500, 1000)) + # Create context with options for better handling + context = await browser.new_context( + viewport={'width': 1600, 'height': 1200}, + user_agent=get_random_user_agent(), + accept_downloads=True, + ignore_https_errors=True + ) - # Estimate number of pages - estimated_pages = await page.evaluate(""" - () => { - // Method 1: Check page counter text - const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { - const text = el.textContent || ''; - return /\\d+\\s*\\/\\s*\\d+/.test(text); + # Add stealth script + await context.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) }); - - if (pageCounters.length > 0) { - const text = pageCounters[0].textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) return parseInt(match[2]); - } - - // Method 2: Check actual page elements - const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pageElements.length > 0) return pageElements.length; - - // Method 3: Look for page thumbnails - const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); - if (thumbnails.length > 0) return thumbnails.length; - - // Fallback: conservative guess - return 50; - } + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + window.chrome = { runtime: {} }; """) - logger.info(f"Estimated {estimated_pages} pages in PDF") - - # Initial scroll to trigger lazy loading - logger.info("Initial scroll to bottom to trigger lazy loading...") - await page.keyboard.press("End") - await page.wait_for_timeout(3000) - - # Scroll page by page to ensure all pages are loaded - logger.info("Scrolling page by page...") - max_attempts = min(estimated_pages * 3, 300) - attempt = 0 - prev_blob_count = 0 + page = await context.new_page() - while attempt < max_attempts: - blob_count = await page.evaluate(""" - Array.from(document.getElementsByTagName('img')) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .length - """) + try: + # Visit the file + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) + await page.wait_for_load_state('networkidle') - logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") + # Wait for content to load + await page.wait_for_timeout(5000) - if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): - logger.info("All pages appear to be loaded.") - break + # Create temporary directory for processing + temp_dir = tempfile.mkdtemp() - # Alternate between PageDown and End keys for more natural scrolling - if attempt % 3 == 0: - await page.keyboard.press("End") - else: - await page.keyboard.press("PageDown") + # For PDF handling + if file_type == 'pdf': + # Create directory for screenshots + screenshots_dir = os.path.join(temp_dir, "screenshots") + os.makedirs(screenshots_dir, 
exist_ok=True) - # Randomized wait times - await page.wait_for_timeout(random.randint(1500, 3000)) - - # Move mouse randomly to appear more human-like - if attempt % 4 == 0: - await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) - - prev_blob_count = blob_count - attempt += 1 - - # Extra wait to ensure everything is loaded - await page.wait_for_timeout(5000) - - # Set up download event listener for the PDF - download_promise = page.wait_for_event("download") - - # Use jsPDF to generate PDF from loaded pages - logger.info("Generating PDF from loaded pages...") - result = await page.evaluate(r''' - (function() { - return new Promise((resolve, reject) => { - let script = document.createElement("script"); - script.onload = function () { - try { - let pdf = new jsPDF(); - let imgs = Array.from(document.getElementsByTagName("img")) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .sort((a, b) => { - const rectA = a.getBoundingClientRect(); - const rectB = b.getBoundingClientRect(); - return rectA.top - rectB.top; - }); - - console.log(`Found ${imgs.length} valid page images to add to PDF`); - - let added = 0; - for (let i = 0; i < imgs.length; i++) { - let img = imgs[i]; - let canvas = document.createElement("canvas"); - let ctx = canvas.getContext("2d"); - canvas.width = img.width; - canvas.height = img.height; - ctx.drawImage(img, 0, 0, img.width, img.height); - let imgData = canvas.toDataURL("image/jpeg", 1.0); - - if (added > 0) { - pdf.addPage(); - } - - pdf.addImage(imgData, 'JPEG', 0, 0); - added++; + # Get page count + total_pages = await page.evaluate(""" + () => { + // Look for page counters in the interface + const pageCounters = document.querySelectorAll('*'); + for (const el of pageCounters) { + const text = el.textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) { + return parseInt(match[2]); } - - pdf.save("download.pdf"); - resolve({success: true, pageCount: added}); - } catch (error) { - reject({success: false, error: error.toString()}); } - }; - - script.onerror = function() { - reject({success: false, error: "Failed to load jsPDF library"}); - }; - - script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; - document.body.appendChild(script); - }); - })(); - ''') - - if not result.get('success', False): - logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") - - # Try fallback approach - screenshot method - logger.info("Trying fallback screenshot method...") - - # Navigate back to the first page - await page.evaluate(""" - () => { - // Find and click the "first page" button if available - const buttons = Array.from(document.querySelectorAll('button')); - const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); - if (firstPageBtn) firstPageBtn.click(); - } - """) - await page.wait_for_timeout(1000); - - # Create a PDF by taking screenshots of each page - screenshots = [] - current_page = 1 - max_pages = estimated_pages - - # Create a PDF using the reportlab package - while current_page <= max_pages: - screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") - - # Try to find the current page element - page_elem = await page.query_selector('.drive-viewer-paginated-page') - if page_elem: - await page_elem.screenshot(path=screenshot_path) - else: - # Fallback to full page screenshot - await page.screenshot(path=screenshot_path) - - screenshots.append(screenshot_path) - - # Try to navigate to 
next page - next_btn = await page.query_selector('button[aria-label="Next page"]') - if next_btn: - is_disabled = await next_btn.get_attribute('disabled') - if is_disabled: - logger.info(f"Reached end of document at page {current_page}") - break - - await next_btn.click() - await page.wait_for_timeout(1000) - current_page += 1 - else: - break - - # Create PDF from screenshots - if screenshots: - first_img = Image.open(screenshots[0]) - width, height = first_img.size + + // Look for paginated pages + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pages.length > 0) return pages.length; + + // Default if we can't determine + return 20; + } + """) - c = canvas.Canvas(save_path, pagesize=(width, height)) - for screenshot in screenshots: - img = Image.open(screenshot) - c.drawImage(screenshot, 0, 0, width, height) - c.showPage() - c.save() + logger.info(f"PDF has approximately {total_pages} pages") - # Clean up screenshots - for screenshot in screenshots: - os.remove(screenshot) + # Take screenshots of each page + screenshots = [] - return save_path - - return None - - logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") - - # Wait for the download and save it - download = await download_promise - await download.save_as(save_path) - - # Clean up temp directory - try: - os.rmdir(temp_dir) - except: - pass - - else: - # Non-PDF file handling - screenshot_path = os.path.join(temp_dir, "file.png") - await page.screenshot(path=screenshot_path) - - if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: - # For document types, try to export directly - await self.export_google_doc(file_id, file_type, save_path) - else: - # For other types, save the screenshot with appropriate extension - shutil.copy(screenshot_path, save_path) - - os.remove(screenshot_path) - - # Close browser - await browser.close() - - # Verify file exists and has content - if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: - logger.info(f"Successfully downloaded file to {save_path}") - return save_path - else: - logger.error(f"Generated file is too small or missing: {save_path}") - return None - - except Exception as e: - logger.error(f"Error during force download: {e}") - if browser: - await browser.close() - return None - - except Exception as e: - logger.error(f"Force download preparation failed: {e}") - return None - - async def download_from_google_drive(self, url, save_path): - """Enhanced method to download from Google Drive with multiple fallback approaches""" - # Extract the file ID from different URL formats - file_id = None - url_patterns = [ - r'drive\.google\.com/file/d/([^/]+)', - r'drive\.google\.com/open\?id=([^&]+)', - r'docs\.google\.com/\w+/d/([^/]+)', - r'id=([^&]+)', - r'drive\.google\.com/uc\?id=([^&]+)', - ] - - for pattern in url_patterns: - match = re.search(pattern, url) - if match: - file_id = match.group(1) - break - - if not file_id: - logger.error(f"Could not extract file ID from URL: {url}") - return False - - # Determine file type first (important for handling different file types) - file_type, is_view_only = await self.get_google_drive_file_info(file_id) - logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") - - base, ext = os.path.splitext(save_path) - if not ext and file_type: - # Add the correct extension if missing - save_path = f"{base}.{file_type}" - - # For view-only files, use specialized approaches - if is_view_only: - # Approach 1: For PDFs, use the JS method - if file_type == 'pdf': - 
success = await self.download_viewonly_pdf_with_js(file_id, save_path)
-                if success:
-                    return True
-            
-            # Approach 2: For Google Docs, Sheets, etc., use export API
-            if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
-                success = await self.export_google_doc(file_id, file_type, save_path)
-                if success:
-                    return True
-            
-            # Approach 3: Try the direct screenshot method for any view-only file
-            success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
-            if success:
-                return True
-        
-        # Try standard approaches for non-view-only files
-        try:
-            # Try direct download link first (fastest)
-            direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
-            
-            # Add anti-bot headers
-            headers = {
-                'User-Agent': get_random_user_agent(),
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-                'Accept-Language': 'en-US,en;q=0.9',
-                'Referer': 'https://drive.google.com/',
-                'DNT': '1'
-            }
-            
-            # Try with streaming to handle larger files
-            with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r:
-                if r.status_code == 200:
-                    # Check if we got HTML instead of the file
-                    content_type = r.headers.get('Content-Type', '')
-                    if 'text/html' in content_type and not file_id.endswith('.html'):
-                        logger.warning("Received HTML instead of file, trying with session cookies")
-                    else:
-                        # Looks like we got the actual file
-                        with open(save_path, 'wb') as f:
-                            for chunk in r.iter_content(chunk_size=8192):
-                                if chunk:
-                                    f.write(chunk)
-                        
-                        # Verify file exists and has content
-                        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                            logger.info("Direct download successful")
-                            return True
-            
-            # Try with requests and session cookies
-            session = requests.Session()
-            session.headers.update({'User-Agent': get_random_user_agent()})
-            
-            # Visit the page first to get cookies
-            session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
-            
-            # Try download
-            url = f"https://drive.google.com/uc?id={file_id}&export=download"
-            response = session.get(url, stream=True, timeout=30)
-            
-            # Check for confirmation token
-            confirmation_token = None
-            for k, v in response.cookies.items():
-                if k.startswith('download_warning'):
-                    confirmation_token = v
-                    break
-            
-            # Use confirmation token if found
-            if confirmation_token:
-                url = f"{url}&confirm={confirmation_token}"
-                response = session.get(url, stream=True, timeout=60)
-            
-            # Check if we're getting HTML instead of the file
-            content_type = response.headers.get('Content-Type', '')
-            if 'text/html' in content_type:
-                logger.warning("Received HTML instead of file - likely download restriction")
-            else:
-                with open(save_path, 'wb') as f:
-                    for chunk in response.iter_content(chunk_size=1024*1024):
-                        if chunk:
-                            f.write(chunk)
-                
-                if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                    # Sniff the first bytes to make sure we saved a real file, not an HTML error page
-                    with open(save_path, 'rb') as f:
-                        content = f.read(100)
-                        if b'<html' not in content and b'<!DOCTYPE' not in content:
-                            logger.info("Successfully downloaded with requests session")
-                            return True
-        except Exception as e:
-            logger.warning(f"Requests session download failed: {e}")
-        
-        # Try browser-based approach as last resort
-        try:
-            async with self.context.new_page() as page:
-                # Visit the file view page first to get cookies
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
-                await page.wait_for_timeout(3000)
-                
-                # Set up download event listener
-                download_promise = page.wait_for_event("download")
-                
-                # Try to trigger the download button click
-                download_button = await
page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') - if download_button: - await download_button.click() - - # Wait for download to start - try: - download = await download_promise - await download.save_as(save_path) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - except Exception as e: - logger.error(f"Error during browser download: {e}") - return False - else: - # Try the export download URL - await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) - - # Look for and click any download buttons or links - download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') - for elem in download_elements: - try: - await elem.click() - # Wait a bit to see if download starts - try: - download = await download_promise - await download.save_as(save_path) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - except: - pass - except: - continue - except Exception as e: - logger.error(f"Browser-based download attempt failed: {e}") - - logger.warning("All standard download methods failed") - return False - - async def download_viewonly_pdf_with_js(self, file_id, save_path): - """Download view-only PDF using the enhanced blob image caching technique""" - try: - # Create a dedicated browser instance with stealth capabilities - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - '--disable-blink-features=AutomationControlled' # Anti-detection - ] - - browser = await self.playwright.chromium.launch( - headless=True, - args=browser_args - ) - - # Setup stealth context - context = await browser.new_context( - viewport={'width': 1600, 'height': 1200}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - accept_downloads=True, # Critical for handling the download event - ignore_https_errors=True - ) - - # Add stealth script - await context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change plugins and languages to appear more human - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - lengthComputable: true, - loaded: 100, - total: 100 - })) - }); - - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - } - """) - - page = await context.new_page() - - try: - # Step 1: Navigate to the file with human-like behavior - logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) - await page.wait_for_load_state('networkidle') - - # Perform human-like interactions - await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) - await page.wait_for_timeout(random.randint(2000, 5000)) - - # Step 2: Estimate the number of pages - estimated_pages = await page.evaluate(""" - () => { - // Look for page counter in the interface - const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { - const text = el.textContent || ''; - return /\\d+\\s*\\/\\s*\\d+/.test(text); - }); - - if (pageCounters.length > 0) { - const text = pageCounters[0].textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) return parseInt(match[2]); - } - - // If we can't find a counter, 
check actual pages - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pages.length > 0) return pages.length; - - // Default to a reasonable number if we can't determine - return 50; - } - """) - - logger.info(f"Estimated number of pages: {estimated_pages}") - - # Step 3: Initial scroll to trigger loading - logger.info("Initial scroll to bottom to trigger lazy loading...") - await page.keyboard.press("End") - await page.wait_for_timeout(3000) - - # Step 4: Wait for all pages to load with better feedback and randomization - logger.info("Scrolling through document to load all pages...") - max_attempts = min(estimated_pages * 3, 300) - attempt = 0 - prev_blob_count = 0 - consecutive_same_count = 0 - - while attempt < max_attempts: - # Count blob images (which are the PDF pages) - blob_count = await page.evaluate(""" - Array.from(document.getElementsByTagName('img')) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .length - """) - - logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") - - # Check if we've loaded all pages or if we're stuck - if blob_count >= estimated_pages: - logger.info(f"All {estimated_pages} pages appear to be loaded.") - break - - if blob_count == prev_blob_count: - consecutive_same_count += 1 - if consecutive_same_count >= 5 and blob_count > 0: - logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") - break - else: - consecutive_same_count = 0 - - # Mix up the scrolling approach for more human-like behavior - scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) - - if scroll_action == "PageDown": - await page.keyboard.press("PageDown") - elif scroll_action == "End": - await page.keyboard.press("End") - elif scroll_action == "ArrowDown": - # Press arrow down multiple times - for _ in range(random.randint(5, 15)): - await page.keyboard.press("ArrowDown") - await page.wait_for_timeout(random.randint(50, 150)) - else: # mouse - # Scroll using mouse wheel - current_y = random.randint(300, 700) - await page.mouse.move(x=random.randint(300, 800), y=current_y) - await page.mouse.wheel(0, random.randint(300, 800)) - - # Random wait between scrolls - await page.wait_for_timeout(random.randint(1000, 3000)) - - prev_blob_count = blob_count - attempt += 1 - - # Extra wait to ensure everything is fully loaded - await page.wait_for_timeout(5000) - - # Step 5: Set up a download event listener - download_promise = page.wait_for_event("download") - - # Step 6: Inject the jsPDF script to generate PDF - logger.info("Generating PDF from loaded pages...") - result = await page.evaluate(r''' - (function() { - return new Promise((resolve, reject) => { - let script = document.createElement("script"); - script.onload = function () { - try { - let pdf = new jsPDF(); - let imgs = document.getElementsByTagName("img"); - let validImages = []; + # First try with the page element method + for i in range(min(total_pages, 100)): # Limit to 100 pages for safety + try: + # Navigate to specific page + if i > 0: + await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()") + await page.wait_for_timeout(500) - // First collect all valid blob images - for (let i = 0; i < imgs.length; i++) { - let img = imgs[i]; - if (!/^blob:/.test(img.src)) continue; - if (img.width < 100 || img.height < 100) continue; - validImages.push(img); - } + # Wait for the page to render + await page.wait_for_timeout(500) - 
// Sort by position in the document - validImages.sort((a, b) => { - const rectA = a.getBoundingClientRect(); - const rectB = b.getBoundingClientRect(); - return rectA.top - rectB.top; - }); + # Take screenshot + screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") - console.log(`Found ${validImages.length} valid page images to add to PDF`); + # Try to find the page element + page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})') + if page_element: + await page_element.screenshot(path=screenshot_path) + else: + # Fallback to viewport screenshot + await page.screenshot(path=screenshot_path) - let added = 0; - // Process each image as a page - for (let i = 0; i < validImages.length; i++) { - let img = validImages[i]; - let canvas = document.createElement("canvas"); - let ctx = canvas.getContext("2d"); - canvas.width = img.width; - canvas.height = img.height; - ctx.drawImage(img, 0, 0, img.width, img.height); - let imgData = canvas.toDataURL("image/jpeg", 1.0); - - if (added > 0) { - pdf.addPage(); - } - - pdf.addImage(imgData, 'JPEG', 0, 0); - added++; - } + screenshots.append(screenshot_path) - pdf.save("download.pdf"); - resolve({success: true, pageCount: added}); - } catch (error) { - reject({success: false, error: error.toString()}); - } - }; - - script.onerror = function() { - reject({success: false, error: "Failed to load jsPDF library"}); - }; - - // Use a reliable CDN - script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; - document.body.appendChild(script); - }); - })(); - ''') - - if not result.get('success'): - logger.error(f"Error in PDF generation: {result.get('error')}") - return False - - logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") - - # Step 7: Wait for the download to complete and save the file - download = await download_promise - - # Step 8: Save the downloaded file to the specified path - await download.save_as(save_path) - logger.info(f"Successfully saved PDF to {save_path}") - - return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 - - finally: - await browser.close() - - except Exception as e: - logger.error(f"Error in viewonly PDF download process: {e}") - return False - - async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): - """Download any view-only file by taking screenshots""" - try: - async with self.context.new_page() as page: - # Set high-resolution viewport - await page.set_viewport_size({"width": 1600, "height": 1200}) - - # Navigate to the file - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) - - # Make sure the file is loaded - await page.wait_for_load_state('networkidle') - await page.wait_for_timeout(3000) # Extra time for rendering - - # Create directory for screenshots if multiple pages - base_dir = os.path.dirname(save_path) - base_name = os.path.splitext(os.path.basename(save_path))[0] - screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") - os.makedirs(screenshots_dir, exist_ok=True) - - # Check if it's a multi-page document - is_multi_page = await page.evaluate(""" - () => { - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - return pages.length > 1; - } - """) - - if is_multi_page and file_type == 'pdf': - # For multi-page PDFs, take screenshots of each page - page_count = await page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const 
pages = document.querySelectorAll('.drive-viewer-paginated-page'); - const container = document.querySelector('.drive-viewer-paginated-scrollable'); - - if (!container || pages.length === 0) return 0; - - // Scroll through to make sure all pages are loaded - const scrollHeight = container.scrollHeight; - const viewportHeight = container.clientHeight; - const scrollStep = viewportHeight; - - for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { - container.scrollTo(0, scrollPos); - await delay(300); - } - - // Scroll back to top - container.scrollTo(0, 0); - await delay(300); + # Check if we should continue to next page + if i < total_pages - 1: + next_button = await page.query_selector('button[aria-label="Next page"]') + if next_button: + # Check if button is disabled + is_disabled = await next_button.get_attribute('disabled') + if is_disabled: + logger.info(f"Reached last page at page {i+1}") + break + + # Click next page + await next_button.click() + await page.wait_for_timeout(1000) + else: + logger.info("Next page button not found") + break + except Exception as e: + logger.error(f"Error capturing page {i+1}: {e}") + continue - return pages.length; - } - """) - - logger.info(f"Found {page_count} pages in document") - - # Take screenshots of each page - screenshots = [] - for i in range(page_count): - # Scroll to page - await page.evaluate(f""" - async () => {{ - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pages.length <= {i}) return false; - - pages[{i}].scrollIntoView(); - await delay(500); - return true; - }} - """) - - # Take screenshot - screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") - await page.screenshot(path=screenshot_path, clip={ - 'x': 0, - 'y': 0, - 'width': 1600, - 'height': 1200 - }) - screenshots.append(screenshot_path) - - # Combine screenshots into PDF - c = canvas.Canvas(save_path) - for screenshot in screenshots: - img = Image.open(screenshot) - width, height = img.size - - # Add page to PDF - c.setPageSize((width, height)) - c.drawImage(screenshot, 0, 0, width, height) - c.showPage() - - c.save() - - # Clean up screenshots - for screenshot in screenshots: - os.remove(screenshot) - os.rmdir(screenshots_dir) - - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - else: - # For single-page or non-PDF files, just take one screenshot - screenshot_path = os.path.join(screenshots_dir, "screenshot.png") - await page.screenshot(path=screenshot_path, fullPage=True) - - # Convert to requested format if needed - if file_type == 'pdf': - # Create PDF from screenshot - img = Image.open(screenshot_path) - width, height = img.size - - c = canvas.Canvas(save_path, pagesize=(width, height)) - c.drawImage(screenshot_path, 0, 0, width, height) - c.save() - else: - # Just copy the screenshot to the destination with proper extension - shutil.copy(screenshot_path, save_path) - - # Clean up - os.remove(screenshot_path) - os.rmdir(screenshots_dir) - - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - - except Exception as e: - logger.error(f"Error taking screenshots: {e}") - return False - - async def export_google_doc(self, file_id, file_type, save_path): - """Export Google Docs/Sheets/Slides to downloadable formats""" - try: - # Map file types to export formats - export_formats = { - 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx - 'docx': 
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx - 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx - 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'pdf': 'application/pdf', - } - - export_format = export_formats.get(file_type, 'application/pdf') - export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" - - if 'sheet' in file_type or 'xlsx' in file_type: - export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" - elif 'ppt' in file_type or 'presentation' in file_type: - export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" - elif file_type == 'pdf': - export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" - - async with self.context.new_page() as page: - # Get cookies from the main view page first - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') - - # Now try the export - response = await page.goto(export_url, wait_until='networkidle') - - if response.status == 200: - content = await response.body() - with open(save_path, 'wb') as f: - f.write(content) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - else: - logger.warning(f"Export failed with status {response.status}") - return False - - except Exception as e: - logger.error(f"Error exporting Google Doc: {e}") - return False - - async def get_google_drive_file_info(self, file_id): - """Get file type and view-only status from Google Drive""" - file_type = None - is_view_only = False - - try: - async with self.context.new_page() as page: - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) - - # Check if view-only - view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') - is_view_only = view_only_text is not None - - # Check for Google Docs viewer - gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') - gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') - gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') - - if gdocs_viewer: - file_type = 'docx' - elif gsheets_viewer: - file_type = 'xlsx' - elif gslides_viewer: - file_type = 'pptx' - else: - # Check for PDF viewer - pdf_viewer = await page.query_selector('embed[type="application/pdf"]') - if pdf_viewer: - file_type = 'pdf' - else: - # Check for image viewer - img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') - if img_viewer: - # Get image type from src - img_src = await img_viewer.get_attribute('src') - if 'jpg' in img_src or 'jpeg' in img_src: - file_type = 'jpg' - elif 'png' in img_src: - file_type = 'png' + # Create PDF from screenshots + if screenshots: + # Get dimensions from first screenshot + first_img = Image.open(screenshots[0]) + width, height = first_img.size + + # Create PDF + c = canvas.Canvas(save_path, pagesize=(width, height)) + for screenshot in screenshots: + c.drawImage(screenshot, 0, 0, width, height) + c.showPage() + c.save() + + # Clean up screenshots + for screenshot in screenshots: + os.remove(screenshot) + + # Clean up temp directory + shutil.rmtree(temp_dir, ignore_errors=True) + + return save_path 
else: - file_type = 'jpg' # Default to jpg + logger.error("No screenshots captured") else: - # Generic file type fallback - file_type = 'pdf' # Default to PDF - - # If still no type, check filename - if not file_type: - title_element = await page.query_selector('div[role="heading"]') - if title_element: - title = await title_element.text_content() - if title: - ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) - if ext_match: - file_type = ext_match.group(1).lower() - + # For non-PDF files, just take a screenshot + screenshot_path = os.path.join(temp_dir, "file.png") + await page.screenshot(path=screenshot_path) + + # Copy to destination + shutil.copy(screenshot_path, save_path) + + # Clean up + os.remove(screenshot_path) + shutil.rmtree(temp_dir, ignore_errors=True) + + return save_path + finally: + await browser.close() + elif self.browser_engine == "pyppeteer": + # Similar implementation for Pyppeteer + pass + + return None except Exception as e: - logger.error(f"Error getting Google Drive file info: {e}") - file_type = 'pdf' # Default to PDF if we can't determine - - return file_type, is_view_only + logger.error(f"Error downloading view-only file: {e}") + return None - # IMPROVED: Enhanced sublink extraction method async def get_sublinks(self, url, limit=10000): - """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" + """Extract all sublinks from a website""" links = set() try: - logger.info(f"Fetching sublinks from: {url}") + logger.info(f"Extracting sublinks from {url}") - # Special handling for educational sites like phsms.cloud.ncnu.edu.tw + # Special handling for educational sites if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): - logger.info("Using specialized exam site sublink extraction") + ["exam", "test", "pastpaper", "eduexp"]): edu_links = await self.get_edu_exam_links(url) for link in edu_links: links.add(link) - # If we found a good number of links with the specialized method, return them if len(links) > 5: logger.info(f"Found {len(links)} sublinks with specialized method") return list(links)[:limit] - # Rotate proxy if needed - await self.rotate_proxy_if_needed() + # Standard link extraction for all sites + await self.browser.goto(url, timeout=30000) - # Standard sublink extraction for all sites - await self.page.goto(url, timeout=30000, wait_until='networkidle') + # Get page content + content = await self.browser.content() + soup = BeautifulSoup(content, 'html.parser') # Get base URL for resolving relative links parsed_base = urlparse(url) base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - path_base = os.path.dirname(parsed_base.path) - # Perform initial scrolling to load lazy content - await self.page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const height = document.body.scrollHeight; - const step = Math.floor(window.innerHeight / 2); - - for (let i = 0; i < height; i += step) { - window.scrollTo(0, i); - await delay(150); - } + # Extract all links from the page + for a in soup.find_all('a', href=True): + href = a['href'] + if href and not href.startswith('javascript:') and not href.startswith('#'): + # Resolve relative URLs + if href.startswith('/'): + full_url = f"{base_url}{href}" + elif href.startswith('http'): + full_url = href + else: + full_url = urljoin(url, href) - window.scrollTo(0, 0); - } - """) - await self.page.wait_for_timeout(1000) + links.add(full_url) - 
# Check if page has ASP.NET elements which might need special handling - is_aspnet = await self.page.evaluate(''' - () => { - return document.querySelector('form#aspnetForm') !== null || - document.querySelector('input[name="__VIEWSTATE"]') !== null; - } - ''') + # Extract iframe sources + for iframe in soup.find_all('iframe', src=True): + src = iframe['src'] + if src and not src.startswith('javascript:') and not src.startswith('about:'): + full_url = src if src.startswith('http') else urljoin(url, src) + links.add(full_url) - if is_aspnet: - logger.info("Detected ASP.NET page, using enhanced extraction method") - - # Try to interact with ASP.NET controls that might reveal more links - # Look for dropdowns, buttons, and grid elements - dropdowns = await self.page.query_selector_all('select') - buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') - - # Try interacting with dropdowns first - for dropdown in dropdowns: - try: - # Get all options - options = await self.page.evaluate(''' - (dropdown) => { - return Array.from(dropdown.options).map(o => o.value); - } - ''', dropdown) - - # Try selecting each option - for option in options: - if option: - await dropdown.select_option(value=option) - await self.page.wait_for_timeout(1000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error interacting with dropdown: {e}") - - # Try clicking buttons (but avoid dangerous ones like "delete") - safe_buttons = [] - for button in buttons: - button_text = await button.text_content() or "" - button_value = await button.get_attribute("value") or "" - button_id = await button.get_attribute("id") or "" - combined_text = (button_text + button_value + button_id).lower() - - # Skip potentially destructive buttons - if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): - continue - - # Prioritize buttons that might show more content - if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): - safe_buttons.append(button) + return list(links)[:limit] + except Exception as e: + logger.error(f"Error extracting sublinks: {e}") + return list(links)[:limit] + + @celery_app.task + def download_file_task(file_info, save_dir, referer=None): + """Celery task for downloading files asynchronously""" + # This function runs in a separate worker process + file_url = file_info['url'] + fname = file_info['filename'] + referer = referer or file_info.get('source_url', 'https://www.google.com') + + # Create unique filename + path = os.path.join(save_dir, fname) + base, ext = os.path.splitext(fname) + counter = 1 + while os.path.exists(path): + path = os.path.join(save_dir, f"{base}_{counter}{ext}") + counter += 1 + + os.makedirs(save_dir, exist_ok=True) + + try: + # Handle Google Drive files + if "drive.google.com" in file_url or "docs.google.com" in file_url: + # Extract file ID + file_id = None + for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: + match = re.search(pattern, file_url) + if match: + file_id = match.group(1) + break - # Click the safe buttons - for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks - try: - await button.click() - await self.page.wait_for_timeout(1000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links 
that appeared
-                            await self.extract_all_link_types(links, base_url, path_base)
-                        except Exception as e:
-                            logger.warning(f"Error clicking button: {e}")
-            
-            # Extract links from the initial page state
-            await self.extract_all_link_types(links, base_url, path_base)
-            
-            # Look specifically for links inside grid/table views which are common in ASP.NET applications
-            grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
-            for cell in grid_cells:
-                try:
-                    href = await cell.get_attribute('href')
-                    if href:
-                        full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
-                        links.add(full_url)
-                except Exception as e:
-                    logger.warning(f"Error extracting grid link: {e}")
-            
-            # Extract links from onclick attributes and javascript:__doPostBack calls
-            postback_links = await self.page.evaluate('''
-                () => {
-                    const results = [];
-                    // Find elements with onclick containing __doPostBack
-                    const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
-                    for (const el of elements) {
-                        // Extract the postback target
-                        const onclick = el.getAttribute('onclick') || '';
-                        const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
-                        if (match && match[1]) {
-                            // Get the visible text to use as description
-                            const text = el.innerText || el.textContent || 'Link';
-                            results.push({
-                                id: match[1],
-                                text: text.trim()
-                            });
-                        }
-                    }
+                if file_id:
+                    # Try direct download
+                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
+                    headers = {
+                        'User-Agent': get_random_user_agent(),
+                        'Referer': referer
+                    }
-                    return results;
-                }
-            ''')
-            
-            # Try interacting with some of the postback links
-            for postback in postback_links[:10]:  # Limit to first 10 to avoid too many interactions
-                try:
-                    logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
-                    await self.page.evaluate(f'''
-                        () => {{
-                            if (typeof __doPostBack === 'function') {{
-                                __doPostBack('{postback["id"]}', '');
-                            }}
-                        }}
-                    ''')
-                    await self.page.wait_for_timeout(1500)
-                    await self.page.wait_for_load_state('networkidle', timeout=5000)
-                    
-                    # Extract any new links that appeared
-                    await self.extract_all_link_types(links, base_url, path_base)
-                except Exception as e:
-                    logger.warning(f"Error with postback: {e}")
+                    
+                    with requests.get(download_url, headers=headers, stream=True) as r:
+                        if r.status_code == 200:
+                            with open(path, 'wb') as f:
+                                for chunk in r.iter_content(chunk_size=8192):
+                                    f.write(chunk)
+                            
+                            # Check if this is HTML (common for Google Drive restrictions)
+                            with open(path, 'rb') as f:
+                                content_start = f.read(100).decode('utf-8', errors='ignore')
+                                if '<html' in content_start.lower() or '<!doctype' in content_start.lower():
+                                    return {'status': 'error', 'message': 'Received HTML instead of file'}
-                            if ">" == el_text.strip() or "→" == el_text.strip():
-                                logger.info(f"Clicking pagination control: {el_text}")
-                                await el.click()
-                                await self.page.wait_for_timeout(2000)
-                                await self.page.wait_for_load_state('networkidle', timeout=5000)
-                                
-                                # Get new links from this page
-                                await self.extract_all_link_types(links, base_url, path_base)
-                        except Exception as e:
-                            logger.warning(f"Error clicking pagination: {e}")
-            
-            # Check for hidden links that might be revealed by JavaScript
-            hidden_links = await self.page.evaluate("""
-                () => {
-                    // Try to execute common JavaScript patterns that reveal hidden content
-                    try {
-                        // Common patterns used in websites to initially hide content
-                        const hiddenContainers = document.querySelectorAll(
-                            '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]'
-                        );
-                        
-                        // Attempt to make them visible
-                        hiddenContainers.forEach(el => {
-                            
el.style.display = 'block'; - el.style.visibility = 'visible'; - el.classList.remove('hidden', 'hide'); - }); - - // Return any newly visible links - return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); - } catch (e) { - return []; - } - } - """) - - # Add any newly discovered links - for href in hidden_links: - if href and not href.startswith('javascript:'): - links.add(href) - - logger.info(f"Found {len(links)} sublinks") - return list(links)[:limit] - + return {'status': 'success', 'path': path} + else: + return {'status': 'error', 'message': f"HTTP error: {r.status_code}"} + except Exception as e: - logger.error(f"Error getting sublinks from {url}: {e}") - return list(links)[:limit] # Return what we have so far - - async def extract_all_link_types(self, links_set, base_url, path_base): - """Extract all types of links from the current page""" - # Get all tag links - a_links = await self.page.query_selector_all('a[href]') - for a in a_links: - try: - href = await a.get_attribute('href') - if href and not href.startswith('javascript:') and not href.startswith('#'): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Get iframe sources - iframes = await self.page.query_selector_all('iframe[src]') - for iframe in iframes: - try: - src = await iframe.get_attribute('src') - if src and not src.startswith('javascript:') and not src.startswith('about:'): - full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Get links from onclick attributes that reference URLs - onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') - for el in onclick_elements: - try: - onclick = await el.get_attribute('onclick') - urls = re.findall(r'(https?://[^\'"]+)', onclick) - for url in urls: - links_set.add(url) - except Exception: - pass - - # Look for URLs in data-* attributes - data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') - for el in data_elements: - for attr in ['data-url', 'data-href', 'data-src']: - try: - value = await el.get_attribute(attr) - if value and not value.startswith('javascript:'): - full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Look for special anchor links that might not have href attributes - special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') - for anchor in special_anchors: - try: - href = await anchor.get_attribute('href') - if href and not href.startswith('javascript:') and not href.startswith('#'): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Extract links from JSON data embedded in the page - script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') - for script in script_elements: - try: - script_content = await script.text_content() - if script_content: - # Look for URLs in the JSON content - urls = re.findall(r'(https?://[^\'"]+)', script_content) - for url in urls: - links_set.add(url) - except Exception: - pass - - def resolve_relative_url(self, relative_url, base_url, path_base): - """Properly resolve 
relative URLs considering multiple formats""" - if relative_url.startswith('/'): - # Absolute path relative to domain - return f"{base_url}{relative_url}" - elif relative_url.startswith('./'): - # Explicit relative path - return f"{base_url}{path_base}/{relative_url[2:]}" - elif relative_url.startswith('../'): - # Parent directory - parent_path = '/'.join(path_base.split('/')[:-1]) - return f"{base_url}{parent_path}/{relative_url[3:]}" - else: - # Regular relative path - return f"{base_url}{path_base}/{relative_url}" + return {'status': 'error', 'message': str(e)} async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): + """Perform deep search for files on a website and its subpages""" if not custom_ext_list: custom_ext_list = [] + + # Create progress indicators progress_text = st.empty() progress_bar = st.progress(0) file_count_text = st.empty() try: progress_text.text("Analyzing main page...") - # Special handling for ASP.NET pages - is_aspnet = False - try: - await self.page.goto(url, timeout=30000, wait_until='networkidle') - is_aspnet = await self.page.evaluate(''' - () => { - return document.querySelector('form#aspnetForm') !== null || - document.querySelector('input[name="__VIEWSTATE"]') !== null; - } - ''') - except Exception: - pass - # Extract files from main page + # Extract files from main page first main_files = await self.extract_downloadable_files(url, custom_ext_list) initial_count = len(main_files) file_count_text.text(f"Found {initial_count} files on main page") - # Get sublinks with enhanced method + # Get sublinks progress_text.text("Getting sublinks...") sublinks = await self.get_sublinks(url, sublink_limit) total_links = len(sublinks) progress_text.text(f"Found {total_links} sublinks to process") - # Always include files from the main page, regardless of sublinks - all_files = main_files - - if not sublinks: - progress_bar.progress(1.0) - return all_files + # Initialize all_files with main_files to ensure they're included + all_files = main_files.copy() # Process each sublink for i, sublink in enumerate(sublinks, 1): - progress = i / total_links + progress = i / max(total_links, 1) # Avoid division by zero progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") progress_bar.progress(progress) try: - # Use a longer timeout for ASP.NET pages which can be slower - sub_timeout = timeout * 2 if is_aspnet else timeout - # Extract files from sublink sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) all_files.extend(sub_files) @@ -2689,12 +1342,13 @@ class DownloadManager: seen_urls.add(f['url']) unique_files.append(f) - final_count = len(unique_files) + # Complete progress progress_text.text(f"Deep search complete!") - file_count_text.text(f"Found {final_count} unique files") + file_count_text.text(f"Found {len(unique_files)} unique files") progress_bar.progress(1.0) + return unique_files - + except Exception as e: logger.error(f"Deep search error: {e}") progress_text.text(f"Error during deep search: {str(e)}") @@ -2710,12 +1364,7 @@ class DownloadManager: def main(): st.title("Advanced File Downloader") - # Initialize playwright if needed - if "playwright_installed" not in st.session_state: - with st.spinner("Setting up browser automation. 
This may take a minute..."): - install_playwright_dependencies() - st.session_state.playwright_installed = True - + # Initialize session state if "initialized" not in st.session_state: st.session_state.initialized = True st.session_state.discovered_files = [] @@ -2725,17 +1374,44 @@ def main(): st.session_state.do_deep_search = False st.session_state.deep_search_url = None st.session_state.search_results = [] - + st.session_state.download_urls = {} # For direct download links + + # Install dependencies if needed + if "dependencies_installed" not in st.session_state: + with st.spinner("Setting up dependencies. This may take a minute..."): + st.session_state.dependencies_installed = setup_dependencies() + check_services() + + # Sidebar options with st.sidebar: - mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select") - with st.expander("Advanced Options", expanded=True): - custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt") - max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page") - sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink") - use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox") + mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Single File"], key="mode_select") + + with st.expander("Search Options", expanded=True): + search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine") + browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine") + custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", + help="Enter extensions like .csv, .txt") + max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks") + sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout") + + with st.expander("Advanced Options", expanded=False): + use_proxy = st.checkbox("Use Proxy", key="use_proxy") proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input") - use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox") - + use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth", + help="Makes browser harder to detect as automated") + enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"], + key="enable_intercept", + help="Intercept network traffic to find additional files") + if enable_network_intercept: + NETWORK_INTERCEPTOR_CONFIG["enabled"] = True + intercept_types = st.multiselect("Intercept Types", + ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"], + default=["xhr", "fetch", "document", "media"], + key="intercept_types") + NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types + else: + NETWORK_INTERCEPTOR_CONFIG["enabled"] = False + with st.expander("Google Drive Integration", expanded=False): if st.button("Start Google Sign-In", key="google_signin_btn"): auth_url = get_google_auth_url() @@ -2745,97 +1421,75 @@ def main(): creds, msg = 
exchange_code_for_credentials(auth_code) st.session_state.google_creds = creds st.write(msg) - - with st.expander("Advanced Browser Settings", expanded=False): - # Captcha handling options - st.write("**Captcha Handling**") - captcha_option = st.radio( - "Captcha Detection:", - ["Auto-detect only", "Manual solve (shows captcha)"], - index=0, - key="captcha_option" - ) - - # Proxy rotation settings - st.write("**Proxy Rotation**") - enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation") - if enable_rotation: - PROXY_ROTATION_CONFIG["enabled"] = True - proxy_list = st.text_area( - "Proxy List (one per line)", - placeholder="http://proxy1:port\nhttp://proxy2:port", - key="proxy_list" - ) - if proxy_list: - PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()] - rotation_interval = st.slider( - "Rotation Interval (# of requests)", - min_value=1, - max_value=50, - value=10, - key="rotation_interval" - ) - PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval - + + # Main content area if mode == "Manual URL": st.header("Manual URL Mode") - url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input") + url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input") + col1, col2 = st.columns([3, 1]) with col1: if st.button("Deep Search", use_container_width=True, key="deep_search_btn"): if url: + # Process custom extensions custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] - valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] - if custom_ext_list != valid_ext_list: - st.warning("Invalid extensions ignored. Use format like '.csv'.") - @st.cache_resource - def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val): - async def _run(): + with st.spinner("Searching for files..."): + async def run_deep_search(): async with DownloadManager( - use_proxy=use_proxy_val, - proxy=proxy_val, - use_stealth=use_stealth_val + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, + use_stealth=use_stealth ) as dm: - files = await dm.deep_search(url, ext_list, max_links, timeout_val) + files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout) return files - return asyncio.run(_run()) - - with st.spinner("Searching for files..."): - files = run_deep_search(url, valid_ext_list, max_sublinks, - sublink_timeout, use_proxy, proxy, use_stealth) - - if files: - st.session_state.discovered_files = files - st.session_state.current_url = url - st.success(f"Found {len(files)} files!") - else: - st.warning("No files found.") - + + # Run the search + files = asyncio.run(run_deep_search()) + + if files: + st.session_state.discovered_files = files + st.session_state.current_url = url + st.success(f"Found {len(files)} files!") + else: + st.warning("No files found.") + + # Display and process discovered files if st.session_state.discovered_files: files = st.session_state.discovered_files - col1, col2 = st.columns([1, 4]) + + # Select/deselect buttons + col1, col2 = st.columns([1, 1]) with col1: if st.button("Select All", key="select_all_btn"): st.session_state.selected_files = list(range(len(files))) + with col2: if st.button("Clear Selection", key="clear_selection_btn"): st.session_state.selected_files = [] - # Create a formatted display of files with metadata + # Display file list with metadata file_options = [] for i, file in 
enumerate(files): filename = file['filename'] size = file['size'] meta = file.get('metadata', {}) - # Format display string with relevant metadata + # Format display info if meta and 'Pages' in meta: file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages" else: file_info = f"{filename} ({size})" - + file_options.append((i, file_info)) + + # Generate direct download URL for this file + if i not in st.session_state.download_urls: + # Generate a unique key for this file + file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode() + st.session_state.download_urls[i] = file_key + # File selection multiselect selected_indices = st.multiselect( "Select files to download", options=[i for i, _ in file_options], @@ -2846,215 +1500,341 @@ def main(): st.session_state.selected_files = selected_indices + # Display individual files with direct download links + if files: + st.subheader("Available Files") + for i, file in enumerate(files): + with st.expander(f"{i+1}. {file['filename']} ({file['size']})"): + st.write(f"Source: {file.get('source_url', 'Unknown')}") + st.write(f"URL: {file['url']}") + + # Download button for this specific file + if st.button(f"Download this file", key=f"download_single_{i}"): + with st.spinner(f"Downloading {file['filename']}..."): + # Create downloads directory + download_dir = "./downloads" + os.makedirs(download_dir, exist_ok=True) + + # Download the file + async def download_single(): + async with DownloadManager( + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, + use_stealth=use_stealth + ) as dm: + return await dm.download_file(file, download_dir) + + file_path = asyncio.run(download_single()) + + if file_path: + # Create a download link + with open(file_path, "rb") as f: + file_bytes = f.read() + + file_name = os.path.basename(file_path) + mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream" + + st.download_button( + label=f"Download {file_name}", + data=file_bytes, + file_name=file_name, + mime=mime_type, + key=f"download_btn_{i}" + ) + + st.success(f"Downloaded successfully to {file_path}") + else: + st.error(f"Failed to download {file['filename']}") + + # Batch download options if selected_indices: + st.subheader("Batch Download Options") + col1, col2, col3, col4 = st.columns(4) with col1: download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input") with col2: create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox") with col3: - delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox") + delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox") with col4: upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox") - if st.button("Download Selected", key="download_btn"): - if not os.path.exists(download_dir): - os.makedirs(download_dir) - - async def download_files(): + if st.button("Download Selected Files", key="batch_download_btn"): + with st.spinner(f"Downloading {len(selected_indices)} files..."): + if not os.path.exists(download_dir): + os.makedirs(download_dir) + + # Start download process downloaded_paths = [] progress_bar = st.progress(0) status_text = st.empty() - async with DownloadManager( - use_proxy=use_proxy, - proxy=proxy, - use_stealth=use_stealth - ) as dm: - for i, idx in enumerate(selected_indices): - progress = (i + 1) / len(selected_indices) - file_info = files[idx] - status_text.text(f"Downloading {file_info['filename']}... 
({i+1}/{len(selected_indices)})") - progress_bar.progress(progress) + async def download_batch(): + async with DownloadManager( + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, + use_stealth=use_stealth + ) as dm: + paths = [] + for i, idx in enumerate(selected_indices): + file_info = files[idx] + progress = (i + 1) / len(selected_indices) + status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})") + progress_bar.progress(progress) + + path = await dm.download_file(file_info, download_dir) + if path: + paths.append(path) - path = await dm.download_file(file_info, download_dir, url) - if path: - downloaded_paths.append(path) - - status_text.empty() - progress_bar.empty() - return downloaded_paths - - with st.spinner("Downloading files..."): - downloaded = asyncio.run(download_files()) - - if downloaded: - st.success(f"Successfully downloaded {len(downloaded)} files") + return paths - if create_zip: - zip_path = create_zip_file(downloaded, download_dir) - st.success(f"Created ZIP file: {zip_path}") - - # Provide download link for the zip file - with open(zip_path, "rb") as f: - zip_data = f.read() - - st.download_button( - label="Download ZIP", - data=zip_data, - file_name=os.path.basename(zip_path), - mime="application/zip", - key="download_zip_btn" - ) + downloaded_paths = asyncio.run(download_batch()) + status_text.empty() + progress_bar.empty() + + if downloaded_paths: + st.success(f"Successfully downloaded {len(downloaded_paths)} files") - # Upload to Google Drive if requested - if upload_to_drive and st.session_state.google_creds: - drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds) - folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}") - drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id) - if not isinstance(drive_id, str) or not drive_id.startswith("Error"): - st.success(f"Uploaded to Google Drive. 
File ID: {drive_id}") - else: - st.error(drive_id) - - # Delete original files if requested - if delete_after: - for path in downloaded: - try: - os.remove(path) - except Exception as e: - st.warning(f"Could not delete {path}: {e}") - st.info("Deleted original files after ZIP creation") - else: - # Provide individual file downloads - st.write("Download files individually:") - for path in downloaded: - with open(path, "rb") as f: - file_data = f.read() + if create_zip: + zip_path = create_zip_file(downloaded_paths, download_dir) + st.success(f"Created ZIP file: {zip_path}") - file_name = os.path.basename(path) - mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream" + # Provide download link for the zip file + with open(zip_path, "rb") as f: + zip_data = f.read() st.download_button( - label=f"Download {file_name}", - data=file_data, - file_name=file_name, - mime=mime_type, - key=f"download_file_{path}" + label="Download ZIP", + data=zip_data, + file_name=os.path.basename(zip_path), + mime="application/zip", + key="download_zip_btn" ) - - elif mode == "Bing Search": - st.header("Bing Search Mode") - query = st.text_input("Enter search query", key="search_query_input") - num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider") + + # Upload to Google Drive if requested + if upload_to_drive and st.session_state.google_creds: + with st.spinner("Uploading to Google Drive..."): + drive_service = googleapiclient.discovery.build( + "drive", "v3", credentials=st.session_state.google_creds + ) + folder_id = create_drive_folder( + drive_service, f"Downloads_{get_domain(url)}" + ) + drive_id = google_drive_upload( + zip_path, st.session_state.google_creds, folder_id + ) + + if not isinstance(drive_id, str) or not drive_id.startswith("Error"): + st.success(f"Uploaded to Google Drive. 
File ID: {drive_id}") + else: + st.error(drive_id) + + # Delete original files if requested + if delete_after: + for path in downloaded_paths: + try: + os.remove(path) + except Exception as e: + st.warning(f"Could not delete {path}: {e}") + st.info("Deleted original files after ZIP creation") + + elif mode == "Web Search": + st.header("Web Search Mode") - if st.button("Search", key="search_btn"): + # Search query input + query = st.text_input("Enter search query", placeholder="example file type:pdf", key="search_query") + num_results = st.slider("Number of results", 1, 50, 10, key="num_results") + + if st.button("Search", key="web_search_btn"): if query: - async def run_search(): + with st.spinner("Searching the web..."): + async def run_search(): + async with DownloadManager( + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, + query=query, + num_results=num_results, + use_stealth=use_stealth + ) as dm: + urls = await dm.search_web(search_engine) + return urls + + urls = asyncio.run(run_search()) + + if urls: + st.session_state.search_results = urls + st.success(f"Found {len(urls)} results!") + + # Display search results with deep search option + for i, url in enumerate(urls, 1): + with st.expander(f"Result {i}: {url}", expanded=(i == 1)): + st.write(f"URL: {url}") + if st.button(f"Search for files", key=f"search_result_{i}"): + st.session_state.deep_search_url = url + st.session_state.do_deep_search = True + else: + st.warning("No search results found.") + + # Handle deep search of a result if requested + if st.session_state.do_deep_search and st.session_state.deep_search_url: + url = st.session_state.deep_search_url + st.info(f"Searching for files on: {url}") + + # Reset the search flag to avoid re-running + st.session_state.do_deep_search = False + + # Process custom extensions + custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] + + with st.spinner("Searching for files..."): + async def deep_search_result(): async with DownloadManager( - use_proxy=use_proxy, - proxy=proxy, - query=query, - num_results=num_results, + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, use_stealth=use_stealth ) as dm: - with st.spinner("Searching..."): - urls = await dm.search_bing() - if urls: - st.session_state.search_results = urls - st.success(f"Found {len(urls)} results!") - - # Create expanders for each result - for i, url in enumerate(urls, 1): - with st.expander(f"Result {i}: {url}", expanded=(i == 1)): - if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"): - st.session_state.deep_search_url = url - st.session_state.do_deep_search = True - else: - st.warning("No search results found.") - - asyncio.run(run_search()) - - # Handle deep search based on search results - if st.session_state.do_deep_search and st.session_state.deep_search_url: - url = st.session_state.deep_search_url - st.info(f"Deep searching: {url}") - - # Reset the flag to avoid re-running - st.session_state.do_deep_search = False + return await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout) - # Set up custom extensions - custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] - valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] - - @st.cache_resource - def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val): - async def _run(): - async with DownloadManager( - use_proxy=use_proxy_val, - 
proxy=proxy_val, - use_stealth=use_stealth_val - ) as dm: - files = await dm.deep_search(url, ext_list, max_links, timeout_val) - return files - return asyncio.run(_run()) - - with st.spinner("Searching for files..."): - files = run_deep_search(url, valid_ext_list, max_sublinks, - sublink_timeout, use_proxy, proxy, use_stealth) + files = asyncio.run(deep_search_result()) if files: st.session_state.discovered_files = files st.session_state.current_url = url st.success(f"Found {len(files)} files!") else: - st.warning("No files found.") - - # Add a special section for direct Google Drive file download - st.markdown("---") - with st.expander("Download View-Only Google Drive Document", expanded=False): - st.write("Download protected/view-only Google Drive documents - just enter the file ID") - file_id = st.text_input("Google Drive File ID", - placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku", - help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')") + st.warning("No files found on this page.") + + elif mode == "Single File": + st.header("Single File Download") - if st.button("Download Document") and file_id: - download_dir = "./downloads" - os.makedirs(download_dir, exist_ok=True) - output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf") + # View-only Google Drive download + with st.expander("Download View-Only Google Drive Document", expanded=True): + st.write("Download protected/view-only Google Drive documents") + + file_id = st.text_input( + "Google Drive File ID", + placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view", + key="drive_file_id" + ) - with st.spinner("Downloading view-only document... (this may take a minute)"): - async def download_viewonly(): - async with DownloadManager(use_stealth=use_stealth) as dm: - file_info = { - 'url': f"https://drive.google.com/file/d/{file_id}/view", - 'filename': f"gdrive_{file_id}.pdf", - 'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True} - } - result_path = await dm.force_download_viewonly(file_info, output_path) - return result_path - - result = asyncio.run(download_viewonly()) - - if result: - st.success("Document downloaded successfully!") + if st.button("Download Document", key="drive_download_btn") and file_id: + with st.spinner("Downloading view-only document... (this may take a minute)"): + # Create download directory + download_dir = "./downloads" + os.makedirs(download_dir, exist_ok=True) - # Provide download button - with open(result, "rb") as f: - file_bytes = f.read() + # Set output path + output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf") - st.download_button( - label="Download PDF", - data=file_bytes, - file_name=f"gdrive_{file_id}.pdf", - mime="application/pdf" - ) - else: - st.error("Failed to download the document. 
Please check the file ID and try again.") - - # Add footer with attribution - st.markdown('---') - st.markdown('Created by [Euler314](https://github.com/euler314)') + # Download the file + async def download_drive_file(): + async with DownloadManager( + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, + use_stealth=use_stealth + ) as dm: + file_info = { + 'url': f"https://drive.google.com/file/d/{file_id}/view", + 'filename': f"gdrive_{file_id}.pdf", + 'metadata': {'file_id': file_id, 'view_only': True} + } + return await dm.download_viewonly_google_drive(file_info, output_path) + + result_path = asyncio.run(download_drive_file()) + + if result_path: + st.success("Document downloaded successfully!") + + # Provide download link + with open(result_path, "rb") as f: + file_bytes = f.read() + + st.download_button( + label="Download PDF", + data=file_bytes, + file_name=os.path.basename(result_path), + mime="application/pdf", + key="drive_pdf_download" + ) + else: + st.error("Failed to download the document. Please check the file ID and try again.") + + # Direct URL download + with st.expander("Download from Direct URL", expanded=True): + st.write("Download a file from a direct URL") + + file_url = st.text_input( + "File URL", + placeholder="https://example.com/file.pdf", + key="direct_url" + ) + + file_name = st.text_input( + "Save as (optional)", + placeholder="Leave blank to use original filename", + key="save_filename" + ) + + if st.button("Download File", key="direct_download_btn") and file_url: + with st.spinner("Downloading file..."): + # Create download directory + download_dir = "./downloads" + os.makedirs(download_dir, exist_ok=True) + + # Determine filename + if not file_name: + file_name = os.path.basename(urlparse(file_url).path) + if not file_name or file_name == '/': + file_name = f"downloaded_file_{int(time.time())}{get_file_extension(file_url)}" + + # Create file info + file_info = { + 'url': file_url, + 'filename': file_name, + 'metadata': {} + } + + # Download the file + async def download_direct_file(): + async with DownloadManager( + browser_engine=browser_engine, + use_proxy=use_proxy, + proxy=proxy, + use_stealth=use_stealth + ) as dm: + return await dm.download_file(file_info, download_dir) + + file_path = asyncio.run(download_direct_file()) + + if file_path: + st.success(f"File downloaded successfully to {file_path}") + + # Provide download link + with open(file_path, "rb") as f: + file_bytes = f.read() + + mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream" + + st.download_button( + label=f"Download {os.path.basename(file_path)}", + data=file_bytes, + file_name=os.path.basename(file_path), + mime=mime_type, + key="direct_file_download" + ) + else: + st.error("Failed to download the file. Please check the URL and try again.") + + # Footer + st.markdown("---") + st.markdown("Created by [Euler314](https://github.com/euler314) | Enhanced with advanced scraping technologies") +# Run the app if __name__ == "__main__": main() \ No newline at end of file
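
Reviewer note on the DownloadManager contract: main() only ever uses the class as "async with DownloadManager(browser_engine=..., use_proxy=..., proxy=..., use_stealth=..., query=..., num_results=...) as dm", calling dm.deep_search(url, ext_list, max_links, timeout), dm.download_file(file_info, download_dir), dm.search_web(engine) and dm.download_viewonly_google_drive(file_info, output_path). The class itself is defined earlier in this patch; the sketch below is not that implementation, just a minimal stand-in (the name and all behaviour here are assumptions) that makes the interface the UI relies on explicit. Only download_file is fleshed out, using requests; the other coroutines are left as stubs.

import os
import asyncio
import requests


class DownloadManagerSketch:
    """Minimal stand-in for the DownloadManager interface that main() assumes."""

    def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None,
                 use_stealth=True, query=None, num_results=10):
        self.browser_engine = browser_engine
        self.proxies = {"http": proxy, "https": proxy} if (use_proxy and proxy) else None
        self.use_stealth = use_stealth
        self.query = query
        self.num_results = num_results

    async def __aenter__(self):
        # The real class launches the selected browser engine here.
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # ...and tears it down here.
        return False

    async def download_file(self, file_info, download_dir):
        """Fetch file_info['url'] into download_dir; return the saved path or None."""
        os.makedirs(download_dir, exist_ok=True)
        path = os.path.join(download_dir, file_info["filename"])

        def _fetch():
            resp = requests.get(file_info["url"], proxies=self.proxies,
                                timeout=60, stream=True)
            resp.raise_for_status()
            with open(path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
            return path

        try:
            # Keep the blocking HTTP call off the event loop.
            return await asyncio.to_thread(_fetch)
        except requests.RequestException:
            return None

    async def deep_search(self, url, ext_list, max_links, timeout):
        raise NotImplementedError  # provided by the real class

    async def search_web(self, engine):
        raise NotImplementedError  # provided by the real class

    async def download_viewonly_google_drive(self, file_info, output_path):
        raise NotImplementedError  # provided by the real class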
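
The batch-download branch calls create_zip_file(downloaded_paths, download_dir) and expects the path of the new archive back. That helper is defined elsewhere in app.py; purely for reference, a hypothetical equivalent of what the call site assumes would look like this:

import os
import zipfile
import datetime


def create_zip_file_sketch(file_paths, output_dir):
    """Zip the given files into output_dir and return the archive path."""
    zip_name = f"download_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
    zip_path = os.path.join(output_dir, zip_name)
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in file_paths:
            # Store each file flat, by basename, inside the archive.
            zf.write(path, arcname=os.path.basename(path))
    return zip_path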
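
Every handler in main() drives its coroutine with asyncio.run(...), which works because a Streamlit rerun is synchronous and owns no event loop. Should the app ever run where a loop is already active, a small guard such as this hypothetical helper (not part of the patch) keeps the same call sites working, e.g. files = run_async(run_deep_search()) in place of asyncio.run(run_deep_search()):

import asyncio
import concurrent.futures


def run_async(coro):
    """Run a coroutine to completion from a synchronous Streamlit script body."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # Normal Streamlit case: no loop is running in this thread.
        return asyncio.run(coro)
    # A loop is already running here, so hand the coroutine to a fresh loop
    # in a worker thread and block on the result.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()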