import streamlit as st

import os
import asyncio
import subprocess
import tempfile
import logging
import time
import json
import base64
import re
import random
import zipfile
import datetime
import traceback
import shutil
import mimetypes
from pathlib import Path
from urllib.parse import urlparse, urljoin, unquote
from io import BytesIO

from PIL import Image
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

import google_auth_oauthlib.flow
import googleapiclient.discovery
import google.auth.transport.requests
import googleapiclient.http

import requests
from celery import Celery

# Optional engine/interception dependencies. Splash is driven purely through its HTTP API
# (see MultiEngineBrowser.setup_splash), so no Python client import is required for it.
import pyppeteer
from mitmproxy import http
|
st.set_page_config(page_title="Advanced File Downloader", layout="wide")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

celery_app = Celery('file_downloader', broker='redis://localhost:6379/0')
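# The Celery broker URL above assumes a local Redis instance on the default port. Tasks
# registered on `celery_app` only execute once a separate worker process is started, e.g.
# `celery -A <module_name> worker --loglevel=info` (the module name depends on how this
# file is deployed and is not specified here).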
|
|
|
|
|
GOOGLE_OAUTH_CONFIG = {
    "web": {
        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
        "project_id": "huggingface-449214",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": "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",
        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
    }
}
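# The OAuth client secret above is embedded in source. A minimal sketch for overriding the
# embedded values from the environment at deploy time (the variable names GOOGLE_CLIENT_ID
# and GOOGLE_CLIENT_SECRET are assumptions, not part of the original configuration):
GOOGLE_OAUTH_CONFIG["web"]["client_id"] = os.environ.get(
    "GOOGLE_CLIENT_ID", GOOGLE_OAUTH_CONFIG["web"]["client_id"])
GOOGLE_OAUTH_CONFIG["web"]["client_secret"] = os.environ.get(
    "GOOGLE_CLIENT_SECRET", GOOGLE_OAUTH_CONFIG["web"]["client_secret"])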
|
|
|
|
|
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
]

PROXY_POOL = []
CURRENT_PROXY_INDEX = 0

NETWORK_INTERCEPTOR_CONFIG = {
    "enabled": False,
    "intercept_types": ["xhr", "fetch", "document", "media"],
    "save_intercepted": True,
    "intercept_folder": "./intercepted_data"
}
|
def get_random_user_agent():
    return random.choice(USER_AGENTS)


def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Y{suffix}"
|
|
|
def create_zip_file(file_paths, output_dir):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file_path in file_paths:
            zipf.write(file_path, os.path.basename(file_path))
    return zip_path


def get_file_extension(url, default='.pdf'):
    """Extract file extension from URL or filename"""
    path = urlparse(url).path
    ext = os.path.splitext(path)[1].lower()
    if not ext:
        return default
    return ext


def get_domain(url):
    """Extract domain from URL"""
    parsed = urlparse(url)
    return parsed.netloc


def is_valid_file_url(url, extensions):
    """Check if URL is a valid file URL based on extension"""
    return any(url.lower().endswith(ext) for ext in extensions)
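# Examples:
#   get_file_extension("https://example.com/docs/paper.pdf")          -> ".pdf"
#   get_domain("https://example.com/docs/paper.pdf")                  -> "example.com"
#   is_valid_file_url("https://example.com/docs/paper.pdf", ['.pdf']) -> True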
|
|
|
|
|
def get_google_auth_url():
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": client_config},
        scopes=["https://www.googleapis.com/auth/drive.file"]
    )
    flow.redirect_uri = client_config["redirect_uris"][0]
    authorization_url, _ = flow.authorization_url(
        access_type="offline",
        include_granted_scopes="true",
        prompt="consent"
    )
    return authorization_url


def exchange_code_for_credentials(auth_code):
    if not auth_code.strip():
        return None, "No code provided."
    try:
        client_config = GOOGLE_OAUTH_CONFIG["web"]
        flow = google_auth_oauthlib.flow.Flow.from_client_config(
            {"web": client_config},
            scopes=["https://www.googleapis.com/auth/drive.file"]
        )
        flow.redirect_uri = client_config["redirect_uris"][0]
        flow.fetch_token(code=auth_code.strip())
        creds = flow.credentials
        if not creds or not creds.valid:
            return None, "Could not validate credentials. Check code and try again."
        return creds, "Google Sign-In successful!"
    except Exception as e:
        return None, f"Error during token exchange: {e}"


def google_drive_upload(file_path, credentials, folder_id=None):
    try:
        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
        file_metadata = {'name': os.path.basename(file_path)}
        if folder_id:
            file_metadata['parents'] = [folder_id]
        media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
        created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        return created.get("id", "")
    except Exception as e:
        return f"Error uploading to Drive: {str(e)}"


def create_drive_folder(drive_service, name):
    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
    return folder.get('id')
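# Typical use of the two Drive helpers above (sketch; assumes `creds` was obtained from
# exchange_code_for_credentials):
#
#   service = googleapiclient.discovery.build("drive", "v3", credentials=creds)
#   folder_id = create_drive_folder(service, "Downloads_example.com")
#   uploaded_id = google_drive_upload("./downloads/report.pdf", creds, folder_id)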
|
|
|
|
|
def setup_dependencies():
    """Install required system dependencies (assumes a Debian-based environment with root access)"""
    try:
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        packages = [
            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
            'redis-server', 'python3-dev', 'build-essential'
        ]
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)

        # Splash is not installed from pip: it runs as a separate HTTP service (typically a
        # Docker container) and is only accessed over HTTP in MultiEngineBrowser.setup_splash.
        subprocess.run(['pip', 'install', 'playwright', 'pyppeteer', 'celery[redis]', 'mitmproxy'], check=True)

        # Fetch the browsers bundled with each engine; pyppeteer ships a `pyppeteer-install`
        # console script for downloading its Chromium build.
        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
        subprocess.run(['pyppeteer-install'], check=True)

        st.success("Dependencies installed successfully!")
        return True
    except Exception as e:
        st.error(f"Error installing dependencies: {e}")
        st.info("You may need to manually install dependencies. Check console for details.")
        logger.error(f"Setup error: {e}")
        traceback.print_exc()
        return False
|
def check_services():
    """Check if required services are running"""
    try:
        redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG'
        if not redis_running:
            subprocess.run(['service', 'redis-server', 'start'], check=True)

        os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True)

        return True
    except Exception as e:
        logger.error(f"Service check error: {e}")
        return False
|
class NetworkInterceptor: |
|
"""Class to intercept network traffic using mitmproxy""" |
|
|
|
def __init__(self, intercept_types=None, save_path=None): |
|
self.intercept_types = intercept_types or ["xhr", "fetch", "document"] |
|
self.save_path = save_path or "./intercepted_data" |
|
os.makedirs(self.save_path, exist_ok=True) |
|
self.captured_data = [] |
|
|
|
def intercept_request(self, flow): |
|
"""Process intercepted requests""" |
|
try: |
|
url = flow.request.url |
|
method = flow.request.method |
|
content_type = flow.request.headers.get("Content-Type", "") |
|
|
|
|
|
self.captured_data.append({ |
|
"type": "request", |
|
"url": url, |
|
"method": method, |
|
"headers": dict(flow.request.headers), |
|
"timestamp": time.time() |
|
}) |
|
|
|
logger.info(f"Intercepted {method} request to {url}") |
|
except Exception as e: |
|
logger.error(f"Error intercepting request: {e}") |
|
|
|
def intercept_response(self, flow): |
|
"""Process intercepted responses""" |
|
try: |
|
url = flow.request.url |
|
status_code = flow.response.status_code |
|
content_type = flow.response.headers.get("Content-Type", "") |
|
|
|
|
|
if any(t in content_type.lower() for t in ["application/pdf", "application/msword", |
|
"application/vnd.openxmlformats", |
|
"application/zip"]): |
|
|
|
filename = os.path.basename(urlparse(url).path) |
|
if not filename or filename == '/': |
|
filename = f"file_{int(time.time())}" |
|
|
|
|
|
if "pdf" in content_type: |
|
filename += ".pdf" |
|
elif "msword" in content_type: |
|
filename += ".doc" |
|
elif "openxmlformats" in content_type and "wordprocessingml" in content_type: |
|
filename += ".docx" |
|
elif "zip" in content_type: |
|
filename += ".zip" |
|
|
|
file_path = os.path.join(self.save_path, filename) |
|
with open(file_path, "wb") as f: |
|
f.write(flow.response.content) |
|
|
|
logger.info(f"Saved intercepted file: {file_path}") |
|
|
|
|
|
self.captured_data.append({ |
|
"type": "file", |
|
"url": url, |
|
"content_type": content_type, |
|
"size": len(flow.response.content), |
|
"path": file_path, |
|
"timestamp": time.time() |
|
}) |
|
except Exception as e: |
|
logger.error(f"Error intercepting response: {e}") |
|
|
|
def get_captured_files(self): |
|
"""Return list of captured files""" |
|
return [item for item in self.captured_data if item["type"] == "file"] |
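# NetworkInterceptor is instantiated by DownloadManager but never attached to a running
# proxy in this module. A minimal sketch of wiring it up as a mitmproxy addon script
# (run with `mitmdump -s <script>.py`), where mitmproxy calls the `request`/`response`
# hooks on registered addons:
#
#   interceptor = NetworkInterceptor()
#
#   class InterceptorAddon:
#       def request(self, flow: http.HTTPFlow):
#           interceptor.intercept_request(flow)
#
#       def response(self, flow: http.HTTPFlow):
#           interceptor.intercept_response(flow)
#
#   addons = [InterceptorAddon()]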
|
|
|
|
|
class MultiEngineBrowser: |
|
"""Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)""" |
|
|
|
def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True): |
|
self.engine = engine |
|
self.use_proxy = use_proxy |
|
self.proxy = proxy |
|
self.stealth = stealth |
|
self.browser = None |
|
self.context = None |
|
self.page = None |
|
|
|
async def setup(self): |
|
"""Initialize browser based on selected engine""" |
|
if self.engine == "playwright": |
|
return await self.setup_playwright() |
|
elif self.engine == "pyppeteer": |
|
return await self.setup_pyppeteer() |
|
elif self.engine == "splash": |
|
return await self.setup_splash() |
|
else: |
|
raise ValueError(f"Unsupported browser engine: {self.engine}") |
|
|
|
async def setup_playwright(self): |
|
"""Setup Playwright browser""" |
|
from playwright.async_api import async_playwright |
|
|
|
self.playwright = await async_playwright().start() |
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
] |
|
|
|
if self.stealth: |
|
browser_args.extend([ |
|
'--disable-blink-features=AutomationControlled', |
|
'--disable-features=IsolateOrigins' |
|
]) |
|
|
|
launch_options = { |
|
"headless": True, |
|
"args": browser_args |
|
} |
|
|
|
if self.use_proxy and self.proxy: |
|
launch_options["proxy"] = {"server": self.proxy} |
|
|
|
self.browser = await self.playwright.chromium.launch(**launch_options) |
|
|
|
context_options = { |
|
"viewport": {"width": 1920, "height": 1080}, |
|
"user_agent": get_random_user_agent(), |
|
"bypass_csp": True, |
|
"ignore_https_errors": True, |
|
"accept_downloads": True |
|
} |
|
|
|
self.context = await self.browser.new_context(**context_options) |
|
|
|
|
|
if self.stealth: |
|
await self.context.add_init_script(""" |
|
Object.defineProperty(navigator, 'webdriver', { get: () => false }); |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) |
|
}); |
|
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); |
|
window.chrome = { runtime: {} }; |
|
""") |
|
|
|
self.page = await self.context.new_page() |
|
return self.page |
|
|
|
async def setup_pyppeteer(self): |
|
"""Setup Pyppeteer browser""" |
|
from pyppeteer import launch |
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
] |
|
|
|
if self.stealth: |
|
browser_args.extend([ |
|
'--disable-blink-features=AutomationControlled', |
|
'--disable-features=IsolateOrigins' |
|
]) |
|
|
|
launch_options = { |
|
"headless": True, |
|
"args": browser_args, |
|
"ignoreHTTPSErrors": True, |
|
"userDataDir": tempfile.mkdtemp() |
|
} |
|
|
|
if self.use_proxy and self.proxy: |
|
browser_args.append(f'--proxy-server={self.proxy}') |
|
|
|
self.browser = await launch(launch_options) |
|
self.page = await self.browser.newPage() |
|
|
|
|
|
await self.page.setUserAgent(get_random_user_agent()) |
|
|
|
|
|
await self.page.setViewport({"width": 1920, "height": 1080}) |
|
|
|
|
|
if self.stealth: |
|
await self.page.evaluateOnNewDocument(""" |
|
Object.defineProperty(navigator, 'webdriver', { get: () => false }); |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) |
|
}); |
|
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); |
|
window.chrome = { runtime: {} }; |
|
""") |
|
|
|
return self.page |
|
|
|
async def setup_splash(self): |
|
"""Setup Splash browser through API""" |
|
|
|
|
|
self.splash_url = "http://localhost:8050/render.html" |
|
return None |
|
|
|
async def goto(self, url, wait_until=None, timeout=30000): |
|
"""Navigate to a URL""" |
|
if self.engine == "playwright": |
|
return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout) |
|
elif self.engine == "pyppeteer": |
|
return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout) |
|
elif self.engine == "splash": |
|
|
|
params = { |
|
"url": url, |
|
"wait": min(timeout/1000, 30), |
|
"timeout": min(timeout/1000, 60), |
|
"resource_timeout": min(timeout/1000, 30), |
|
"html": 1, |
|
"png": 0, |
|
"render_all": 1 |
|
} |
|
|
|
if self.use_proxy and self.proxy: |
|
params["proxy"] = self.proxy |
|
|
|
headers = {"User-Agent": get_random_user_agent()} |
|
response = requests.get(self.splash_url, params=params, headers=headers) |
|
self.last_html = response.text |
|
return response |
|
|
|
async def content(self): |
|
"""Get page content""" |
|
if self.engine == "playwright": |
|
return await self.page.content() |
|
elif self.engine == "pyppeteer": |
|
return await self.page.content() |
|
elif self.engine == "splash": |
|
return self.last_html |
|
|
|
async def close(self): |
|
"""Close browser""" |
|
if self.engine == "playwright": |
|
if self.browser: |
|
await self.browser.close() |
|
if self.playwright: |
|
await self.playwright.stop() |
|
elif self.engine == "pyppeteer": |
|
if self.browser: |
|
await self.browser.close() |
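# Minimal usage sketch for MultiEngineBrowser (illustrative; not called by the app): fetch
# the rendered HTML of a page with the default Playwright engine.
async def _example_fetch_html(url: str) -> str:
    """Navigate to `url`, return the rendered HTML, and always close the browser."""
    browser = MultiEngineBrowser(engine="playwright", stealth=True)
    await browser.setup()
    try:
        await browser.goto(url, timeout=30000)
        return await browser.content()
    finally:
        await browser.close()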
|
|
|
|
|
|
|
class DownloadManager: |
|
def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True): |
|
self.browser_engine = browser_engine |
|
self.use_proxy = use_proxy |
|
self.proxy = proxy |
|
self.query = query |
|
self.num_results = num_results |
|
self.use_stealth = use_stealth |
|
self.browser = None |
|
self.network_interceptor = None |
|
|
|
|
|
if NETWORK_INTERCEPTOR_CONFIG["enabled"]: |
|
self.network_interceptor = NetworkInterceptor( |
|
intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"], |
|
save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"] |
|
) |
|
|
|
async def __aenter__(self): |
|
|
|
self.browser = MultiEngineBrowser( |
|
engine=self.browser_engine, |
|
use_proxy=self.use_proxy, |
|
proxy=self.proxy, |
|
stealth=self.use_stealth |
|
) |
|
self.page = await self.browser.setup() |
|
|
|
|
|
if self.browser_engine == "playwright": |
|
await self.page.set_extra_http_headers({ |
|
'Accept-Language': 'en-US,en;q=0.9', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'DNT': '1', |
|
'Referer': 'https://www.google.com/', |
|
'Sec-Fetch-Dest': 'document', |
|
'Sec-Fetch-Mode': 'navigate', |
|
'Sec-Fetch-Site': 'cross-site', |
|
'Sec-Fetch-User': '?1', |
|
'Upgrade-Insecure-Requests': '1' |
|
}) |
|
|
|
return self |
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb): |
|
await self.browser.close() |
|
|
|
async def search_web(self, search_engine="bing"): |
|
"""Search web using specified search engine""" |
|
urls = [] |
|
try: |
|
if search_engine == "bing": |
|
search_url = f"https://www.bing.com/search?q={self.query}" |
|
elif search_engine == "google": |
|
search_url = f"https://www.google.com/search?q={self.query}" |
|
else: |
|
raise ValueError(f"Unsupported search engine: {search_engine}") |
|
|
|
await self.browser.goto(search_url, timeout=30000) |
|
|
|
if self.browser_engine == "playwright": |
|
if search_engine == "bing": |
|
links = await self.page.query_selector_all("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = await link.get_attribute('href') |
|
if href: |
|
urls.append(href) |
|
elif search_engine == "google": |
|
links = await self.page.query_selector_all("div.g a[href^='http']") |
|
for link in links[:self.num_results]: |
|
href = await link.get_attribute('href') |
|
if href: |
|
urls.append(href) |
|
elif self.browser_engine == "pyppeteer": |
|
if search_engine == "bing": |
|
links = await self.page.querySelectorAll("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = await self.page.evaluate('el => el.getAttribute("href")', link) |
|
if href: |
|
urls.append(href) |
|
elif search_engine == "google": |
|
links = await self.page.querySelectorAll("div.g a[href^='http']") |
|
for link in links[:self.num_results]: |
|
href = await self.page.evaluate('el => el.getAttribute("href")', link) |
|
if href: |
|
urls.append(href) |
|
elif self.browser_engine == "splash": |
|
|
|
soup = BeautifulSoup(self.browser.last_html, 'html.parser') |
|
if search_engine == "bing": |
|
links = soup.select("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = link.get("href") |
|
if href: |
|
urls.append(href) |
|
elif search_engine == "google": |
|
links = soup.select("div.g a[href^='http']") |
|
for link in links[:self.num_results]: |
|
href = link.get("href") |
|
if href: |
|
urls.append(href) |
|
|
|
return urls |
|
except Exception as e: |
|
logger.error(f"Error searching web: {e}") |
|
return [] |
|
|
|
async def get_file_size(self, url): |
|
try: |
|
headers = {'User-Agent': get_random_user_agent()} |
|
response = requests.head(url, headers=headers, timeout=15) |
|
length = response.headers.get('Content-Length', None) |
|
if length: |
|
return sizeof_fmt(int(length)) |
|
else: |
|
return "Unknown Size" |
|
except Exception: |
|
return "Unknown Size" |
|
|
|
async def get_pdf_metadata(self, url): |
|
try: |
|
headers = {'User-Agent': get_random_user_agent()} |
|
response = requests.get(url, headers=headers, timeout=15, stream=True) |
|
if response.status_code == 200: |
|
content = BytesIO(response.content) |
|
reader = PdfReader(content) |
|
return { |
|
'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', |
|
'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', |
|
'Pages': len(reader.pages), |
|
} |
|
else: |
|
return {} |
|
except Exception: |
|
return {} |
|
|
|
async def extract_real_download_url(self, url): |
|
try: |
|
headers = {'User-Agent': get_random_user_agent()} |
|
response = requests.head(url, headers=headers, timeout=15, allow_redirects=True) |
|
return response.url |
|
except Exception as e: |
|
logger.error(f"Error extracting real download URL: {e}") |
|
return url |
|
|
|
async def get_edu_exam_links(self, url): |
|
"""Specialized method for educational exam websites that follows a common pattern.""" |
|
try: |
|
logger.info(f"Fetching exam links from {url}") |
|
links = set() |
|
|
|
|
|
headers = {"User-Agent": get_random_user_agent()} |
|
try: |
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
|
if response.status_code == 200: |
|
|
|
soup = BeautifulSoup(response.text, "html.parser") |
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all("a", href=True): |
|
href = a["href"] |
|
full_url = urljoin(url, href) |
|
|
|
|
|
link_text = a.get_text().lower() |
|
|
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_", |
|
"/resource/", "/subject/", "/course/", "/material/" |
|
] |
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"subject", "course", "resource", "pdf", "document", |
|
"view", "open", "get", "solution", "answer" |
|
] |
|
|
|
|
|
if any(pattern in full_url.lower() for pattern in url_patterns) or \ |
|
any(pattern in link_text for pattern in text_patterns) or \ |
|
any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Request-based extraction failed: {e}") |
|
|
|
|
|
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url: |
|
logger.info("Using browser for enhanced link extraction") |
|
|
|
|
|
await self.browser.goto(url, timeout=45000) |
|
|
|
|
|
content = await self.browser.content() |
|
soup = BeautifulSoup(content, "html.parser") |
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all("a", href=True): |
|
href = a["href"] |
|
full_url = urljoin(url, href) |
|
link_text = a.get_text().lower() |
|
|
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_", |
|
"/resource/", "/subject/", "/course/", "/material/" |
|
] |
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"subject", "course", "resource", "pdf", "document", |
|
"view", "open", "get", "solution", "answer" |
|
] |
|
|
|
|
|
if any(pattern in full_url.lower() for pattern in url_patterns) or \ |
|
any(pattern in link_text for pattern in text_patterns) or \ |
|
any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
|
|
|
|
filtered_links = [] |
|
for link in links: |
|
|
|
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if any(pattern in link.lower() for pattern in [ |
|
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", |
|
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", |
|
"/resource/", "/material/", "/notes/", "/subjectmaterial/" |
|
]): |
|
filtered_links.append(link) |
|
|
|
logger.info(f"Found {len(filtered_links)} potential exam document links") |
|
return filtered_links |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting exam links: {e}") |
|
return [] |
|
|
|
async def extract_downloadable_files(self, url, custom_ext_list): |
|
found_files = [] |
|
try: |
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized handler for educational exam site") |
|
|
|
|
|
exam_links = await self.get_edu_exam_links(url) |
|
|
|
for link in exam_links: |
|
|
|
real_url = await self.extract_real_download_url(link) |
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
if not filename or filename == '/': |
|
domain = get_domain(real_url) |
|
ext = get_file_extension(real_url, '.pdf') |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
meta = {} |
|
if real_url.lower().endswith('.pdf'): |
|
try: |
|
meta = await self.get_pdf_metadata(real_url) |
|
except Exception: |
|
pass |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': meta, |
|
'source_url': url |
|
}) |
|
|
|
|
|
if found_files: |
|
return found_files |
|
|
|
|
|
await self.browser.goto(url, timeout=30000) |
|
|
|
|
|
content = await self.browser.content() |
|
soup = BeautifulSoup(content, 'html.parser') |
|
|
|
|
|
default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', |
|
'.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', |
|
'.pptx', '.odt', '.txt'] |
|
all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
for a in soup.find_all('a', href=True): |
|
href = a['href'].strip() |
|
|
|
|
|
if '.php' in href.lower() or 'download' in href.lower(): |
|
full_url = href if href.startswith('http') else urljoin(base_url, href) |
|
real_url = await self.extract_real_download_url(full_url) |
|
if real_url and real_url != full_url: |
|
filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file' |
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {}, |
|
'source_url': url |
|
}) |
|
continue |
|
|
|
|
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
file_url = href if href.startswith('http') else urljoin(base_url, href) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'source_url': url |
|
}) |
|
|
|
|
|
elif ("drive.google.com" in href) or ("docs.google.com" in href): |
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, href) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if file_id: |
|
|
|
is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")) |
|
|
|
filename = f"gdrive_{file_id}" |
|
ext = get_file_extension(href, '.pdf') |
|
if ext != '.': |
|
filename += ext |
|
|
|
found_files.append({ |
|
'url': href, |
|
'filename': filename, |
|
'size': "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"), |
|
'metadata': { |
|
'view_only': is_view_only, |
|
'file_id': file_id |
|
}, |
|
'source_url': url |
|
}) |
|
|
|
|
|
for elem_tag in ['iframe', 'embed', 'object', 'source']: |
|
for elem in soup.find_all(elem_tag): |
|
src = elem.get('src') or elem.get('data') |
|
if src and any(src.lower().endswith(ext) for ext in all_exts): |
|
file_url = src if src.startswith('http') else urljoin(base_url, src) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': await self.get_file_size(file_url), |
|
'metadata': {}, |
|
'source_url': url |
|
}) |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in found_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
return unique_files |
|
|
|
except Exception as e: |
|
logger.error(f"Error extracting files from {url}: {e}") |
|
return [] |
|
|
|
async def download_file(self, file_info, save_dir, referer=None): |
|
"""Download a file and provide a direct download link""" |
|
file_url = file_info['url'] |
|
fname = file_info['filename'] |
|
referer = referer or file_info.get('source_url', 'https://www.google.com') |
|
|
|
|
|
path = os.path.join(save_dir, fname) |
|
base, ext = os.path.splitext(fname) |
|
counter = 1 |
|
while os.path.exists(path): |
|
path = os.path.join(save_dir, f"{base}_{counter}{ext}") |
|
counter += 1 |
|
|
|
os.makedirs(save_dir, exist_ok=True) |
|
|
|
try: |
|
|
|
if "drive.google.com" in file_url or "docs.google.com" in file_url: |
|
|
|
is_view_only = file_info.get('metadata', {}).get('view_only', False) |
|
if is_view_only: |
|
result_path = await self.download_viewonly_google_drive(file_info, path) |
|
if result_path: |
|
return result_path |
|
|
|
|
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, file_url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if file_id: |
|
|
|
download_url = f"https://drive.google.com/uc?id={file_id}&export=download" |
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Referer': referer |
|
} |
|
|
|
with requests.get(download_url, headers=headers, stream=True) as r: |
|
r.raise_for_status() |
|
with open(path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
f.write(chunk) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
return path |
|
|
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Referer': referer, |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br' |
|
} |
|
|
|
with requests.get(file_url, headers=headers, stream=True) as r: |
|
r.raise_for_status() |
|
with open(path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
f.write(chunk) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
return path |
|
else: |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Error downloading {file_url}: {e}") |
|
return None |
|
|
|
async def download_viewonly_google_drive(self, file_info, save_path): |
|
"""Download view-only Google Drive documents""" |
|
try: |
|
|
|
file_id = file_info.get('metadata', {}).get('file_id') |
|
if not file_id: |
|
url = file_info['url'] |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error("Could not extract file ID") |
|
return None |
|
|
|
|
|
file_type = get_file_extension(file_info['url'], '.pdf').lstrip('.') |
|
|
|
|
|
base, ext = os.path.splitext(save_path) |
|
if not ext: |
|
save_path = f"{base}.{file_type}" |
|
|
|
logger.info(f"Downloading view-only Google Drive file: {file_id}") |
|
|
|
|
|
if self.browser_engine == "playwright": |
|
from playwright.async_api import async_playwright |
|
|
|
async with async_playwright() as p: |
|
browser = await p.chromium.launch( |
|
headless=True, |
|
args=[ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-site-isolation-trials', |
|
'--disable-blink-features=AutomationControlled' |
|
] |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent=get_random_user_agent(), |
|
accept_downloads=True, |
|
ignore_https_errors=True |
|
) |
|
|
|
|
|
await context.add_init_script(""" |
|
Object.defineProperty(navigator, 'webdriver', { get: () => false }); |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 })) |
|
}); |
|
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); |
|
window.chrome = { runtime: {} }; |
|
""") |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
temp_dir = tempfile.mkdtemp() |
|
|
|
|
|
if file_type == 'pdf': |
|
|
|
screenshots_dir = os.path.join(temp_dir, "screenshots") |
|
os.makedirs(screenshots_dir, exist_ok=True) |
|
|
|
|
|
total_pages = await page.evaluate(""" |
|
() => { |
|
// Look for page counters in the interface |
|
const pageCounters = document.querySelectorAll('*'); |
|
for (const el of pageCounters) { |
|
const text = el.textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) { |
|
return parseInt(match[2]); |
|
} |
|
} |
|
|
|
// Look for paginated pages |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length > 0) return pages.length; |
|
|
|
// Default if we can't determine |
|
return 20; |
|
} |
|
""") |
|
|
|
logger.info(f"PDF has approximately {total_pages} pages") |
|
|
|
|
|
screenshots = [] |
|
|
|
|
|
for i in range(min(total_pages, 100)): |
|
try: |
|
|
|
if i > 0: |
|
await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()") |
|
await page.wait_for_timeout(500) |
|
|
|
|
|
await page.wait_for_timeout(500) |
|
|
|
|
|
screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") |
|
|
|
|
|
page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})') |
|
if page_element: |
|
await page_element.screenshot(path=screenshot_path) |
|
else: |
|
|
|
await page.screenshot(path=screenshot_path) |
|
|
|
screenshots.append(screenshot_path) |
|
|
|
|
|
if i < total_pages - 1: |
|
next_button = await page.query_selector('button[aria-label="Next page"]') |
|
if next_button: |
|
|
|
is_disabled = await next_button.get_attribute('disabled') |
|
if is_disabled: |
|
logger.info(f"Reached last page at page {i+1}") |
|
break |
|
|
|
|
|
await next_button.click() |
|
await page.wait_for_timeout(1000) |
|
else: |
|
logger.info("Next page button not found") |
|
break |
|
except Exception as e: |
|
logger.error(f"Error capturing page {i+1}: {e}") |
|
continue |
|
|
|
|
|
if screenshots: |
|
|
|
first_img = Image.open(screenshots[0]) |
|
width, height = first_img.size |
|
|
|
|
|
c = canvas.Canvas(save_path, pagesize=(width, height)) |
|
for screenshot in screenshots: |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
os.remove(screenshot) |
|
|
|
|
|
shutil.rmtree(temp_dir, ignore_errors=True) |
|
|
|
return save_path |
|
else: |
|
logger.error("No screenshots captured") |
|
else: |
|
|
|
screenshot_path = os.path.join(temp_dir, "file.png") |
|
await page.screenshot(path=screenshot_path) |
|
|
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
|
|
os.remove(screenshot_path) |
|
shutil.rmtree(temp_dir, ignore_errors=True) |
|
|
|
return save_path |
|
finally: |
|
await browser.close() |
|
elif self.browser_engine == "pyppeteer": |
|
|
|
pass |
|
|
|
return None |
|
except Exception as e: |
|
logger.error(f"Error downloading view-only file: {e}") |
|
return None |
|
|
|
async def get_sublinks(self, url, limit=10000): |
|
"""Extract all sublinks from a website""" |
|
links = set() |
|
try: |
|
logger.info(f"Extracting sublinks from {url}") |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
edu_links = await self.get_edu_exam_links(url) |
|
for link in edu_links: |
|
links.add(link) |
|
|
|
if len(links) > 5: |
|
logger.info(f"Found {len(links)} sublinks with specialized method") |
|
return list(links)[:limit] |
|
|
|
|
|
await self.browser.goto(url, timeout=30000) |
|
|
|
|
|
content = await self.browser.content() |
|
soup = BeautifulSoup(content, 'html.parser') |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all('a', href=True): |
|
href = a['href'] |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
|
|
if href.startswith('/'): |
|
full_url = f"{base_url}{href}" |
|
elif href.startswith('http'): |
|
full_url = href |
|
else: |
|
full_url = urljoin(url, href) |
|
|
|
links.add(full_url) |
|
|
|
|
|
for iframe in soup.find_all('iframe', src=True): |
|
src = iframe['src'] |
|
if src and not src.startswith('javascript:') and not src.startswith('about:'): |
|
full_url = src if src.startswith('http') else urljoin(url, src) |
|
links.add(full_url) |
|
|
|
return list(links)[:limit] |
|
except Exception as e: |
|
logger.error(f"Error extracting sublinks: {e}") |
|
return list(links)[:limit] |
|
|
|
# Note: this task runs in a separate Celery worker process and must be importable at module
# scope; it is a plain synchronous function (it downloads with requests), not a method of
# DownloadManager.
@celery_app.task
def download_file_task(file_info, save_dir, referer=None):
    """Celery task for downloading files in a background worker"""
|
|
|
file_url = file_info['url'] |
|
fname = file_info['filename'] |
|
referer = referer or file_info.get('source_url', 'https://www.google.com') |
|
|
|
|
|
path = os.path.join(save_dir, fname) |
|
base, ext = os.path.splitext(fname) |
|
counter = 1 |
|
while os.path.exists(path): |
|
path = os.path.join(save_dir, f"{base}_{counter}{ext}") |
|
counter += 1 |
|
|
|
os.makedirs(save_dir, exist_ok=True) |
|
|
|
try: |
|
|
|
if "drive.google.com" in file_url or "docs.google.com" in file_url: |
|
|
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, file_url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if file_id: |
|
|
|
download_url = f"https://drive.google.com/uc?id={file_id}&export=download" |
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Referer': referer |
|
} |
|
|
|
with requests.get(download_url, headers=headers, stream=True) as r: |
|
if r.status_code == 200: |
|
with open(path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
f.write(chunk) |
|
|
|
|
|
with open(path, 'rb') as f: |
|
content_start = f.read(100).decode('utf-8', errors='ignore') |
|
if '<html' in content_start.lower(): |
|
os.remove(path) |
|
return {'status': 'error', 'message': 'Received HTML instead of file'} |
|
|
|
return {'status': 'success', 'path': path} |
|
|
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Referer': referer, |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br' |
|
} |
|
|
|
with requests.get(file_url, headers=headers, stream=True) as r: |
|
if r.status_code == 200: |
|
with open(path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
f.write(chunk) |
|
|
|
return {'status': 'success', 'path': path} |
|
else: |
|
return {'status': 'error', 'message': f"HTTP error: {r.status_code}"} |
|
|
|
except Exception as e: |
|
return {'status': 'error', 'message': str(e)} |
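# Enqueueing the task from application code (sketch; requires the Redis broker and a
# running Celery worker):
#
#   result = download_file_task.delay(file_info, "./downloads")
#   outcome = result.get(timeout=300)  # e.g. {'status': 'success', 'path': '...'}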
|
|
|
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): |
|
"""Perform deep search for files on a website and its subpages""" |
|
if not custom_ext_list: |
|
custom_ext_list = [] |
|
|
|
|
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
file_count_text = st.empty() |
|
|
|
try: |
|
progress_text.text("Analyzing main page...") |
|
|
|
|
|
main_files = await self.extract_downloadable_files(url, custom_ext_list) |
|
initial_count = len(main_files) |
|
file_count_text.text(f"Found {initial_count} files on main page") |
|
|
|
|
|
progress_text.text("Getting sublinks...") |
|
sublinks = await self.get_sublinks(url, sublink_limit) |
|
total_links = len(sublinks) |
|
progress_text.text(f"Found {total_links} sublinks to process") |
|
|
|
|
|
all_files = main_files.copy() |
|
|
|
|
|
for i, sublink in enumerate(sublinks, 1): |
|
progress = i / max(total_links, 1) |
|
progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") |
|
progress_bar.progress(progress) |
|
|
|
try: |
|
|
|
sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) |
|
all_files.extend(sub_files) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
except Exception as e: |
|
logger.warning(f"Error processing sublink {sublink}: {e}") |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in all_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
|
|
progress_text.text(f"Deep search complete!") |
|
file_count_text.text(f"Found {len(unique_files)} unique files") |
|
progress_bar.progress(1.0) |
|
|
|
return unique_files |
|
|
|
except Exception as e: |
|
logger.error(f"Deep search error: {e}") |
|
progress_text.text(f"Error during deep search: {str(e)}") |
|
return [] |
|
|
|
finally: |
|
await asyncio.sleep(2) |
|
if not st.session_state.get('keep_progress', False): |
|
progress_text.empty() |
|
progress_bar.empty() |
|
|
|
|
|
def main(): |
|
st.title("Advanced File Downloader") |
|
|
|
|
|
if "initialized" not in st.session_state: |
|
st.session_state.initialized = True |
|
st.session_state.discovered_files = [] |
|
st.session_state.current_url = None |
|
st.session_state.google_creds = None |
|
st.session_state.selected_files = [] |
|
st.session_state.do_deep_search = False |
|
st.session_state.deep_search_url = None |
|
st.session_state.search_results = [] |
|
st.session_state.download_urls = {} |
|
|
|
|
|
if "dependencies_installed" not in st.session_state: |
|
with st.spinner("Setting up dependencies. This may take a minute..."): |
|
st.session_state.dependencies_installed = setup_dependencies() |
|
check_services() |
|
|
|
|
|
with st.sidebar: |
|
mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Single File"], key="mode_select") |
|
|
|
with st.expander("Search Options", expanded=True): |
|
search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine") |
|
browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine") |
|
custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", |
|
help="Enter extensions like .csv, .txt") |
|
max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks") |
|
sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout") |
|
|
|
with st.expander("Advanced Options", expanded=False): |
|
use_proxy = st.checkbox("Use Proxy", key="use_proxy") |
|
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input") |
|
use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth", |
|
help="Makes browser harder to detect as automated") |
|
enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"], |
|
key="enable_intercept", |
|
help="Intercept network traffic to find additional files") |
|
if enable_network_intercept: |
|
NETWORK_INTERCEPTOR_CONFIG["enabled"] = True |
|
intercept_types = st.multiselect("Intercept Types", |
|
["xhr", "fetch", "document", "media", "stylesheet", "image", "font"], |
|
default=["xhr", "fetch", "document", "media"], |
|
key="intercept_types") |
|
NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types |
|
else: |
|
NETWORK_INTERCEPTOR_CONFIG["enabled"] = False |
|
|
|
with st.expander("Google Drive Integration", expanded=False): |
|
if st.button("Start Google Sign-In", key="google_signin_btn"): |
|
auth_url = get_google_auth_url() |
|
st.markdown(f"[Click here to authorize]({auth_url})") |
|
auth_code = st.text_input("Enter authorization code", key="auth_code_input") |
|
if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code: |
|
creds, msg = exchange_code_for_credentials(auth_code) |
|
st.session_state.google_creds = creds |
|
st.write(msg) |
|
|
|
|
|
if mode == "Manual URL": |
|
st.header("Manual URL Mode") |
|
url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input") |
|
|
|
col1, col2 = st.columns([3, 1]) |
|
with col1: |
|
if st.button("Deep Search", use_container_width=True, key="deep_search_btn"): |
|
if url: |
|
|
|
custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] |
|
|
|
with st.spinner("Searching for files..."): |
|
async def run_deep_search(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout) |
|
return files |
|
|
|
|
|
files = asyncio.run(run_deep_search()) |
|
|
|
if files: |
|
st.session_state.discovered_files = files |
|
st.session_state.current_url = url |
|
st.success(f"Found {len(files)} files!") |
|
else: |
|
st.warning("No files found.") |
|
|
|
|
|
if st.session_state.discovered_files: |
|
files = st.session_state.discovered_files |
|
|
|
|
|
col1, col2 = st.columns([1, 1]) |
|
with col1: |
|
if st.button("Select All", key="select_all_btn"): |
|
st.session_state.selected_files = list(range(len(files))) |
|
with col2: |
|
if st.button("Clear Selection", key="clear_selection_btn"): |
|
st.session_state.selected_files = [] |
|
|
|
|
|
file_options = [] |
|
for i, file in enumerate(files): |
|
filename = file['filename'] |
|
size = file['size'] |
|
meta = file.get('metadata', {}) |
|
|
|
|
|
if meta and 'Pages' in meta: |
|
file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages" |
|
else: |
|
file_info = f"{filename} ({size})" |
|
|
|
file_options.append((i, file_info)) |
|
|
|
|
|
if i not in st.session_state.download_urls: |
|
|
|
file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode() |
|
st.session_state.download_urls[i] = file_key |
|
|
|
|
|
selected_indices = st.multiselect( |
|
"Select files to download", |
|
options=[i for i, _ in file_options], |
|
default=st.session_state.selected_files, |
|
format_func=lambda i: next(info for idx, info in file_options if idx == i), |
|
key="file_multiselect" |
|
) |
|
|
|
st.session_state.selected_files = selected_indices |
|
|
|
|
|
if files: |
|
st.subheader("Available Files") |
|
for i, file in enumerate(files): |
|
with st.expander(f"{i+1}. {file['filename']} ({file['size']})"): |
|
st.write(f"Source: {file.get('source_url', 'Unknown')}") |
|
st.write(f"URL: {file['url']}") |
|
|
|
|
|
if st.button(f"Download this file", key=f"download_single_{i}"): |
|
with st.spinner(f"Downloading {file['filename']}..."): |
|
|
|
download_dir = "./downloads" |
|
os.makedirs(download_dir, exist_ok=True) |
|
|
|
|
|
async def download_single(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
return await dm.download_file(file, download_dir) |
|
|
|
file_path = asyncio.run(download_single()) |
|
|
|
if file_path: |
|
|
|
with open(file_path, "rb") as f: |
|
file_bytes = f.read() |
|
|
|
file_name = os.path.basename(file_path) |
|
mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream" |
|
|
|
st.download_button( |
|
label=f"Download {file_name}", |
|
data=file_bytes, |
|
file_name=file_name, |
|
mime=mime_type, |
|
key=f"download_btn_{i}" |
|
) |
|
|
|
st.success(f"Downloaded successfully to {file_path}") |
|
else: |
|
st.error(f"Failed to download {file['filename']}") |
|
|
|
|
|
if selected_indices: |
|
st.subheader("Batch Download Options") |
|
|
|
col1, col2, col3, col4 = st.columns(4) |
|
with col1: |
|
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input") |
|
with col2: |
|
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox") |
|
with col3: |
|
delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox") |
|
with col4: |
|
upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox") |
|
|
|
if st.button("Download Selected Files", key="batch_download_btn"): |
|
with st.spinner(f"Downloading {len(selected_indices)} files..."): |
|
if not os.path.exists(download_dir): |
|
os.makedirs(download_dir) |
|
|
|
|
|
downloaded_paths = [] |
|
progress_bar = st.progress(0) |
|
status_text = st.empty() |
|
|
|
async def download_batch(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
paths = [] |
|
for i, idx in enumerate(selected_indices): |
|
file_info = files[idx] |
|
progress = (i + 1) / len(selected_indices) |
|
status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})") |
|
progress_bar.progress(progress) |
|
|
|
path = await dm.download_file(file_info, download_dir) |
|
if path: |
|
paths.append(path) |
|
|
|
return paths |
|
|
|
downloaded_paths = asyncio.run(download_batch()) |
|
status_text.empty() |
|
progress_bar.empty() |
|
|
|
if downloaded_paths: |
|
st.success(f"Successfully downloaded {len(downloaded_paths)} files") |
|
|
|
if create_zip: |
|
zip_path = create_zip_file(downloaded_paths, download_dir) |
|
st.success(f"Created ZIP file: {zip_path}") |
|
|
|
|
|
with open(zip_path, "rb") as f: |
|
zip_data = f.read() |
|
|
|
st.download_button( |
|
label="Download ZIP", |
|
data=zip_data, |
|
file_name=os.path.basename(zip_path), |
|
mime="application/zip", |
|
key="download_zip_btn" |
|
) |
|
|
|
|
|
if upload_to_drive and st.session_state.google_creds: |
|
with st.spinner("Uploading to Google Drive..."): |
|
drive_service = googleapiclient.discovery.build( |
|
"drive", "v3", credentials=st.session_state.google_creds |
|
) |
|
folder_id = create_drive_folder( |
|
drive_service, f"Downloads_{get_domain(url)}" |
|
) |
|
drive_id = google_drive_upload( |
|
zip_path, st.session_state.google_creds, folder_id |
|
) |
|
|
|
if not isinstance(drive_id, str) or not drive_id.startswith("Error"): |
|
st.success(f"Uploaded to Google Drive. File ID: {drive_id}") |
|
else: |
|
st.error(drive_id) |
|
|
|
|
|
if delete_after: |
|
for path in downloaded_paths: |
|
try: |
|
os.remove(path) |
|
except Exception as e: |
|
st.warning(f"Could not delete {path}: {e}") |
|
st.info("Deleted original files after ZIP creation") |
|
|
|
elif mode == "Web Search": |
|
st.header("Web Search Mode") |
|
|
|
|
|
query = st.text_input("Enter search query", placeholder="example file type:pdf", key="search_query") |
|
num_results = st.slider("Number of results", 1, 50, 10, key="num_results") |
|
|
|
if st.button("Search", key="web_search_btn"): |
|
if query: |
|
with st.spinner("Searching the web..."): |
|
async def run_search(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
query=query, |
|
num_results=num_results, |
|
use_stealth=use_stealth |
|
) as dm: |
|
urls = await dm.search_web(search_engine) |
|
return urls |
|
|
|
urls = asyncio.run(run_search()) |
|
|
|
if urls: |
|
st.session_state.search_results = urls |
|
st.success(f"Found {len(urls)} results!") |
|
|
|
|
|
for i, url in enumerate(urls, 1): |
|
with st.expander(f"Result {i}: {url}", expanded=(i == 1)): |
|
st.write(f"URL: {url}") |
|
if st.button(f"Search for files", key=f"search_result_{i}"): |
|
st.session_state.deep_search_url = url |
|
st.session_state.do_deep_search = True |
|
else: |
|
st.warning("No search results found.") |
|
|
|
|
|
if st.session_state.do_deep_search and st.session_state.deep_search_url: |
|
url = st.session_state.deep_search_url |
|
st.info(f"Searching for files on: {url}") |
|
|
|
|
|
st.session_state.do_deep_search = False |
|
|
|
|
|
custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] |
|
|
|
with st.spinner("Searching for files..."): |
|
async def deep_search_result(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
return await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout) |
|
|
|
files = asyncio.run(deep_search_result()) |
|
|
|
if files: |
|
st.session_state.discovered_files = files |
|
st.session_state.current_url = url |
|
st.success(f"Found {len(files)} files!") |
|
else: |
|
st.warning("No files found on this page.") |
|
|
|
elif mode == "Single File": |
|
st.header("Single File Download") |
|
|
|
|
|
with st.expander("Download View-Only Google Drive Document", expanded=True): |
|
st.write("Download protected/view-only Google Drive documents") |
|
|
|
file_id = st.text_input( |
|
"Google Drive File ID", |
|
placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view", |
|
key="drive_file_id" |
|
) |
|
|
|
if st.button("Download Document", key="drive_download_btn") and file_id: |
|
with st.spinner("Downloading view-only document... (this may take a minute)"): |
|
|
|
download_dir = "./downloads" |
|
os.makedirs(download_dir, exist_ok=True) |
|
|
|
|
|
output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf") |
|
|
|
|
|
async def download_drive_file(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
file_info = { |
|
'url': f"https://drive.google.com/file/d/{file_id}/view", |
|
'filename': f"gdrive_{file_id}.pdf", |
|
'metadata': {'file_id': file_id, 'view_only': True} |
|
} |
|
return await dm.download_viewonly_google_drive(file_info, output_path) |
|
|
|
result_path = asyncio.run(download_drive_file()) |
|
|
|
if result_path: |
|
st.success("Document downloaded successfully!") |
|
|
|
|
|
with open(result_path, "rb") as f: |
|
file_bytes = f.read() |
|
|
|
st.download_button( |
|
label="Download PDF", |
|
data=file_bytes, |
|
file_name=os.path.basename(result_path), |
|
mime="application/pdf", |
|
key="drive_pdf_download" |
|
) |
|
else: |
|
st.error("Failed to download the document. Please check the file ID and try again.") |
|
|
|
|
|
with st.expander("Download from Direct URL", expanded=True): |
|
st.write("Download a file from a direct URL") |
|
|
|
file_url = st.text_input( |
|
"File URL", |
|
placeholder="https://example.com/file.pdf", |
|
key="direct_url" |
|
) |
|
|
|
file_name = st.text_input( |
|
"Save as (optional)", |
|
placeholder="Leave blank to use original filename", |
|
key="save_filename" |
|
) |
|
|
|
if st.button("Download File", key="direct_download_btn") and file_url: |
|
with st.spinner("Downloading file..."): |
|
|
|
download_dir = "./downloads" |
|
os.makedirs(download_dir, exist_ok=True) |
|
|
|
|
|
if not file_name: |
|
file_name = os.path.basename(urlparse(file_url).path) |
|
if not file_name or file_name == '/': |
|
file_name = f"downloaded_file_{int(time.time())}{get_file_extension(file_url)}" |
|
|
|
|
|
file_info = { |
|
'url': file_url, |
|
'filename': file_name, |
|
'metadata': {} |
|
} |
|
|
|
|
|
async def download_direct_file(): |
|
async with DownloadManager( |
|
browser_engine=browser_engine, |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
return await dm.download_file(file_info, download_dir) |
|
|
|
file_path = asyncio.run(download_direct_file()) |
|
|
|
if file_path: |
|
st.success(f"File downloaded successfully to {file_path}") |
|
|
|
|
|
with open(file_path, "rb") as f: |
|
file_bytes = f.read() |
|
|
|
mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream" |
|
|
|
st.download_button( |
|
label=f"Download {os.path.basename(file_path)}", |
|
data=file_bytes, |
|
file_name=os.path.basename(file_path), |
|
mime=mime_type, |
|
key="direct_file_download" |
|
) |
|
else: |
|
st.error("Failed to download the file. Please check the URL and try again.") |
|
|
|
|
|
st.markdown("---") |
|
st.markdown("Created by [Euler314](https://github.com/euler314) | Enhanced with advanced scraping technologies") |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |