import streamlit as st

st.set_page_config(page_title="Advanced File Downloader", layout="wide")

import os
import subprocess
import asyncio
import logging
import re
import random
import zipfile
import tempfile
import mimetypes
import datetime
import traceback
import base64
import shutil
import json
import time
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse, urljoin, unquote

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from PIL import Image
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

import google_auth_oauthlib.flow
import googleapiclient.discovery
import google.auth.transport.requests
import googleapiclient.http

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

GOOGLE_OAUTH_CONFIG = {
    "web": {
        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
        "project_id": "huggingface-449214",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": "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",
        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
    }
}

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0'
]

STEALTH_SETTINGS = {
    "hardware_concurrency": 4,
    "device_memory": 8,
    "webgl_vendor": "Google Inc. (Intel)",
    "webgl_renderer": "Intel Iris OpenGL Engine",
    "languages": ["en-US", "en"],
    "disable_webrtc": True,
    "navigator_platform": "Win32",
    "touch_support": False
}

PROXY_ROTATION_CONFIG = {
    "enabled": False,
    "rotation_interval": 10,
    "proxies": []
}

def get_random_user_agent():
    """Return a random User-Agent string from USER_AGENTS."""
    return random.choice(USER_AGENTS)


def sizeof_fmt(num, suffix='B'):
    """Format a byte count using binary prefixes (K, M, G, ...)."""
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Y{suffix}"


def create_zip_file(file_paths, output_dir):
    """Bundle the given files into a timestamped ZIP in output_dir and return its path."""
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file_path in file_paths:
            zipf.write(file_path, os.path.basename(file_path))
    return zip_path


def get_file_extension(url, default='.pdf'):
    """Extract the file extension from a URL, falling back to a default."""
    path = urlparse(url).path
    ext = os.path.splitext(path)[1].lower()
    return ext if ext else default

def humanize_file_size(size_bytes):
    """Format a file size in a human-readable way."""
    if size_bytes < 1024:
        return f"{size_bytes} bytes"
    for unit in ['KB', 'MB', 'GB', 'TB']:
        size_bytes /= 1024.0
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
    # After the loop the value is expressed in TB, so divide once more for PB.
    return f"{size_bytes / 1024.0:.1f} PB"


def get_domain(url):
    """Extract the domain (netloc) from a URL."""
    return urlparse(url).netloc


def is_valid_file_url(url, extensions):
    """Check whether a URL ends with one of the given file extensions."""
    return any(url.lower().endswith(ext) for ext in extensions)


def detect_captcha(html_content):
    """Detect common captcha markers in HTML content."""
    captcha_patterns = [
        'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
        'challenge', 'solve the following', 'verify you are human'
    ]
    html_lower = html_content.lower()
    return any(pattern in html_lower for pattern in captcha_patterns)

def get_google_auth_url():
    """Build the Google OAuth consent URL for the Drive file scope."""
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": client_config},
        scopes=["https://www.googleapis.com/auth/drive.file"]
    )
    flow.redirect_uri = client_config["redirect_uris"][0]
    authorization_url, _ = flow.authorization_url(
        access_type="offline",
        include_granted_scopes="true",
        prompt="consent"
    )
    return authorization_url


def exchange_code_for_credentials(auth_code):
    """Exchange an OAuth authorization code for Google credentials."""
    if not auth_code.strip():
        return None, "No code provided."
    try:
        client_config = GOOGLE_OAUTH_CONFIG["web"]
        flow = google_auth_oauthlib.flow.Flow.from_client_config(
            {"web": client_config},
            scopes=["https://www.googleapis.com/auth/drive.file"]
        )
        flow.redirect_uri = client_config["redirect_uris"][0]
        flow.fetch_token(code=auth_code.strip())
        creds = flow.credentials
        if not creds or not creds.valid:
            return None, "Could not validate credentials. Check code and try again."
        return creds, "Google Sign-In successful!"
    except Exception as e:
        return None, f"Error during token exchange: {e}"

def google_drive_upload(file_path, credentials, folder_id=None):
    """Upload a local file to Google Drive and return the new file ID (or an error string)."""
    try:
        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
        file_metadata = {'name': os.path.basename(file_path)}
        if folder_id:
            file_metadata['parents'] = [folder_id]
        media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
        created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        return created.get("id", "")
    except Exception as e:
        return f"Error uploading to Drive: {str(e)}"


def create_drive_folder(drive_service, name):
    """Create a folder in Google Drive and return its ID."""
    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
    return folder.get('id')

def install_playwright_dependencies():
    """Install the system packages and browser binaries that Playwright's Chromium needs."""
    try:
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")

        subprocess.run(['apt-get', 'update', '-y'], check=True)
        packages = [
            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
        ]
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)

        subprocess.run(['pip', 'install', 'playwright'], check=True)
        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)

        st.success("Playwright dependencies installed successfully!")
    except Exception as e:
        st.error(f"Error installing Playwright dependencies: {e}")
        st.info("You may need to manually install dependencies. Check console for details.")
        logger.error(f"Playwright setup error: {e}")
        traceback.print_exc()
|
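# Illustrative usage sketch (assumed entry point; this module only defines the pieces):
#
#     async def crawl(query):
#         async with DownloadManager(query=query, num_results=5) as dm:
#             for url in await dm.search_bing():
#                 for info in await dm.extract_downloadable_files(url, custom_ext_list=[]):
#                     await dm.download_file(info, save_dir="downloads", referer=url)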
class DownloadManager: |
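    """Playwright-driven crawler that discovers downloadable files on web pages and
    fetches them, with optional stealth hardening, proxy rotation, and special
    handling for Google Drive links (including view-only files)."""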
|
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): |
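        """Store crawl/download options; the browser itself is started in __aenter__."""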
|
self.use_proxy = use_proxy |
|
self.proxy = proxy |
|
self.query = query |
|
self.num_results = num_results |
|
self.playwright = None |
|
self.browser = None |
|
self.context = None |
|
self.page = None |
|
self.use_stealth = use_stealth |
|
self.proxy_rotation = proxy_rotation |
|
self.request_count = 0 |
|
self.captcha_detected = False |
|
self.download_timeout = 300 |
|
|
|
async def __aenter__(self): |
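        """Start Playwright, launch headless Chromium, and prepare a context and page."""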
|
self.playwright = await async_playwright().start() |
|
|
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-gpu', |
|
'--no-zygote', |
|
'--single-process', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins', |
|
'--disable-site-isolation-trials' |
|
] |
|
|
|
|
|
if self.use_stealth: |
|
browser_args.extend([ |
|
'--disable-blink-features=AutomationControlled', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-webgl', |
|
'--disable-webrtc' |
|
]) |
|
|
|
|
|
opts = { |
|
"headless": True, |
|
"args": browser_args |
|
} |
|
|
|
|
|
if self.use_proxy and self.proxy: |
|
opts["proxy"] = {"server": self.proxy} |
|
|
|
|
|
self.browser = await self.playwright.chromium.launch(**opts) |
|
|
|
|
|
context_opts = { |
|
"user_agent": get_random_user_agent(), |
|
"viewport": {"width": 1920, "height": 1080}, |
|
"device_scale_factor": 1, |
|
"has_touch": False, |
|
"is_mobile": False, |
|
"ignore_https_errors": True, |
|
"accept_downloads": True |
|
} |
|
|
|
|
|
if self.use_stealth: |
|
|
|
context_opts["bypass_csp"] = True |
|
self.context = await self.browser.new_context(**context_opts) |
|
|
|
|
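            # Mask common automation fingerprints: navigator.webdriver, plugins,
            # languages, hardwareConcurrency, deviceMemory, and WebGL vendor strings.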
|
await self.context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change navigator properties |
|
const newProto = navigator.__proto__; |
|
delete newProto.webdriver; |
|
|
|
// Overwrite the plugins |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
// Handle languages more naturally |
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
|
|
// Modify hardware concurrency |
|
Object.defineProperty(navigator, 'hardwareConcurrency', { |
|
get: () => 4 |
|
}); |
|
|
|
// Modify deviceMemory |
|
Object.defineProperty(navigator, 'deviceMemory', { |
|
get: () => 8 |
|
}); |
|
|
|
// WebGL modifications |
|
const getParameter = WebGLRenderingContext.prototype.getParameter; |
|
WebGLRenderingContext.prototype.getParameter = function(parameter) { |
|
if (parameter === 37445) { |
|
return 'Intel Inc.'; |
|
} |
|
if (parameter === 37446) { |
|
return 'Intel Iris OpenGL Engine'; |
|
} |
|
return getParameter.apply(this, arguments); |
|
}; |
|
} |
|
""") |
|
else: |
|
|
|
self.context = await self.browser.new_context(**context_opts) |
|
|
|
|
|
self.page = await self.context.new_page() |
|
await self.page.set_extra_http_headers({ |
|
'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', |
|
'Cache-Control': 'max-age=0', |
|
'DNT': '1', |
|
'Referer': 'https://www.google.com/', |
|
'Sec-Fetch-Dest': 'document', |
|
'Sec-Fetch-Mode': 'navigate', |
|
'Sec-Fetch-Site': 'cross-site', |
|
'Sec-Fetch-User': '?1', |
|
'Upgrade-Insecure-Requests': '1' |
|
}) |
|
|
|
|
|
if self.use_stealth: |
|
await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) |
|
await self.page.wait_for_timeout(random.randint(200, 500)) |
|
|
|
return self |
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb): |
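        """Close the browser (and its contexts/pages) and stop Playwright."""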
|
if self.browser: |
|
await self.browser.close() |
|
if self.playwright: |
|
await self.playwright.stop() |
|
|
|
async def rotate_proxy_if_needed(self): |
|
"""Rotate proxy if proxy rotation is enabled and threshold is reached""" |
|
if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: |
|
self.request_count += 1 |
|
if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: |
|
|
|
next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) |
|
PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) |
|
|
|
|
|
if self.context: |
|
await self.context.close() |
|
|
|
|
|
context_opts = { |
|
"user_agent": get_random_user_agent(), |
|
"proxy": {"server": next_proxy}, |
|
"accept_downloads": True |
|
} |
|
self.context = await self.browser.new_context(**context_opts) |
|
self.page = await self.context.new_page() |
|
|
|
|
|
self.request_count = 0 |
|
logger.info(f"Rotated to new proxy: {next_proxy}") |
|
|
|
async def handle_captcha(self, page): |
|
"""Detect and handle captchas if possible""" |
|
|
|
content = await page.content() |
|
if detect_captcha(content): |
|
self.captcha_detected = True |
|
logger.warning("Captcha detected on page") |
|
|
|
|
|
|
|
captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') |
|
if captcha_img: |
|
logger.info("Found captcha image, attempting to capture") |
|
|
|
|
|
captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") |
|
await captcha_img.screenshot(path=captcha_path) |
|
|
|
|
|
|
|
logger.info(f"Captcha image saved to {captcha_path}") |
|
|
|
|
|
return False |
|
|
|
|
|
recaptcha = await page.query_selector('iframe[src*="recaptcha"]') |
|
if recaptcha: |
|
logger.warning("reCAPTCHA detected, would require external solving service") |
|
return False |
|
|
|
|
|
await self.perform_human_actions(page) |
|
|
|
|
|
content = await page.content() |
|
if detect_captcha(content): |
|
logger.warning("Captcha still present after human-like actions") |
|
return False |
|
else: |
|
logger.info("Captcha appears to be resolved") |
|
return True |
|
|
|
return True |
|
|
|
async def perform_human_actions(self, page): |
|
"""Perform human-like actions on the page to possibly bypass simple bot checks""" |
|
try: |
|
|
|
for i in range(3): |
|
await page.evaluate(f"window.scrollTo(0, {i * 300})") |
|
await page.wait_for_timeout(random.randint(300, 700)) |
|
|
|
|
|
for _ in range(3): |
|
x = random.randint(100, 800) |
|
y = random.randint(100, 600) |
|
await page.mouse.move(x=x, y=y) |
|
await page.wait_for_timeout(random.randint(200, 500)) |
|
|
|
|
|
try: |
|
await page.click("body", position={"x": 50, "y": 50}) |
|
except: |
|
pass |
|
|
|
|
|
await page.wait_for_timeout(1000) |
|
|
|
except Exception as e: |
|
logger.warning(f"Error during human-like actions: {e}") |
|
|
|
async def search_bing(self): |
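        """Run a Bing search for self.query and return up to self.num_results result URLs."""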
|
urls = [] |
|
try: |
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
search_url = f"https://www.bing.com/search?q={self.query}" |
|
await self.page.goto(search_url, timeout=30000) |
|
await self.page.wait_for_load_state('networkidle') |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected during search, results may be limited") |
|
|
|
|
|
for i in range(3): |
|
await self.page.evaluate(f"window.scrollTo(0, {i * 400})") |
|
await self.page.wait_for_timeout(random.randint(300, 800)) |
|
|
|
|
|
links = await self.page.query_selector_all("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = await link.get_attribute('href') |
|
if href: |
|
urls.append(href) |
|
|
|
|
|
if len(urls) < self.num_results: |
|
alt_links = await self.page.query_selector_all(".b_caption a") |
|
for link in alt_links: |
|
href = await link.get_attribute('href') |
|
if href and href not in urls: |
|
urls.append(href) |
|
if len(urls) >= self.num_results: |
|
break |
|
|
|
return urls |
|
except Exception as e: |
|
logger.error(f"Error searching Bing: {e}") |
|
return [] |
|
|
|
    async def get_file_size(self, url):
        """Return a human-readable size for a remote file, using a HEAD request."""
        try:
            await self.rotate_proxy_if_needed()

            page = await self.context.new_page()
            try:
                response = await page.request.head(url, timeout=15000)
                # Playwright exposes response headers with lowercase names.
                length = response.headers.get('content-length')
                if length:
                    return sizeof_fmt(int(length))
                return "Unknown Size"
            finally:
                await page.close()
        except Exception as e:
            logger.warning(f"Error getting file size: {e}")
            return "Unknown Size"
|
|
|
    async def get_pdf_metadata(self, url):
        """Fetch a PDF and return basic metadata (title, author, page count)."""
        try:
            await self.rotate_proxy_if_needed()

            page = await self.context.new_page()
            try:
                resp = await page.request.get(url, timeout=15000)
                if not resp.ok:
                    return {}
                content = await resp.body()
                reader = PdfReader(BytesIO(content))
                return {
                    'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
                    'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
                    'Pages': len(reader.pages),
                }
            finally:
                await page.close()
        except Exception as e:
            logger.warning(f"Error reading PDF metadata: {e}")
            return {}
|
|
|
    async def extract_real_download_url(self, url):
        """Follow a link (and any redirects) and return the final download URL."""
        try:
            await self.rotate_proxy_if_needed()

            page = await self.context.new_page()
            try:
                response = await page.goto(url, wait_until='networkidle', timeout=30000)
                if response and response.headers.get('location'):
                    return response.headers['location']
                return page.url
            finally:
                await page.close()
        except Exception as e:
            logger.error(f"Error extracting real download URL: {e}")
            return url
|
|
|
|
|
async def get_edu_exam_links(self, url): |
|
"""Specialized method for educational exam websites that follows a common pattern.""" |
|
try: |
|
logger.info(f"Fetching exam links from {url}") |
|
links = set() |
|
|
|
|
|
headers = { |
|
"User-Agent": get_random_user_agent(), |
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", |
|
"Accept-Language": "en-US,en;q=0.9", |
|
"Referer": "https://www.google.com/", |
|
"DNT": "1" |
|
} |
|
|
|
try: |
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
|
if response.status_code == 200: |
|
|
|
soup = BeautifulSoup(response.text, "html.parser") |
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all("a", href=True): |
|
href = a["href"] |
|
full_url = urljoin(url, href) |
|
|
|
|
|
link_text = a.get_text().lower() |
|
|
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_", |
|
"/resource/", "/subject/", "/course/", "/material/" |
|
] |
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"subject", "course", "resource", "pdf", "document", |
|
"view", "open", "get", "solution", "answer" |
|
] |
|
|
|
|
|
if any(pattern in full_url.lower() for pattern in url_patterns): |
|
links.add(full_url) |
|
continue |
|
|
|
|
|
if any(pattern in link_text for pattern in text_patterns): |
|
links.add(full_url) |
|
continue |
|
|
|
|
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Request-based extraction failed: {e}") |
|
|
|
|
|
try: |
|
|
|
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url: |
|
logger.info("Using browser for enhanced link extraction") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
await self.page.goto(url, timeout=45000, wait_until='networkidle') |
|
await self.page.wait_for_timeout(random.randint(1000, 2000)) |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected, extraction may be limited") |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
page_height = await self.page.evaluate("document.body.scrollHeight") |
|
viewport_height = await self.page.evaluate("window.innerHeight") |
|
|
|
for scroll_pos in range(0, page_height, viewport_height // 2): |
|
await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") |
|
await self.page.wait_for_timeout(random.randint(300, 800)) |
|
|
|
|
|
await self.page.evaluate("window.scrollTo(0, 0)") |
|
await self.page.wait_for_timeout(500) |
|
|
|
|
|
all_links = await self.page.evaluate(""" |
|
() => { |
|
const results = []; |
|
|
|
// Get all anchor tags |
|
const anchors = document.querySelectorAll('a[href]'); |
|
for (const a of anchors) { |
|
if (a.href) { |
|
results.push({ |
|
href: a.href, |
|
text: a.innerText || a.textContent || '', |
|
isButton: a.classList.contains('btn') || a.role === 'button' |
|
}); |
|
} |
|
} |
|
|
|
// Get buttons that might contain links |
|
const buttons = document.querySelectorAll('button'); |
|
for (const btn of buttons) { |
|
const onclick = btn.getAttribute('onclick') || ''; |
|
if (onclick.includes('window.location') || onclick.includes('download')) { |
|
results.push({ |
|
href: '#button', |
|
text: btn.innerText || btn.textContent || '', |
|
isButton: true, |
|
onclick: onclick |
|
}); |
|
} |
|
} |
|
|
|
return results; |
|
} |
|
""") |
|
|
|
|
|
for link_info in all_links: |
|
href = link_info.get('href', '') |
|
text = link_info.get('text', '').lower() |
|
|
|
if href and href != '#button': |
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_" |
|
] |
|
|
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"pdf", "document", "view", "open", "solution" |
|
] |
|
|
|
if any(pattern in href.lower() for pattern in url_patterns) or \ |
|
any(pattern in text for pattern in text_patterns) or \ |
|
any(href.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(href) |
|
|
|
|
|
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') |
|
for grid in grid_elements: |
|
grid_links = await grid.query_selector_all('a[href]') |
|
for a in grid_links: |
|
href = await a.get_attribute('href') |
|
text = await a.text_content() |
|
|
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
links.add(full_url) |
|
|
|
|
|
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') |
|
for i, button in enumerate(pagination_buttons[:5]): |
|
try: |
|
|
|
button_text = await button.text_content() |
|
if button_text and button_text.strip().isdigit(): |
|
logger.info(f"Clicking pagination button: {button_text}") |
|
await button.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_page_links = await self.page.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); |
|
} |
|
""") |
|
|
|
for href in new_page_links: |
|
if href and not href.startswith('javascript:'): |
|
if any(pattern in href.lower() for pattern in url_patterns) or \ |
|
any(href.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(href) |
|
except Exception as e: |
|
logger.warning(f"Error clicking pagination button: {e}") |
|
|
|
|
|
show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') |
|
for button in show_buttons: |
|
button_text = (await button.text_content() or "").lower() |
|
button_value = (await button.get_attribute("value") or "").lower() |
|
button_id = (await button.get_attribute("id") or "").lower() |
|
|
|
|
|
promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", |
|
"download", "resource", "material", "browse", "file"] |
|
|
|
if any(term in button_text or term in button_value or term in button_id |
|
for term in promising_terms): |
|
try: |
|
logger.info(f"Clicking button: {button_text or button_value}") |
|
await button.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
|
|
|
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ |
|
any(pattern in full_url.lower() for pattern in url_patterns): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
try: |
|
|
|
postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') |
|
for i, element in enumerate(postback_elements[:10]): |
|
try: |
|
onclick = await element.get_attribute('onclick') |
|
if onclick and '__doPostBack' in onclick: |
|
element_text = await element.text_content() |
|
|
|
|
|
promising_terms = ["show", "view", "list", "exam", "paper", "test", |
|
"download", "resource", "material"] |
|
|
|
if any(term in element_text.lower() for term in promising_terms): |
|
logger.info(f"Clicking ASP.NET postback element: {element_text}") |
|
|
|
|
|
await element.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with postback element: {e}") |
|
except Exception as e: |
|
logger.warning(f"Error during postback handling: {e}") |
|
|
|
except Exception as e: |
|
logger.error(f"Browser-based extraction failed: {e}") |
|
|
|
|
|
filtered_links = [] |
|
for link in links: |
|
|
|
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if any(pattern in link.lower() for pattern in [ |
|
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", |
|
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", |
|
"/resource/", "/material/", "/notes/", "/subjectmaterial/" |
|
]): |
|
filtered_links.append(link) |
|
|
|
logger.info(f"Found {len(filtered_links)} potential exam document links") |
|
return filtered_links |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting exam links: {e}") |
|
return [] |
|
|
|
async def extract_downloadable_files(self, url, custom_ext_list): |
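        """Crawl a page and return a de-duplicated list of candidate downloads
        (direct file links, Google Drive links, embedded and hidden sources)."""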
|
found_files = [] |
|
try: |
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized handler for educational exam site") |
|
|
|
|
|
exam_links = await self.get_edu_exam_links(url) |
|
|
|
for link in exam_links: |
|
|
|
real_url = await self.extract_real_download_url(link) |
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
if not filename or filename == '/': |
|
domain = get_domain(real_url) |
|
ext = get_file_extension(real_url, '.pdf') |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
meta = {} |
|
if real_url.lower().endswith('.pdf'): |
|
try: |
|
meta = await self.get_pdf_metadata(real_url) |
|
except Exception: |
|
pass |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
if found_files: |
|
return found_files |
|
|
|
|
|
response = await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
if not response: |
|
return [] |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected, file extraction may be limited") |
|
|
|
|
|
await self.page.evaluate(""" |
|
(async () => { |
|
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); |
|
const height = document.body.scrollHeight; |
|
const scrollStep = Math.floor(window.innerHeight / 2); |
|
|
|
for (let i = 0; i < height; i += scrollStep) { |
|
window.scrollTo(0, i); |
|
await delay(100); |
|
} |
|
|
|
window.scrollTo(0, 0); |
|
})() |
|
""") |
|
await self.page.wait_for_timeout(1000) |
|
|
|
final_url = self.page.url |
|
if '.php' in final_url or 'download' in final_url: |
|
real_url = await self.extract_real_download_url(final_url) |
|
if real_url != final_url: |
|
|
|
response = await self.page.request.head(real_url, timeout=15000) |
|
filename = None |
|
|
|
|
|
                    content_disposition = response.headers.get('content-disposition', '')  # Playwright lowercases header names
|
if 'filename=' in content_disposition: |
|
filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) |
|
if filename_match: |
|
filename = filename_match.group(1) |
|
|
|
|
|
if not filename: |
|
filename = os.path.basename(urlparse(real_url).path) |
|
if not filename or filename == '/': |
|
|
|
domain = get_domain(real_url) |
|
ext = get_file_extension(real_url, '.pdf') |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {} |
|
}) |
|
return found_files |
|
|
|
await self.page.wait_for_load_state('networkidle', timeout=30000) |
|
content = await self.page.content() |
|
soup = BeautifulSoup(content, 'html.parser') |
|
|
|
default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', |
|
'.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', |
|
'.pptx', '.odt', '.txt'] |
|
all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) |
|
|
|
parsed_base = urlparse(final_url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
for a in soup.find_all('a', href=True): |
|
href = a['href'].strip() |
|
|
|
if '.php' in href.lower() or 'download' in href.lower(): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
real_url = await self.extract_real_download_url(full_url) |
|
if real_url and real_url != full_url: |
|
found_files.append({ |
|
'url': real_url, |
|
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {} |
|
}) |
|
continue |
|
|
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
elif ("drive.google.com" in href) or ("docs.google.com" in href): |
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, href) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
if file_id: |
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
|
|
|
|
filename = f"gdrive_{file_id}" |
|
if file_type: |
|
filename = f"{filename}.{file_type}" |
|
|
|
size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") |
|
|
|
found_files.append({ |
|
'url': href, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': { |
|
'view_only': is_view_only, |
|
'file_type': file_type, |
|
'file_id': file_id |
|
} |
|
}) |
|
|
|
|
|
other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) |
|
for elem in other_elements: |
|
src = elem.get('src') or elem.get('data') |
|
if src and any(src.lower().endswith(ext) for ext in all_exts): |
|
file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') |
|
for elem in onclick_elements: |
|
onclick = await elem.get_attribute('onclick') |
|
urls = re.findall(r'(https?://[^\'"]+)', onclick) |
|
for url_match in urls: |
|
if any(url_match.lower().endswith(ext) for ext in all_exts): |
|
size_str = await self.get_file_size(url_match) |
|
meta = {} |
|
if url_match.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(url_match) |
|
found_files.append({ |
|
'url': url_match, |
|
'filename': os.path.basename(url_match.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') |
|
for elem in data_elements: |
|
for attr in ['data-src', 'data-url', 'data-href', 'data-download']: |
|
try: |
|
value = await elem.get_attribute(attr) |
|
if value and any(value.lower().endswith(ext) for ext in all_exts): |
|
file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': await self.get_file_size(file_url), |
|
'metadata': {} |
|
}) |
|
except: |
|
pass |
|
|
|
|
|
script_elements = soup.find_all('script', type='application/json') |
|
for script in script_elements: |
|
try: |
|
json_data = json.loads(script.string) |
|
|
|
def extract_urls_from_json(obj, urls_found=None): |
|
if urls_found is None: |
|
urls_found = [] |
|
if isinstance(obj, dict): |
|
for k, v in obj.items(): |
|
|
|
url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] |
|
if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): |
|
urls_found.append(v) |
|
else: |
|
extract_urls_from_json(v, urls_found) |
|
elif isinstance(obj, list): |
|
for item in obj: |
|
extract_urls_from_json(item, urls_found) |
|
return urls_found |
|
|
|
json_urls = extract_urls_from_json(json_data) |
|
for json_url in json_urls: |
|
if any(json_url.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': json_url, |
|
'filename': os.path.basename(json_url.split('?')[0]), |
|
'size': await self.get_file_size(json_url), |
|
'metadata': {} |
|
}) |
|
except: |
|
pass |
|
|
|
|
|
hidden_elements = await self.page.evaluate(""" |
|
() => { |
|
const results = []; |
|
|
|
// Check for hidden forms with download actions |
|
const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); |
|
for (const form of forms) { |
|
const action = form.getAttribute('action') || ''; |
|
results.push({ |
|
type: 'form', |
|
action: action, |
|
inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { |
|
return {name: input.name, value: input.value}; |
|
}) |
|
}); |
|
} |
|
|
|
// Check for hidden download links/buttons |
|
const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { |
|
const style = window.getComputedStyle(a); |
|
return (style.display === 'none' || style.visibility === 'hidden') && |
|
(a.href.includes('download') || a.href.includes('file')); |
|
}); |
|
|
|
for (const link of hiddenLinks) { |
|
results.push({ |
|
type: 'link', |
|
href: link.href, |
|
text: link.innerText || link.textContent |
|
}); |
|
} |
|
|
|
return results; |
|
} |
|
""") |
|
|
|
|
|
for elem in hidden_elements: |
|
if elem['type'] == 'link' and 'href' in elem: |
|
href = elem['href'] |
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': href, |
|
'filename': os.path.basename(href.split('?')[0]), |
|
'size': await self.get_file_size(href), |
|
'metadata': {} |
|
}) |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in found_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
return unique_files |
|
except Exception as e: |
|
logger.error(f"Error extracting files from {url}: {e}") |
|
traceback.print_exc() |
|
return [] |
|
|
|
async def download_file(self, file_info, save_dir, referer): |
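        """Download one file described by file_info into save_dir and return the
        saved path, or None if every download strategy fails."""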
|
file_url = file_info['url'] |
|
fname = file_info['filename'] |
|
path = os.path.join(save_dir, fname) |
|
base, ext = os.path.splitext(fname) |
|
counter = 1 |
|
while os.path.exists(path): |
|
path = os.path.join(save_dir, f"{base}_{counter}{ext}") |
|
counter += 1 |
|
os.makedirs(save_dir, exist_ok=True) |
|
|
|
try: |
|
|
|
if "drive.google.com" in file_url or "docs.google.com" in file_url: |
|
|
|
is_view_only = file_info.get('metadata', {}).get('view_only', False) |
|
|
|
|
|
if is_view_only: |
|
logger.info(f"Attempting to download view-only file: {file_url}") |
|
result_path = await self.force_download_viewonly(file_info, path) |
|
if result_path: |
|
return result_path |
|
|
|
|
|
logger.info("Primary method failed, trying fallback methods") |
|
|
|
|
|
success = await self.download_from_google_drive(file_url, path) |
|
if success: |
|
return path |
|
|
|
|
|
logger.warning("All standard methods failed, attempting force download") |
|
result_path = await self.force_download_viewonly(file_info, path) |
|
return result_path if result_path else None |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
try: |
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer, |
|
'DNT': '1' |
|
} |
|
|
|
with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: |
|
if response.status_code == 200: |
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if 'text/html' in content_type and not file_url.endswith('.html'): |
|
logger.warning(f"Received HTML instead of expected file: {file_url}") |
|
else: |
|
with open(path, 'wb') as f: |
|
for chunk in response.iter_content(chunk_size=8192): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
return path |
|
except Exception as e: |
|
logger.warning(f"Direct download failed: {e}, trying browser approach") |
|
|
|
|
|
async with self.context.new_page() as page: |
|
headers = { |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer |
|
} |
|
|
|
|
|
try: |
|
response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) |
|
if response.status == 200: |
|
content = await response.body() |
|
with open(path, 'wb') as f: |
|
f.write(content) |
|
return path |
|
else: |
|
logger.error(f"Download failed with status {response.status}: {file_url}") |
|
|
|
|
|
error_info = await response.text() |
|
logger.debug(f"Error response: {error_info[:200]}...") |
|
|
|
|
|
if detect_captcha(error_info): |
|
logger.warning("Captcha detected during download") |
|
|
|
|
|
except PlaywrightTimeoutError: |
|
logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") |
|
|
|
|
|
try: |
|
logger.info("Trying browser download manager approach") |
|
download_promise = page.wait_for_event("download") |
|
await page.goto(file_url, timeout=60000) |
|
|
|
|
|
download = await download_promise |
|
await download.save_as(path) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
return path |
|
except Exception as e: |
|
logger.error(f"Browser download manager approach failed: {e}") |
|
|
|
return None |
|
except Exception as e: |
|
logger.error(f"Error downloading {file_url}: {e}") |
|
return None |
|
|
|
|
|
async def force_download_viewonly(self, file_info, save_path): |
|
"""Completely rewritten method to handle view-only files reliably, especially multi-page PDFs""" |
|
try: |
|
|
|
file_id = file_info.get('metadata', {}).get('file_id') |
|
if not file_id: |
|
url = file_info['url'] |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error("Could not extract file ID") |
|
return None |
|
|
|
file_type = file_info.get('metadata', {}).get('file_type', 'pdf') |
|
base, ext = os.path.splitext(save_path) |
|
if not ext: |
|
save_path = f"{base}.{file_type}" |
|
|
|
logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") |
|
|
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-site-isolation-trials', |
|
'--disable-blink-features=AutomationControlled' |
|
] |
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=browser_args |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
device_scale_factor=2.0, |
|
accept_downloads=True |
|
) |
|
|
|
|
|
await context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change plugins |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
// Handle languages |
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
|
|
// Modify hardware concurrency |
|
Object.defineProperty(navigator, 'hardwareConcurrency', { |
|
get: () => 4 |
|
}); |
|
} |
|
""") |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
content = await page.content() |
|
if "the owner has not granted you permission to" in content: |
|
logger.warning("Permission denied error detected") |
|
|
|
|
|
await page.wait_for_timeout(random.randint(3000, 7000)) |
|
|
|
|
|
temp_dir = tempfile.mkdtemp() |
|
|
|
|
|
if file_type.lower() == 'pdf': |
|
|
|
|
|
|
|
await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400)) |
|
await page.wait_for_timeout(random.randint(500, 1000)) |
|
|
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Method 1: Check page counter text |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// Method 2: Check actual page elements |
|
const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pageElements.length > 0) return pageElements.length; |
|
|
|
// Method 3: Look for page thumbnails |
|
const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); |
|
if (thumbnails.length > 0) return thumbnails.length; |
|
|
|
// Fallback: conservative guess |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated {estimated_pages} pages in PDF") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
logger.info("Scrolling page by page...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
prev_blob_count = 0 |
|
|
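                    # Keep scrolling until the count of rendered blob: page images stops
                    # growing or reaches the estimate; Drive lazy-loads pages on scroll.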
|
while attempt < max_attempts: |
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): |
|
logger.info("All pages appear to be loaded.") |
|
break |
|
|
|
|
|
if attempt % 3 == 0: |
|
await page.keyboard.press("End") |
|
else: |
|
await page.keyboard.press("PageDown") |
|
|
|
|
|
await page.wait_for_timeout(random.randint(1500, 3000)) |
|
|
|
|
|
if attempt % 4 == 0: |
|
await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) |
|
|
|
prev_blob_count = blob_count |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
download_promise = page.wait_for_event("download") |
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = Array.from(document.getElementsByTagName("img")) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${imgs.length} valid page images to add to PDF`); |
|
|
|
let added = 0; |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success', False): |
|
logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") |
|
|
|
|
|
logger.info("Trying fallback screenshot method...") |
|
|
|
|
|
await page.evaluate(""" |
|
() => { |
|
// Find and click the "first page" button if available |
|
const buttons = Array.from(document.querySelectorAll('button')); |
|
const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); |
|
if (firstPageBtn) firstPageBtn.click(); |
|
} |
|
""") |
|
                        await page.wait_for_timeout(1000)
|
|
|
|
|
screenshots = [] |
|
current_page = 1 |
|
max_pages = estimated_pages |
|
|
|
|
|
while current_page <= max_pages: |
|
screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") |
|
|
|
|
|
page_elem = await page.query_selector('.drive-viewer-paginated-page') |
|
if page_elem: |
|
await page_elem.screenshot(path=screenshot_path) |
|
else: |
|
|
|
await page.screenshot(path=screenshot_path) |
|
|
|
screenshots.append(screenshot_path) |
|
|
|
|
|
next_btn = await page.query_selector('button[aria-label="Next page"]') |
|
if next_btn: |
|
is_disabled = await next_btn.get_attribute('disabled') |
|
if is_disabled: |
|
logger.info(f"Reached end of document at page {current_page}") |
|
break |
|
|
|
await next_btn.click() |
|
await page.wait_for_timeout(1000) |
|
current_page += 1 |
|
else: |
|
break |
|
|
|
|
|
if screenshots: |
|
first_img = Image.open(screenshots[0]) |
|
width, height = first_img.size |
|
|
|
c = canvas.Canvas(save_path, pagesize=(width, height)) |
|
for screenshot in screenshots: |
|
img = Image.open(screenshot) |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
os.remove(screenshot) |
|
|
|
return save_path |
|
|
|
return None |
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
await download.save_as(save_path) |
|
|
|
|
|
try: |
|
os.rmdir(temp_dir) |
|
except: |
|
pass |
|
|
|
else: |
|
|
|
screenshot_path = os.path.join(temp_dir, "file.png") |
|
await page.screenshot(path=screenshot_path) |
|
|
|
if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: |
|
|
|
await self.export_google_doc(file_id, file_type, save_path) |
|
else: |
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
os.remove(screenshot_path) |
|
|
|
|
|
await browser.close() |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: |
|
logger.info(f"Successfully downloaded file to {save_path}") |
|
return save_path |
|
else: |
|
logger.error(f"Generated file is too small or missing: {save_path}") |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Error during force download: {e}") |
|
if browser: |
|
await browser.close() |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Force download preparation failed: {e}") |
|
return None |
|
|
|
async def download_from_google_drive(self, url, save_path): |
|
"""Enhanced method to download from Google Drive with multiple fallback approaches""" |
|
|
|
file_id = None |
|
url_patterns = [ |
|
r'drive\.google\.com/file/d/([^/]+)', |
|
r'drive\.google\.com/open\?id=([^&]+)', |
|
r'docs\.google\.com/\w+/d/([^/]+)', |
|
r'id=([^&]+)', |
|
r'drive\.google\.com/uc\?id=([^&]+)', |
|
] |
|
|
|
for pattern in url_patterns: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error(f"Could not extract file ID from URL: {url}") |
|
return False |
|
|
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") |
|
|
|
base, ext = os.path.splitext(save_path) |
|
if not ext and file_type: |
|
|
|
save_path = f"{base}.{file_type}" |
|
|
|
|
|
if is_view_only: |
|
|
|
if file_type == 'pdf': |
|
success = await self.download_viewonly_pdf_with_js(file_id, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: |
|
success = await self.export_google_doc(file_id, file_type, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type) |
|
if success: |
|
return True |
|
|
|
|
|
try: |
|
|
|
direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" |
|
|
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', |
|
'Accept-Language': 'en-US,en;q=0.9', |
|
'Referer': 'https://drive.google.com/', |
|
'DNT': '1' |
|
} |
|
|
|
|
|
with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: |
|
if r.status_code == 200: |
|
|
|
content_type = r.headers.get('Content-Type', '') |
|
                    if 'text/html' in content_type:  # HTML here means Drive returned an interstitial page, not the file
|
logger.warning("Received HTML instead of file, trying with session cookies") |
|
else: |
|
|
|
with open(save_path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
logger.info("Direct download successful") |
|
return True |
|
|
|
|
|
session = requests.Session() |
|
session.headers.update({'User-Agent': get_random_user_agent()}) |
|
|
|
|
|
session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30) |
|
|
|
|
|
url = f"https://drive.google.com/uc?id={file_id}&export=download" |
|
response = session.get(url, stream=True, timeout=30) |
|
|
|
|
|
confirmation_token = None |
|
for k, v in response.cookies.items(): |
|
if k.startswith('download_warning'): |
|
confirmation_token = v |
|
break |
|
|
|
|
|
if confirmation_token: |
|
url = f"{url}&confirm={confirmation_token}" |
|
response = session.get(url, stream=True, timeout=60) |
|
|
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if 'text/html' in content_type: |
|
logger.warning("Received HTML instead of file - likely download restriction") |
|
else: |
|
with open(save_path, 'wb') as f: |
|
for chunk in response.iter_content(chunk_size=1024*1024): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
with open(save_path, 'rb') as f: |
|
content = f.read(100) |
|
if b'<!DOCTYPE html>' not in content: |
|
logger.info("Successfully downloaded with requests session") |
|
return True |
|
except Exception as e: |
|
logger.warning(f"Requests session download failed: {e}") |
|
|
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
download_promise = page.wait_for_event("download") |
|
|
|
|
|
download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') |
|
if download_button: |
|
await download_button.click() |
|
|
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(save_path) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
except Exception as e: |
|
logger.error(f"Error during browser download: {e}") |
|
return False |
|
else: |
|
|
|
await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) |
|
|
|
|
|
download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') |
|
for elem in download_elements: |
|
try: |
|
await elem.click() |
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(save_path) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
except: |
|
pass |
|
except: |
|
continue |
|
except Exception as e: |
|
logger.error(f"Browser-based download attempt failed: {e}") |
|
|
|
logger.warning("All standard download methods failed") |
|
return False |
|
|
|
async def download_viewonly_pdf_with_js(self, file_id, save_path): |
|
"""Download view-only PDF using the enhanced blob image caching technique""" |
|
try: |
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-blink-features=AutomationControlled' |
|
] |
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=browser_args |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
accept_downloads=True, |
|
ignore_https_errors=True |
|
) |
|
|
|
|
|
await context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change plugins and languages to appear more human |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
} |
|
""") |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) |
|
await page.wait_for_timeout(random.randint(2000, 5000)) |
|
|
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Look for page counter in the interface |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// If we can't find a counter, check actual pages |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length > 0) return pages.length; |
|
|
|
// Default to a reasonable number if we can't determine |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated number of pages: {estimated_pages}") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
logger.info("Scrolling through document to load all pages...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
prev_blob_count = 0 |
|
consecutive_same_count = 0 |
|
|
|
while attempt < max_attempts: |
|
|
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
|
|
if blob_count >= estimated_pages: |
|
logger.info(f"All {estimated_pages} pages appear to be loaded.") |
|
break |
|
|
|
if blob_count == prev_blob_count: |
|
consecutive_same_count += 1 |
|
if consecutive_same_count >= 5 and blob_count > 0: |
|
logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") |
|
break |
|
else: |
|
consecutive_same_count = 0 |
|
|
|
|
|
scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) |
|
|
|
if scroll_action == "PageDown": |
|
await page.keyboard.press("PageDown") |
|
elif scroll_action == "End": |
|
await page.keyboard.press("End") |
|
elif scroll_action == "ArrowDown": |
|
|
|
for _ in range(random.randint(5, 15)): |
|
await page.keyboard.press("ArrowDown") |
|
await page.wait_for_timeout(random.randint(50, 150)) |
|
else: |
|
|
|
current_y = random.randint(300, 700) |
|
await page.mouse.move(x=random.randint(300, 800), y=current_y) |
|
await page.mouse.wheel(0, random.randint(300, 800)) |
|
|
|
|
|
await page.wait_for_timeout(random.randint(1000, 3000)) |
|
|
|
prev_blob_count = blob_count |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
# Start listening for the download before the in-page PDF generation triggers pdf.save()
download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = document.getElementsByTagName("img"); |
|
let validImages = []; |
|
|
|
// First collect all valid blob images |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
if (!/^blob:/.test(img.src)) continue; |
|
if (img.width < 100 || img.height < 100) continue; |
|
validImages.push(img); |
|
} |
|
|
|
// Sort by position in the document |
|
validImages.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${validImages.length} valid page images to add to PDF`); |
|
|
|
let added = 0; |
|
// Process each image as a page |
|
for (let i = 0; i < validImages.length; i++) { |
|
let img = validImages[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
// Use a reliable CDN |
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success'): |
|
logger.error(f"Error in PDF generation: {result.get('error')}") |
|
return False |
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
|
|
|
|
await download.save_as(save_path) |
|
logger.info(f"Successfully saved PDF to {save_path}") |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 |
|
|
|
finally: |
|
await browser.close() |
|
|
|
except Exception as e: |
|
logger.error(f"Error in viewonly PDF download process: {e}") |
|
return False |
|
|
|
async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): |
|
"""Download any view-only file by taking screenshots""" |
|
try: |
|
async with self.context.new_page() as page: |
|
|
|
await page.set_viewport_size({"width": 1600, "height": 1200}) |
|
|
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) |
|
|
|
|
|
await page.wait_for_load_state('networkidle') |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
base_dir = os.path.dirname(save_path) |
|
base_name = os.path.splitext(os.path.basename(save_path))[0] |
|
screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") |
|
os.makedirs(screenshots_dir, exist_ok=True) |
|
|
|
|
|
is_multi_page = await page.evaluate(""" |
|
() => { |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
return pages.length > 1; |
|
} |
|
""") |
|
|
|
if is_multi_page and file_type == 'pdf': |
|
|
|
page_count = await page.evaluate(""" |
|
async () => { |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
const container = document.querySelector('.drive-viewer-paginated-scrollable'); |
|
|
|
if (!container || pages.length === 0) return 0; |
|
|
|
// Scroll through to make sure all pages are loaded |
|
const scrollHeight = container.scrollHeight; |
|
const viewportHeight = container.clientHeight; |
|
const scrollStep = viewportHeight; |
|
|
|
for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { |
|
container.scrollTo(0, scrollPos); |
|
await delay(300); |
|
} |
|
|
|
// Scroll back to top |
|
container.scrollTo(0, 0); |
|
await delay(300); |
|
|
|
return pages.length; |
|
} |
|
""") |
|
|
|
logger.info(f"Found {page_count} pages in document") |
|
|
|
|
|
screenshots = [] |
|
for i in range(page_count): |
|
|
|
await page.evaluate(f""" |
|
async () => {{ |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length <= {i}) return false; |
|
|
|
pages[{i}].scrollIntoView(); |
|
await delay(500); |
|
return true; |
|
}} |
|
""") |
|
|
|
|
|
screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") |
|
await page.screenshot(path=screenshot_path, clip={ |
|
'x': 0, |
|
'y': 0, |
|
'width': 1600, |
|
'height': 1200 |
|
}) |
|
screenshots.append(screenshot_path) |
|
|
|
|
|
c = canvas.Canvas(save_path) |
|
for screenshot in screenshots: |
|
img = Image.open(screenshot) |
|
width, height = img.size |
|
|
|
|
|
c.setPageSize((width, height)) |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
|
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
os.remove(screenshot) |
|
os.rmdir(screenshots_dir) |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
|
|
screenshot_path = os.path.join(screenshots_dir, "screenshot.png") |
|
await page.screenshot(path=screenshot_path, full_page=True)
|
|
|
|
|
if file_type == 'pdf': |
|
|
|
img = Image.open(screenshot_path) |
|
width, height = img.size |
|
|
|
c = canvas.Canvas(save_path, pagesize=(width, height)) |
|
c.drawImage(screenshot_path, 0, 0, width, height) |
|
c.save() |
|
else: |
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
|
|
os.remove(screenshot_path) |
|
os.rmdir(screenshots_dir) |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
|
|
except Exception as e: |
|
logger.error(f"Error taking screenshots: {e}") |
|
return False |
|
|
|
async def export_google_doc(self, file_id, file_type, save_path): |
|
"""Export Google Docs/Sheets/Slides to downloadable formats""" |
|
try: |
|
|
|
export_formats = { |
|
'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
|
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
|
'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
|
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
|
'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', |
|
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', |
|
'pdf': 'application/pdf', |
|
} |
|
|
|
export_format = export_formats.get(file_type, 'application/pdf') |
|
export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" |
|
|
|
if 'sheet' in file_type or 'xlsx' in file_type: |
|
export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" |
|
elif 'ppt' in file_type or 'presentation' in file_type: |
|
export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" |
|
elif file_type == 'pdf': |
|
export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" |
|
|
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') |
|
|
|
|
|
response = await page.goto(export_url, wait_until='networkidle') |
|
|
|
if response.status == 200: |
|
content = await response.body() |
|
with open(save_path, 'wb') as f: |
|
f.write(content) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
logger.warning(f"Export failed with status {response.status}") |
|
return False |
|
|
|
except Exception as e: |
|
logger.error(f"Error exporting Google Doc: {e}") |
|
return False |
|
|
|
async def get_google_drive_file_info(self, file_id): |
|
"""Get file type and view-only status from Google Drive""" |
|
file_type = None |
|
is_view_only = False |
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
|
|
|
|
view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') |
|
is_view_only = view_only_text is not None |
|
|
|
|
|
gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') |
|
gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') |
|
gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') |
|
|
|
if gdocs_viewer: |
|
file_type = 'docx' |
|
elif gsheets_viewer: |
|
file_type = 'xlsx' |
|
elif gslides_viewer: |
|
file_type = 'pptx' |
|
else: |
|
|
|
pdf_viewer = await page.query_selector('embed[type="application/pdf"]') |
|
if pdf_viewer: |
|
file_type = 'pdf' |
|
else: |
|
|
|
img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') |
|
if img_viewer: |
|
|
|
img_src = await img_viewer.get_attribute('src') |
|
if 'jpg' in img_src or 'jpeg' in img_src: |
|
file_type = 'jpg' |
|
elif 'png' in img_src: |
|
file_type = 'png' |
|
else: |
|
file_type = 'jpg' |
|
else: |
|
|
|
file_type = 'pdf' |
|
|
|
|
|
if not file_type: |
|
title_element = await page.query_selector('div[role="heading"]') |
|
if title_element: |
|
title = await title_element.text_content() |
|
if title: |
|
ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) |
|
if ext_match: |
|
file_type = ext_match.group(1).lower() |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting Google Drive file info: {e}") |
|
file_type = 'pdf' |
|
|
|
return file_type, is_view_only |
|
|
|
|
|
async def get_sublinks(self, url, limit=10000): |
|
"""Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" |
|
links = set() |
|
try: |
|
logger.info(f"Fetching sublinks from: {url}") |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized exam site sublink extraction") |
|
edu_links = await self.get_edu_exam_links(url) |
|
for link in edu_links: |
|
links.add(link) |
|
|
|
|
|
if len(links) > 5: |
|
logger.info(f"Found {len(links)} sublinks with specialized method") |
|
return list(links)[:limit] |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
await self.page.evaluate(""" |
|
async () => { |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const height = document.body.scrollHeight; |
|
const step = Math.floor(window.innerHeight / 2); |
|
|
|
for (let i = 0; i < height; i += step) { |
|
window.scrollTo(0, i); |
|
await delay(150); |
|
} |
|
|
|
window.scrollTo(0, 0); |
|
} |
|
""") |
|
await self.page.wait_for_timeout(1000) |
|
|
|
|
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
|
|
if is_aspnet: |
|
logger.info("Detected ASP.NET page, using enhanced extraction method") |
|
|
|
|
|
|
|
dropdowns = await self.page.query_selector_all('select') |
|
buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') |
|
|
|
|
|
for dropdown in dropdowns: |
|
try: |
|
|
|
options = await self.page.evaluate(''' |
|
(dropdown) => { |
|
return Array.from(dropdown.options).map(o => o.value); |
|
} |
|
''', dropdown) |
|
|
|
|
|
for option in options: |
|
if option: |
|
await dropdown.select_option(value=option) |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with dropdown: {e}") |
|
|
|
|
|
safe_buttons = [] |
|
for button in buttons: |
|
button_text = await button.text_content() or "" |
|
button_value = await button.get_attribute("value") or "" |
|
button_id = await button.get_attribute("id") or "" |
|
combined_text = (button_text + button_value + button_id).lower() |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): |
|
continue |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): |
|
safe_buttons.append(button) |
|
|
|
|
|
for button in safe_buttons[:5]: |
|
try: |
|
await button.click() |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
|
|
|
|
grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') |
|
for cell in grid_cells: |
|
try: |
|
href = await cell.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error extracting grid link: {e}") |
|
|
|
|
|
postback_links = await self.page.evaluate(''' |
|
() => { |
|
const results = []; |
|
// Find elements with onclick containing __doPostBack |
|
const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); |
|
for (const el of elements) { |
|
// Extract the postback target |
|
const onclick = el.getAttribute('onclick') || ''; |
|
const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); |
|
if (match && match[1]) { |
|
// Get the visible text to use as description |
|
const text = el.innerText || el.textContent || 'Link'; |
|
results.push({ |
|
id: match[1], |
|
text: text.trim() |
|
}); |
|
} |
|
} |
|
return results; |
|
} |
|
''') |
|
|
|
|
|
for postback in postback_links[:10]: |
|
try: |
|
logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") |
|
await self.page.evaluate(f''' |
|
() => {{ |
|
if (typeof __doPostBack === 'function') {{ |
|
__doPostBack('{postback["id"]}', ''); |
|
}} |
|
}} |
|
''') |
|
await self.page.wait_for_timeout(1500) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error with postback: {e}") |
|
|
|
|
|
pagination_elements = await self.page.query_selector_all( |
|
'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' |
|
) |
|
|
|
|
|
for i in range(min(5, len(pagination_elements))): |
|
try: |
|
|
|
el = pagination_elements[i] |
|
el_text = await el.text_content() or "" |
|
|
|
|
|
if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip(): |
|
logger.info(f"Clicking pagination control: {el_text}") |
|
await el.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking pagination: {e}") |
|
|
|
|
|
hidden_links = await self.page.evaluate(""" |
|
() => { |
|
// Try to execute common JavaScript patterns that reveal hidden content |
|
try { |
|
// Common patterns used in websites to initially hide content |
|
const hiddenContainers = document.querySelectorAll( |
|
'.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' |
|
); |
|
|
|
// Attempt to make them visible |
|
hiddenContainers.forEach(el => { |
|
el.style.display = 'block'; |
|
el.style.visibility = 'visible'; |
|
el.classList.remove('hidden', 'hide'); |
|
}); |
|
|
|
// Return any newly visible links |
|
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); |
|
} catch (e) { |
|
return []; |
|
} |
|
} |
|
""") |
|
|
|
|
|
for href in hidden_links: |
|
if href and not href.startswith('javascript:'): |
|
links.add(href) |
|
|
|
logger.info(f"Found {len(links)} sublinks") |
|
return list(links)[:limit] |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting sublinks from {url}: {e}") |
|
return list(links)[:limit] |
|
|
|
async def extract_all_link_types(self, links_set, base_url, path_base): |
|
"""Extract all types of links from the current page""" |
|
|
|
a_links = await self.page.query_selector_all('a[href]') |
|
for a in a_links: |
|
try: |
|
href = await a.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
iframes = await self.page.query_selector_all('iframe[src]') |
|
for iframe in iframes: |
|
try: |
|
src = await iframe.get_attribute('src') |
|
if src and not src.startswith('javascript:') and not src.startswith('about:'): |
|
full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') |
|
for el in onclick_elements: |
|
try: |
|
onclick = await el.get_attribute('onclick') |
|
urls = re.findall(r'(https?://[^\'"]+)', onclick) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
|
|
data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') |
|
for el in data_elements: |
|
for attr in ['data-url', 'data-href', 'data-src']: |
|
try: |
|
value = await el.get_attribute(attr) |
|
if value and not value.startswith('javascript:'): |
|
full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') |
|
for anchor in special_anchors: |
|
try: |
|
href = await anchor.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') |
|
for script in script_elements: |
|
try: |
|
script_content = await script.text_content() |
|
if script_content: |
|
|
|
urls = re.findall(r'(https?://[^\'"]+)', script_content) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
def resolve_relative_url(self, relative_url, base_url, path_base): |
|
"""Properly resolve relative URLs considering multiple formats""" |
|
if relative_url.startswith('/'): |
|
|
|
return f"{base_url}{relative_url}" |
|
elif relative_url.startswith('./'): |
|
|
|
return f"{base_url}{path_base}/{relative_url[2:]}" |
|
elif relative_url.startswith('../'): |
|
|
|
parent_path = '/'.join(path_base.split('/')[:-1]) |
|
return f"{base_url}{parent_path}/{relative_url[3:]}" |
|
else: |
|
|
|
return f"{base_url}{path_base}/{relative_url}" |
|
|
|
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): |
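"""Crawl the main page and its sublinks, returning a de-duplicated list of downloadable files while reporting progress through Streamlit widgets."""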
|
if not custom_ext_list: |
|
custom_ext_list = [] |
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
file_count_text = st.empty() |
|
|
|
try: |
|
progress_text.text("Analyzing main page...") |
|
|
|
is_aspnet = False |
|
try: |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
except Exception: |
|
pass |
|
|
|
|
|
main_files = await self.extract_downloadable_files(url, custom_ext_list) |
|
initial_count = len(main_files) |
|
file_count_text.text(f"Found {initial_count} files on main page") |
|
|
|
|
|
progress_text.text("Getting sublinks...") |
|
sublinks = await self.get_sublinks(url, sublink_limit) |
|
total_links = len(sublinks) |
|
progress_text.text(f"Found {total_links} sublinks to process") |
|
|
|
|
|
all_files = main_files |
|
|
|
if not sublinks: |
|
progress_bar.progress(1.0) |
|
return all_files |
|
|
|
|
|
for i, sublink in enumerate(sublinks, 1): |
|
progress = i / total_links |
|
progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") |
|
progress_bar.progress(progress) |
|
|
|
try: |
|
|
|
sub_timeout = timeout * 2 if is_aspnet else timeout |
|
|
|
|
|
sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) |
|
all_files.extend(sub_files) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
except Exception as e: |
|
logger.warning(f"Error processing sublink {sublink}: {e}") |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in all_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
final_count = len(unique_files) |
|
progress_text.text(f"Deep search complete!") |
|
file_count_text.text(f"Found {final_count} unique files") |
|
progress_bar.progress(1.0) |
|
return unique_files |
|
|
|
except Exception as e: |
|
logger.error(f"Deep search error: {e}") |
|
progress_text.text(f"Error during deep search: {str(e)}") |
|
return [] |
|
|
|
finally: |
|
await asyncio.sleep(2) |
|
if not st.session_state.get('keep_progress', False): |
|
progress_text.empty() |
|
progress_bar.empty() |
|
|
|
|
|
def main(): |
|
st.title("Advanced File Downloader") |
|
|
|
|
|
if "playwright_installed" not in st.session_state: |
|
with st.spinner("Setting up browser automation. This may take a minute..."): |
|
install_playwright_dependencies() |
|
st.session_state.playwright_installed = True |
|
|
|
if "initialized" not in st.session_state: |
|
st.session_state.initialized = True |
|
st.session_state.discovered_files = [] |
|
st.session_state.current_url = None |
|
st.session_state.google_creds = None |
|
st.session_state.selected_files = [] |
|
st.session_state.do_deep_search = False |
|
st.session_state.deep_search_url = None |
|
st.session_state.search_results = [] |
|
|
|
with st.sidebar: |
|
mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select") |
|
with st.expander("Advanced Options", expanded=True): |
|
custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt") |
|
max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page") |
|
sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink") |
|
use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox") |
|
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input") |
|
use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox") |
|
|
|
with st.expander("Google Drive Integration", expanded=False): |
|
if st.button("Start Google Sign-In", key="google_signin_btn"): |
|
auth_url = get_google_auth_url() |
|
st.markdown(f"[Click here to authorize]({auth_url})") |
|
auth_code = st.text_input("Enter authorization code", key="auth_code_input") |
|
if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code: |
|
creds, msg = exchange_code_for_credentials(auth_code) |
|
st.session_state.google_creds = creds |
|
st.write(msg) |
|
|
|
with st.expander("Advanced Browser Settings", expanded=False): |
|
|
|
st.write("**Captcha Handling**") |
|
captcha_option = st.radio( |
|
"Captcha Detection:", |
|
["Auto-detect only", "Manual solve (shows captcha)"], |
|
index=0, |
|
key="captcha_option" |
|
) |
|
|
|
|
|
st.write("**Proxy Rotation**") |
|
enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation") |
|
if enable_rotation: |
|
PROXY_ROTATION_CONFIG["enabled"] = True |
|
proxy_list = st.text_area( |
|
"Proxy List (one per line)", |
|
placeholder="http://proxy1:port\nhttp://proxy2:port", |
|
key="proxy_list" |
|
) |
|
if proxy_list: |
|
PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()] |
|
rotation_interval = st.slider( |
|
"Rotation Interval (# of requests)", |
|
min_value=1, |
|
max_value=50, |
|
value=10, |
|
key="rotation_interval" |
|
) |
|
PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval |
|
|
|
if mode == "Manual URL": |
|
st.header("Manual URL Mode") |
|
url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input") |
|
col1, col2 = st.columns([3, 1]) |
|
with col1: |
|
if st.button("Deep Search", use_container_width=True, key="deep_search_btn"): |
|
if url: |
|
custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] |
|
valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] |
|
if custom_ext_list != valid_ext_list: |
|
st.warning("Invalid extensions ignored. Use format like '.csv'.") |
|
|
|
@st.cache_resource |
|
def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val): |
|
async def _run(): |
|
async with DownloadManager( |
|
use_proxy=use_proxy_val, |
|
proxy=proxy_val, |
|
use_stealth=use_stealth_val |
|
) as dm: |
|
files = await dm.deep_search(url, ext_list, max_links, timeout_val) |
|
return files |
|
return asyncio.run(_run()) |
|
|
|
with st.spinner("Searching for files..."): |
|
files = run_deep_search(url, valid_ext_list, max_sublinks, |
|
sublink_timeout, use_proxy, proxy, use_stealth) |
|
|
|
if files: |
|
st.session_state.discovered_files = files |
|
st.session_state.current_url = url |
|
st.success(f"Found {len(files)} files!") |
|
else: |
|
st.warning("No files found.") |
|
|
|
if st.session_state.discovered_files: |
|
files = st.session_state.discovered_files |
|
col1, col2 = st.columns([1, 4]) |
|
with col1: |
|
if st.button("Select All", key="select_all_btn"): |
|
st.session_state.selected_files = list(range(len(files))) |
|
if st.button("Clear Selection", key="clear_selection_btn"): |
|
st.session_state.selected_files = [] |
|
|
|
|
|
file_options = [] |
|
for i, file in enumerate(files): |
|
filename = file['filename'] |
|
size = file['size'] |
|
meta = file.get('metadata', {}) |
|
|
|
|
|
if meta and 'Pages' in meta: |
|
file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages" |
|
else: |
|
file_info = f"{filename} ({size})" |
|
|
|
file_options.append((i, file_info)) |
|
|
|
selected_indices = st.multiselect( |
|
"Select files to download", |
|
options=[i for i, _ in file_options], |
|
default=st.session_state.selected_files, |
|
format_func=lambda i: next(info for idx, info in file_options if idx == i), |
|
key="file_multiselect" |
|
) |
|
|
|
st.session_state.selected_files = selected_indices |
|
|
|
if selected_indices: |
|
col1, col2, col3, col4 = st.columns(4) |
|
with col1: |
|
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input") |
|
with col2: |
|
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox") |
|
with col3: |
|
delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox") |
|
with col4: |
|
upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox") |
|
|
|
if st.button("Download Selected", key="download_btn"): |
|
if not os.path.exists(download_dir): |
|
os.makedirs(download_dir) |
|
|
|
async def download_files(): |
|
downloaded_paths = [] |
|
progress_bar = st.progress(0) |
|
status_text = st.empty() |
|
|
|
async with DownloadManager( |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
use_stealth=use_stealth |
|
) as dm: |
|
for i, idx in enumerate(selected_indices): |
|
progress = (i + 1) / len(selected_indices) |
|
file_info = files[idx] |
|
status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})") |
|
progress_bar.progress(progress) |
|
|
|
path = await dm.download_file(file_info, download_dir, url) |
|
if path: |
|
downloaded_paths.append(path) |
|
|
|
status_text.empty() |
|
progress_bar.empty() |
|
return downloaded_paths |
|
|
|
with st.spinner("Downloading files..."): |
|
downloaded = asyncio.run(download_files()) |
|
|
|
if downloaded: |
|
st.success(f"Successfully downloaded {len(downloaded)} files") |
|
|
|
if create_zip: |
|
zip_path = create_zip_file(downloaded, download_dir) |
|
st.success(f"Created ZIP file: {zip_path}") |
|
|
|
|
|
with open(zip_path, "rb") as f: |
|
zip_data = f.read() |
|
|
|
st.download_button( |
|
label="Download ZIP", |
|
data=zip_data, |
|
file_name=os.path.basename(zip_path), |
|
mime="application/zip", |
|
key="download_zip_btn" |
|
) |
|
|
|
|
|
if upload_to_drive and st.session_state.google_creds: |
|
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds) |
|
folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}") |
|
drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id) |
|
if not isinstance(drive_id, str) or not drive_id.startswith("Error"): |
|
st.success(f"Uploaded to Google Drive. File ID: {drive_id}") |
|
else: |
|
st.error(drive_id) |
|
|
|
|
|
if delete_after: |
|
for path in downloaded: |
|
try: |
|
os.remove(path) |
|
except Exception as e: |
|
st.warning(f"Could not delete {path}: {e}") |
|
st.info("Deleted original files after ZIP creation") |
|
else: |
|
|
|
st.write("Download files individually:") |
|
for path in downloaded: |
|
with open(path, "rb") as f: |
|
file_data = f.read() |
|
|
|
file_name = os.path.basename(path) |
|
mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream" |
|
|
|
st.download_button( |
|
label=f"Download {file_name}", |
|
data=file_data, |
|
file_name=file_name, |
|
mime=mime_type, |
|
key=f"download_file_{path}" |
|
) |
|
|
|
elif mode == "Bing Search": |
|
st.header("Bing Search Mode") |
|
query = st.text_input("Enter search query", key="search_query_input") |
|
num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider") |
|
|
|
if st.button("Search", key="search_btn"): |
|
if query: |
|
async def run_search(): |
|
async with DownloadManager( |
|
use_proxy=use_proxy, |
|
proxy=proxy, |
|
query=query, |
|
num_results=num_results, |
|
use_stealth=use_stealth |
|
) as dm: |
|
with st.spinner("Searching..."): |
|
urls = await dm.search_bing() |
|
if urls: |
|
st.session_state.search_results = urls |
|
st.success(f"Found {len(urls)} results!") |
|
|
|
|
|
for i, url in enumerate(urls, 1): |
|
with st.expander(f"Result {i}: {url}", expanded=(i == 1)): |
|
if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"): |
|
st.session_state.deep_search_url = url |
|
st.session_state.do_deep_search = True |
|
else: |
|
st.warning("No search results found.") |
|
|
|
asyncio.run(run_search()) |
|
|
|
|
|
if st.session_state.do_deep_search and st.session_state.deep_search_url: |
|
url = st.session_state.deep_search_url |
|
st.info(f"Deep searching: {url}") |
|
|
|
|
|
st.session_state.do_deep_search = False |
|
|
|
|
|
custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] |
|
valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] |
|
|
|
@st.cache_resource |
|
def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val): |
|
async def _run(): |
|
async with DownloadManager( |
|
use_proxy=use_proxy_val, |
|
proxy=proxy_val, |
|
use_stealth=use_stealth_val |
|
) as dm: |
|
files = await dm.deep_search(url, ext_list, max_links, timeout_val) |
|
return files |
|
return asyncio.run(_run()) |
|
|
|
with st.spinner("Searching for files..."): |
|
files = run_deep_search(url, valid_ext_list, max_sublinks, |
|
sublink_timeout, use_proxy, proxy, use_stealth) |
|
|
|
if files: |
|
st.session_state.discovered_files = files |
|
st.session_state.current_url = url |
|
st.success(f"Found {len(files)} files!") |
|
else: |
|
st.warning("No files found.") |
|
|
|
|
|
st.markdown("---") |
|
with st.expander("Download View-Only Google Drive Document", expanded=False): |
|
st.write("Download protected/view-only Google Drive documents - just enter the file ID") |
|
file_id = st.text_input("Google Drive File ID", |
|
placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku", |
|
help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')") |
|
|
|
if st.button("Download Document") and file_id: |
|
download_dir = "./downloads" |
|
os.makedirs(download_dir, exist_ok=True) |
|
output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf") |
|
|
|
with st.spinner("Downloading view-only document... (this may take a minute)"): |
|
async def download_viewonly(): |
|
async with DownloadManager(use_stealth=use_stealth) as dm: |
|
file_info = { |
|
'url': f"https://drive.google.com/file/d/{file_id}/view", |
|
'filename': f"gdrive_{file_id}.pdf", |
|
'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True} |
|
} |
|
result_path = await dm.force_download_viewonly(file_info, output_path) |
|
return result_path |
|
|
|
result = asyncio.run(download_viewonly()) |
|
|
|
if result: |
|
st.success("Document downloaded successfully!") |
|
|
|
|
|
with open(result, "rb") as f: |
|
file_bytes = f.read() |
|
|
|
st.download_button( |
|
label="Download PDF", |
|
data=file_bytes, |
|
file_name=f"gdrive_{file_id}.pdf", |
|
mime="application/pdf" |
|
) |
|
else: |
|
st.error("Failed to download the document. Please check the file ID and try again.") |
|
|
|
|
|
st.markdown('---') |
|
st.markdown('Created by [Euler314](https://github.com/euler314)') |
|
|
|
if __name__ == "__main__": |
|
main() |