|
import os |
|
import re |
|
import random |
|
import asyncio |
|
import logging |
|
import traceback |
|
import tempfile |
|
import shutil |
|
import json |
|
import time |
|
from urllib.parse import urlparse, urljoin, unquote, parse_qs |
|
from io import BytesIO |
|
from bs4 import BeautifulSoup |
|
import PyPDF2 |
|
import requests |
|
from PIL import Image |
|
from reportlab.lib.pagesizes import letter |
|
from reportlab.pdfgen import canvas |
|
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError |
|
|
|
from app.utils import ( |
|
get_random_user_agent, sizeof_fmt, get_domain, is_download_link, |
|
normalize_download_url, detect_captcha, USER_AGENTS, STEALTH_SETTINGS, |
|
PROXY_ROTATION_CONFIG |
|
) |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
class DownloadManager: |
|
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): |
|
self.use_proxy = use_proxy |
|
self.proxy = proxy |
|
self.query = query |
|
self.num_results = num_results |
|
self.playwright = None |
|
self.browser = None |
|
self.context = None |
|
self.page = None |
|
self.use_stealth = use_stealth |
|
self.proxy_rotation = proxy_rotation |
|
self.request_count = 0 |
|
self.captcha_detected = False |
|
self.download_timeout = 300 |
|
|
|
self.visited_urls = set() |
|
|
|
self.downloaded_files = set() |
|
|
|
async def __aenter__(self): |
|
self.playwright = await async_playwright().start() |
|
|
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-gpu', |
|
'--no-zygote', |
|
'--single-process', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins', |
|
'--disable-site-isolation-trials' |
|
] |
|
|
|
|
|
if self.use_stealth: |
|
browser_args.extend([ |
|
'--disable-blink-features=AutomationControlled', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-webgl', |
|
'--disable-webrtc' |
|
]) |
|
|
|
|
|
opts = { |
|
"headless": True, |
|
"args": browser_args |
|
} |
|
|
|
|
|
if self.use_proxy and self.proxy: |
|
opts["proxy"] = {"server": self.proxy} |
|
|
|
|
|
self.browser = await self.playwright.chromium.launch(**opts) |
|
|
|
|
|
context_opts = { |
|
"user_agent": get_random_user_agent(), |
|
"viewport": {"width": 1920, "height": 1080}, |
|
"device_scale_factor": 1, |
|
"has_touch": False, |
|
"is_mobile": False, |
|
"ignore_https_errors": True, |
|
"accept_downloads": True |
|
} |
|
|
|
|
|
if self.use_stealth: |
|
|
|
context_opts["bypass_csp"] = True |
|
self.context = await self.browser.new_context(**context_opts) |
|
|
|
|
|
await self.context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change navigator properties |
|
const newProto = navigator.__proto__; |
|
delete newProto.webdriver; |
|
|
|
// Overwrite the plugins |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
// Handle languages more naturally |
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
|
|
// Modify hardware concurrency |
|
Object.defineProperty(navigator, 'hardwareConcurrency', { |
|
get: () => 4 |
|
}); |
|
|
|
// Modify deviceMemory |
|
Object.defineProperty(navigator, 'deviceMemory', { |
|
get: () => 8 |
|
}); |
|
|
|
// WebGL modifications |
|
const getParameter = WebGLRenderingContext.prototype.getParameter; |
|
WebGLRenderingContext.prototype.getParameter = function(parameter) { |
|
if (parameter === 37445) { |
|
return 'Intel Inc.'; |
|
} |
|
if (parameter === 37446) { |
|
return 'Intel Iris OpenGL Engine'; |
|
} |
|
return getParameter.apply(this, arguments); |
|
}; |
|
} |
|
""") |
|
else: |
|
|
|
self.context = await self.browser.new_context(**context_opts) |
|
|
|
|
|
self.page = await self.context.new_page() |
|
await self.page.set_extra_http_headers({ |
|
'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', |
|
'Cache-Control': 'max-age=0', |
|
'DNT': '1', |
|
'Referer': 'https://www.google.com/', |
|
'Sec-Fetch-Dest': 'document', |
|
'Sec-Fetch-Mode': 'navigate', |
|
'Sec-Fetch-Site': 'cross-site', |
|
'Sec-Fetch-User': '?1', |
|
'Upgrade-Insecure-Requests': '1' |
|
}) |
|
|
|
|
|
if self.use_stealth: |
|
await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) |
|
await self.page.wait_for_timeout(random.randint(200, 500)) |
|
|
|
return self |
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb): |
|
if self.browser: |
|
await self.browser.close() |
|
if self.playwright: |
|
await self.playwright.stop() |
|
|
|
async def rotate_proxy_if_needed(self): |
|
"""Rotate proxy if proxy rotation is enabled and threshold is reached""" |
|
if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: |
|
self.request_count += 1 |
|
if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: |
|
|
|
next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) |
|
PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) |
|
|
|
|
|
if self.context: |
|
await self.context.close() |
|
|
|
|
|
context_opts = { |
|
"user_agent": get_random_user_agent(), |
|
"proxy": {"server": next_proxy}, |
|
"accept_downloads": True |
|
} |
|
self.context = await self.browser.new_context(**context_opts) |
|
self.page = await self.context.new_page() |
|
|
|
|
|
self.request_count = 0 |
|
logger.info(f"Rotated to new proxy: {next_proxy}") |
|
|
|
async def handle_captcha(self, page): |
|
"""Detect and handle captchas if possible""" |
|
|
|
content = await page.content() |
|
if detect_captcha(content): |
|
self.captcha_detected = True |
|
logger.warning("Captcha detected on page") |
|
|
|
|
|
|
|
captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') |
|
if captcha_img: |
|
logger.info("Found captcha image, attempting to capture") |
|
|
|
|
|
captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") |
|
await captcha_img.screenshot(path=captcha_path) |
|
|
|
|
|
|
|
logger.info(f"Captcha image saved to {captcha_path}") |
|
|
|
|
|
return False |
|
|
|
|
|
recaptcha = await page.query_selector('iframe[src*="recaptcha"]') |
|
if recaptcha: |
|
logger.warning("reCAPTCHA detected, would require external solving service") |
|
return False |
|
|
|
|
|
await self.perform_human_actions(page) |
|
|
|
|
|
content = await page.content() |
|
if detect_captcha(content): |
|
logger.warning("Captcha still present after human-like actions") |
|
return False |
|
else: |
|
logger.info("Captcha appears to be resolved") |
|
return True |
|
|
|
return True |
|
|
|
async def perform_human_actions(self, page): |
|
"""Perform human-like actions on the page to possibly bypass simple bot checks""" |
|
try: |
|
|
|
for i in range(3): |
|
await page.evaluate(f"window.scrollTo(0, {i * 300})") |
|
await page.wait_for_timeout(random.randint(300, 700)) |
|
|
|
|
|
for _ in range(3): |
|
x = random.randint(100, 800) |
|
y = random.randint(100, 600) |
|
await page.mouse.move(x=x, y=y) |
|
await page.wait_for_timeout(random.randint(200, 500)) |
|
|
|
|
|
try: |
|
await page.click("body", position={"x": 50, "y": 50}) |
|
except: |
|
pass |
|
|
|
|
|
await page.wait_for_timeout(1000) |
|
|
|
except Exception as e: |
|
logger.warning(f"Error during human-like actions: {e}") |
|
|
|
async def search_bing(self): |
|
urls = [] |
|
try: |
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
search_url = f"https://www.bing.com/search?q={self.query}" |
|
await self.page.goto(search_url, timeout=30000) |
|
await self.page.wait_for_load_state('networkidle') |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected during search, results may be limited") |
|
|
|
|
|
for i in range(3): |
|
await self.page.evaluate(f"window.scrollTo(0, {i * 400})") |
|
await self.page.wait_for_timeout(random.randint(300, 800)) |
|
|
|
|
|
links = await self.page.query_selector_all("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = await link.get_attribute('href') |
|
if href: |
|
urls.append(href) |
|
|
|
|
|
if len(urls) < self.num_results: |
|
alt_links = await self.page.query_selector_all(".b_caption a") |
|
for link in alt_links: |
|
href = await link.get_attribute('href') |
|
if href and href not in urls: |
|
urls.append(href) |
|
if len(urls) >= self.num_results: |
|
break |
|
|
|
return urls |
|
except Exception as e: |
|
logger.error(f"Error searching Bing: {e}") |
|
return [] |
|
|
|
async def get_file_size(self, url): |
|
try: |
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
if '?' in url or 'Action=downloadfile' in url or 'fname=' in url: |
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Range': 'bytes=0-0' |
|
} |
|
|
|
try: |
|
with requests.get(url, headers=headers, stream=True, timeout=10) as r: |
|
if 'Content-Range' in r.headers: |
|
content_range = r.headers['Content-Range'] |
|
match = re.search(r'bytes 0-0/(\d+)', content_range) |
|
if match: |
|
size = int(match.group(1)) |
|
return sizeof_fmt(size) |
|
|
|
if 'Content-Length' in r.headers: |
|
size = int(r.headers['Content-Length']) |
|
|
|
if size > 1: |
|
return sizeof_fmt(size) |
|
except Exception as e: |
|
logger.warning(f"Error getting file size with Range request: {e}") |
|
|
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
response = await page.request.head(url, timeout=15000) |
|
length = response.headers.get('Content-Length', None) |
|
if length: |
|
return sizeof_fmt(int(length)) |
|
except Exception as e: |
|
logger.warning(f"Error getting file size with browser: {e}") |
|
|
|
return "Unknown Size" |
|
else: |
|
|
|
async with self.context.new_page() as page: |
|
response = await page.request.head(url, timeout=15000) |
|
length = response.headers.get('Content-Length', None) |
|
if length: |
|
return sizeof_fmt(int(length)) |
|
else: |
|
return "Unknown Size" |
|
except Exception as e: |
|
logger.warning(f"Error getting file size: {e}") |
|
return "Unknown Size" |
|
|
|
async def get_pdf_metadata(self, url): |
|
try: |
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
resp = await page.request.get(url, timeout=15000) |
|
if resp.ok: |
|
content = await resp.body() |
|
pdf = BytesIO(content) |
|
reader = PyPDF2.PdfReader(pdf) |
|
return { |
|
'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', |
|
'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', |
|
'Pages': len(reader.pages), |
|
} |
|
else: |
|
return {} |
|
except Exception as e: |
|
logger.warning(f"Error reading PDF metadata: {e}") |
|
return {} |
|
|
|
async def extract_real_download_url(self, url): |
|
"""Enhanced method to extract real download URL, handling complex URLs""" |
|
try: |
|
|
|
if 'Action=downloadfile' in url or 'fname=' in url: |
|
logger.info(f"Complex download URL detected: {url}") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
|
|
await page.route('**', lambda route: route.continue_()) |
|
|
|
|
|
responses = [] |
|
page.on('response', lambda response: responses.append(response)) |
|
|
|
try: |
|
|
|
await page.goto(url, wait_until='networkidle', timeout=30000) |
|
|
|
|
|
for response in responses: |
|
|
|
content_disposition = response.headers.get('Content-Disposition', '') |
|
if 'attachment' in content_disposition or 'filename=' in content_disposition: |
|
return response.url |
|
|
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if content_type and content_type != 'text/html' and not content_type.startswith('text/'): |
|
return response.url |
|
|
|
|
|
return page.url |
|
except Exception as e: |
|
logger.warning(f"Error extracting real download URL: {e}") |
|
return url |
|
else: |
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
response = await page.goto(url, wait_until='networkidle', timeout=30000) |
|
if response and response.headers.get('location'): |
|
return response.headers['location'] |
|
return page.url |
|
except Exception as e: |
|
logger.error(f"Error extracting real download URL: {e}") |
|
return url |
|
|
|
|
|
async def get_edu_exam_links(self, url): |
|
"""Specialized method for educational exam websites that follows a common pattern.""" |
|
try: |
|
logger.info(f"Fetching exam links from {url}") |
|
links = set() |
|
|
|
|
|
headers = { |
|
"User-Agent": get_random_user_agent(), |
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", |
|
"Accept-Language": "en-US,en;q=0.9", |
|
"Referer": "https://www.google.com/", |
|
"DNT": "1" |
|
} |
|
|
|
try: |
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
|
if response.status_code == 200: |
|
|
|
soup = BeautifulSoup(response.text, "html.parser") |
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all("a", href=True): |
|
href = a["href"] |
|
full_url = urljoin(url, href) |
|
|
|
|
|
link_text = a.get_text().lower() |
|
|
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_", |
|
"/resource/", "/subject/", "/course/", "/material/" |
|
] |
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"subject", "course", "resource", "pdf", "document", |
|
"view", "open", "get", "solution", "answer" |
|
] |
|
|
|
|
|
if any(pattern in full_url.lower() for pattern in url_patterns): |
|
links.add(full_url) |
|
continue |
|
|
|
|
|
if any(pattern in link_text for pattern in text_patterns): |
|
links.add(full_url) |
|
continue |
|
|
|
|
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
|
|
|
|
if "Action=downloadfile" in url or "fname=" in url: |
|
links.add(url) |
|
except Exception as e: |
|
logger.warning(f"Request-based extraction failed: {e}") |
|
|
|
|
|
try: |
|
|
|
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: |
|
logger.info("Using browser for enhanced link extraction") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
await self.page.goto(url, timeout=45000, wait_until='networkidle') |
|
await self.page.wait_for_timeout(random.randint(1000, 2000)) |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected, extraction may be limited") |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
page_height = await self.page.evaluate("document.body.scrollHeight") |
|
viewport_height = await self.page.evaluate("window.innerHeight") |
|
|
|
for scroll_pos in range(0, page_height, viewport_height // 2): |
|
await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") |
|
await self.page.wait_for_timeout(random.randint(300, 800)) |
|
|
|
|
|
await self.page.evaluate("window.scrollTo(0, 0)") |
|
await self.page.wait_for_timeout(500) |
|
|
|
|
|
all_links = await self.page.evaluate(""" |
|
() => { |
|
const results = []; |
|
|
|
// Get all anchor tags |
|
const anchors = document.querySelectorAll('a[href]'); |
|
for (const a of anchors) { |
|
if (a.href) { |
|
results.push({ |
|
href: a.href, |
|
text: a.innerText || a.textContent || '', |
|
isButton: a.classList.contains('btn') || a.role === 'button' |
|
}); |
|
} |
|
} |
|
|
|
// Get buttons that might contain links |
|
const buttons = document.querySelectorAll('button'); |
|
for (const btn of buttons) { |
|
const onclick = btn.getAttribute('onclick') || ''; |
|
if (onclick.includes('window.location') || onclick.includes('download')) { |
|
results.push({ |
|
href: '#button', |
|
text: btn.innerText || btn.textContent || '', |
|
isButton: true, |
|
onclick: onclick |
|
}); |
|
} |
|
} |
|
|
|
return results; |
|
} |
|
""") |
|
|
|
|
|
for link_info in all_links: |
|
href = link_info.get('href', '') |
|
text = link_info.get('text', '').lower() |
|
|
|
if href and href != '#button': |
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_" |
|
] |
|
|
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"pdf", "document", "view", "open", "solution" |
|
] |
|
|
|
if any(pattern in href.lower() for pattern in url_patterns) or \ |
|
any(pattern in text for pattern in text_patterns) or \ |
|
any(href.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(href) |
|
|
|
|
|
download_links = await self.page.evaluate(""" |
|
() => { |
|
// Find all links that might be download links |
|
const links = Array.from(document.querySelectorAll('a[href]')); |
|
return links |
|
.filter(a => { |
|
const href = a.href.toLowerCase(); |
|
return href.includes('download') || |
|
href.includes('getfile') || |
|
href.includes('view.php') || |
|
href.includes('action=downloadfile') || |
|
href.includes('fname='); |
|
}) |
|
.map(a => a.href); |
|
} |
|
""") |
|
|
|
for dl_link in download_links: |
|
links.add(dl_link) |
|
|
|
|
|
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') |
|
for grid in grid_elements: |
|
grid_links = await grid.query_selector_all('a[href]') |
|
for a in grid_links: |
|
href = await a.get_attribute('href') |
|
text = await a.text_content() |
|
|
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
links.add(full_url) |
|
|
|
|
|
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') |
|
for i, button in enumerate(pagination_buttons[:5]): |
|
try: |
|
|
|
button_text = await button.text_content() |
|
if button_text and button_text.strip().isdigit(): |
|
logger.info(f"Clicking pagination button: {button_text}") |
|
await button.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_page_links = await self.page.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); |
|
} |
|
""") |
|
|
|
for href in new_page_links: |
|
if href and not href.startswith('javascript:'): |
|
if any(pattern in href.lower() for pattern in url_patterns) or \ |
|
any(href.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(href) |
|
except Exception as e: |
|
logger.warning(f"Error clicking pagination button: {e}") |
|
|
|
|
|
show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') |
|
for button in show_buttons: |
|
button_text = (await button.text_content() or "").lower() |
|
button_value = (await button.get_attribute("value") or "").lower() |
|
button_id = (await button.get_attribute("id") or "").lower() |
|
|
|
|
|
promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", |
|
"download", "resource", "material", "browse", "file"] |
|
|
|
if any(term in button_text or term in button_value or term in button_id |
|
for term in promising_terms): |
|
try: |
|
logger.info(f"Clicking button: {button_text or button_value}") |
|
await button.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
|
|
|
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ |
|
any(pattern in full_url.lower() for pattern in url_patterns): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
try: |
|
|
|
postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') |
|
for i, element in enumerate(postback_elements[:10]): |
|
try: |
|
onclick = await element.get_attribute('onclick') |
|
if onclick and '__doPostBack' in onclick: |
|
element_text = await element.text_content() |
|
|
|
|
|
promising_terms = ["show", "view", "list", "exam", "paper", "test", |
|
"download", "resource", "material"] |
|
|
|
if any(term in element_text.lower() for term in promising_terms): |
|
logger.info(f"Clicking ASP.NET postback element: {element_text}") |
|
|
|
|
|
await element.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with postback element: {e}") |
|
except Exception as e: |
|
logger.warning(f"Error during postback handling: {e}") |
|
|
|
except Exception as e: |
|
logger.error(f"Browser-based extraction failed: {e}") |
|
|
|
|
|
filtered_links = [] |
|
for link in links: |
|
|
|
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if any(pattern in link.lower() for pattern in [ |
|
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", |
|
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", |
|
"/resource/", "/material/", "/notes/", "/subjectmaterial/" |
|
]): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if is_download_link(link): |
|
filtered_links.append(link) |
|
|
|
logger.info(f"Found {len(filtered_links)} potential exam document links") |
|
return filtered_links |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting exam links: {e}") |
|
return [] |
|
|
|
async def discover_hidden_links(self, page): |
|
"""Discover hidden links that might be in JavaScript, iframes, or dynamic content""" |
|
hidden_links = set() |
|
|
|
|
|
js_links = await page.evaluate(""" |
|
() => { |
|
const links = new Set(); |
|
|
|
// Extract URLs from script tags |
|
const scripts = document.querySelectorAll('script'); |
|
for (const script of scripts) { |
|
const content = script.textContent || ''; |
|
const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; |
|
for (let match of urlMatches) { |
|
links.add(match.replace(/["']/g, '')); |
|
} |
|
} |
|
|
|
// Look for download-related variables in scripts |
|
for (const script of scripts) { |
|
const content = script.textContent || ''; |
|
// Look for common patterns for file URLs in JavaScript |
|
if (content.includes('downloadURL') || content.includes('fileURL') || |
|
content.includes('pdfURL') || content.includes('documentURL')) { |
|
|
|
// Extract potential URLs |
|
const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; |
|
for (let match of potentialUrls) { |
|
const url = match.replace(/["']/g, ''); |
|
// Try to resolve relative URLs |
|
if (url.startsWith('/') || !url.includes('://')) { |
|
if (url.startsWith('/')) { |
|
links.add(window.location.origin + url); |
|
} else { |
|
// Handle relative paths more carefully |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + url); |
|
} |
|
} else if (url.startsWith('http')) { |
|
links.add(url); |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Check for links in data attributes |
|
const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); |
|
for (const el of elements) { |
|
for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { |
|
const val = el.getAttribute(attr); |
|
if (val) { |
|
// Try to resolve relative URLs |
|
if (val.startsWith('/')) { |
|
links.add(window.location.origin + val); |
|
} else if (val.startsWith('http')) { |
|
links.add(val); |
|
} else if (!val.startsWith('javascript:') && !val.startsWith('#')) { |
|
// Handle relative paths |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + val); |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Look for URLs in inline event handlers |
|
const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); |
|
for (const el of clickableElements) { |
|
for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { |
|
const val = el.getAttribute(attr); |
|
if (val) { |
|
// Check for JavaScript URLs with window.location |
|
if (val.includes('window.location') || val.includes('document.location')) { |
|
const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); |
|
if (urlMatch && urlMatch[1]) { |
|
const url = urlMatch[1]; |
|
if (url.startsWith('/')) { |
|
links.add(window.location.origin + url); |
|
} else if (url.startsWith('http')) { |
|
links.add(url); |
|
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) { |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + url); |
|
} |
|
} |
|
} |
|
|
|
// Check for direct URLs in attributes |
|
const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; |
|
for (let match of urlMatches) { |
|
links.add(match.replace(/["']/g, '')); |
|
} |
|
|
|
// Check for download.php and similar patterns |
|
if (val.includes('download.php') || val.includes('getfile.php') || |
|
val.includes('Action=downloadfile') || val.includes('viewfile.php')) { |
|
|
|
// Handle both onclick handlers and direct hrefs |
|
let url = ''; |
|
if (attr === 'href') { |
|
url = val; |
|
} else { |
|
// Extract URL from JavaScript |
|
const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); |
|
if (jsUrlMatch) { |
|
url = jsUrlMatch[1]; |
|
} |
|
} |
|
|
|
// Resolve URL if needed |
|
if (url) { |
|
if (url.startsWith('/')) { |
|
links.add(window.location.origin + url); |
|
} else if (url.startsWith('http')) { |
|
links.add(url); |
|
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) { |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + url); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Find PHP/ASP file download links |
|
const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); |
|
for (const link of fileLinks) { |
|
links.add(link.href); |
|
} |
|
|
|
return Array.from(links); |
|
} |
|
""") |
|
|
|
for link in js_links: |
|
hidden_links.add(link) |
|
|
|
|
|
iframes = await page.query_selector_all('iframe') |
|
for iframe in iframes: |
|
try: |
|
frame = await iframe.content_frame() |
|
if frame: |
|
iframe_links = await frame.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')) |
|
.map(a => a.href) |
|
.filter(href => href.startsWith('http')); |
|
} |
|
""") |
|
for link in iframe_links: |
|
hidden_links.add(link) |
|
except Exception as e: |
|
logger.warning(f"Could not extract links from iframe: {e}") |
|
|
|
|
|
shadow_links = await page.evaluate(""" |
|
() => { |
|
const links = new Set(); |
|
|
|
// Helper function to recursively process shadow roots |
|
function processShadowRoot(root) { |
|
if (!root) return; |
|
|
|
// Get links in this shadow root |
|
const shadowLinks = root.querySelectorAll('a[href]'); |
|
for (const link of shadowLinks) { |
|
if (link.href && link.href.startsWith('http')) { |
|
links.add(link.href); |
|
} |
|
} |
|
|
|
// Process nested shadow roots |
|
const elements = root.querySelectorAll('*'); |
|
for (const el of elements) { |
|
if (el.shadowRoot) { |
|
processShadowRoot(el.shadowRoot); |
|
} |
|
} |
|
} |
|
|
|
// Find all shadow roots in the document |
|
const elements = document.querySelectorAll('*'); |
|
for (const el of elements) { |
|
if (el.shadowRoot) { |
|
processShadowRoot(el.shadowRoot); |
|
} |
|
} |
|
|
|
return Array.from(links); |
|
} |
|
""") |
|
|
|
for link in shadow_links: |
|
hidden_links.add(link) |
|
|
|
|
|
form_links = await page.evaluate(""" |
|
() => { |
|
const links = new Set(); |
|
|
|
// Check for form actions that might be download endpoints |
|
const forms = document.querySelectorAll('form'); |
|
for (const form of forms) { |
|
const action = form.action || ''; |
|
if (action && ( |
|
action.includes('download') || |
|
action.includes('getfile') || |
|
action.includes('viewfile') || |
|
action.includes('Action=downloadfile') |
|
)) { |
|
// Collect input values that might be needed for the download |
|
const inputs = {}; |
|
const formInputs = form.querySelectorAll('input[name]'); |
|
for (const input of formInputs) { |
|
inputs[input.name] = input.value; |
|
} |
|
|
|
// Store both the form action and any important inputs |
|
links.add(action); |
|
} |
|
} |
|
|
|
return Array.from(links); |
|
} |
|
""") |
|
|
|
for link in form_links: |
|
hidden_links.add(link) |
|
|
|
return hidden_links |
|
|
|
async def extract_downloadable_files(self, url, custom_ext_list): |
|
found_files = [] |
|
try: |
|
|
|
normalized_url = normalize_download_url(url) |
|
|
|
|
|
if normalized_url in self.visited_urls: |
|
logger.info(f"Skipping already visited URL: {normalized_url}") |
|
return [] |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
if is_download_link(normalized_url): |
|
logger.info(f"Processing potential direct download link: {normalized_url}") |
|
|
|
|
|
real_url = await self.extract_real_download_url(normalized_url) |
|
|
|
|
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): |
|
|
|
params = parse_qs(urlparse(normalized_url).query) |
|
|
|
|
|
for param in ['file', 'filename', 'name', 'fname', 'f']: |
|
if param in params and params[param]: |
|
potential_filename = params[param][0] |
|
if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: |
|
filename = os.path.basename(potential_filename) |
|
break |
|
|
|
|
|
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): |
|
domain = get_domain(real_url) |
|
|
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: |
|
if common_ext in normalized_url.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': {}, |
|
'download_url': normalized_url |
|
}) |
|
|
|
|
|
if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): |
|
return found_files |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized handler for educational exam site") |
|
|
|
|
|
exam_links = await self.get_edu_exam_links(url) |
|
|
|
for link in exam_links: |
|
|
|
real_url = await self.extract_real_download_url(link) |
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
if not filename or filename == '/': |
|
domain = get_domain(real_url) |
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: |
|
if common_ext in link.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
meta = {} |
|
if real_url.lower().endswith('.pdf'): |
|
try: |
|
meta = await self.get_pdf_metadata(real_url) |
|
except Exception: |
|
pass |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': link |
|
}) |
|
|
|
|
|
if found_files: |
|
return found_files |
|
|
|
|
|
response = await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
if not response: |
|
return [] |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected, file extraction may be limited") |
|
|
|
|
|
await self.page.evaluate(""" |
|
(async () => { |
|
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); |
|
const height = document.body.scrollHeight; |
|
const scrollStep = Math.floor(window.innerHeight / 2); |
|
|
|
for (let i = 0; i < height; i += scrollStep) { |
|
window.scrollTo(0, i); |
|
await delay(100); |
|
} |
|
|
|
window.scrollTo(0, 0); |
|
})() |
|
""") |
|
await self.page.wait_for_timeout(1000) |
|
|
|
final_url = self.page.url |
|
if '.php' in final_url or 'download' in final_url: |
|
real_url = await self.extract_real_download_url(final_url) |
|
if real_url != final_url: |
|
|
|
response = await self.page.request.head(real_url, timeout=15000) |
|
filename = None |
|
|
|
|
|
content_disposition = response.headers.get('Content-Disposition', '') |
|
if 'filename=' in content_disposition: |
|
filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) |
|
if filename_match: |
|
filename = filename_match.group(1) |
|
|
|
|
|
if not filename: |
|
filename = os.path.basename(urlparse(real_url).path) |
|
if not filename or filename == '/': |
|
|
|
domain = get_domain(real_url) |
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: |
|
if common_ext in real_url.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {}, |
|
'download_url': final_url |
|
}) |
|
return found_files |
|
|
|
await self.page.wait_for_load_state('networkidle', timeout=30000) |
|
content = await self.page.content() |
|
soup = BeautifulSoup(content, 'html.parser') |
|
|
|
default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', |
|
'.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', |
|
'.pptx', '.odt', '.txt'] |
|
all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) |
|
|
|
parsed_base = urlparse(final_url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
for a in soup.find_all('a', href=True): |
|
href = a['href'].strip() |
|
|
|
if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
real_url = await self.extract_real_download_url(full_url) |
|
if real_url and real_url != full_url: |
|
found_files.append({ |
|
'url': real_url, |
|
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {}, |
|
'download_url': full_url |
|
}) |
|
continue |
|
|
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': file_url |
|
}) |
|
|
|
|
|
elif ("drive.google.com" in href) or ("docs.google.com" in href): |
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, href) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
if file_id: |
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
|
|
|
|
filename = f"gdrive_{file_id}" |
|
if file_type: |
|
filename = f"{filename}.{file_type}" |
|
|
|
size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") |
|
|
|
found_files.append({ |
|
'url': href, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': { |
|
'view_only': is_view_only, |
|
'file_type': file_type, |
|
'file_id': file_id |
|
}, |
|
'download_url': href |
|
}) |
|
|
|
|
|
other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) |
|
for elem in other_elements: |
|
src = elem.get('src') or elem.get('data') |
|
if src and any(src.lower().endswith(ext) for ext in all_exts): |
|
file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': file_url |
|
}) |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') |
|
for elem in onclick_elements: |
|
onclick = await elem.get_attribute('onclick') |
|
urls = re.findall(r'(https?://[^\'"]+)', onclick) |
|
for url_match in urls: |
|
if any(url_match.lower().endswith(ext) for ext in all_exts): |
|
size_str = await self.get_file_size(url_match) |
|
meta = {} |
|
if url_match.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(url_match) |
|
found_files.append({ |
|
'url': url_match, |
|
'filename': os.path.basename(url_match.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': url_match |
|
}) |
|
|
|
|
|
data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') |
|
for elem in data_elements: |
|
for attr in ['data-src', 'data-url', 'data-href', 'data-download']: |
|
try: |
|
value = await elem.get_attribute(attr) |
|
if value and any(value.lower().endswith(ext) for ext in all_exts): |
|
file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': await self.get_file_size(file_url), |
|
'metadata': {}, |
|
'download_url': file_url |
|
}) |
|
except: |
|
pass |
|
|
|
|
|
script_elements = soup.find_all('script', type='application/json') |
|
for script in script_elements: |
|
try: |
|
json_data = json.loads(script.string) |
|
|
|
def extract_urls_from_json(obj, urls_found=None): |
|
if urls_found is None: |
|
urls_found = [] |
|
if isinstance(obj, dict): |
|
for k, v in obj.items(): |
|
|
|
url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] |
|
if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): |
|
urls_found.append(v) |
|
else: |
|
extract_urls_from_json(v, urls_found) |
|
elif isinstance(obj, list): |
|
for item in obj: |
|
extract_urls_from_json(item, urls_found) |
|
return urls_found |
|
|
|
json_urls = extract_urls_from_json(json_data) |
|
for json_url in json_urls: |
|
if any(json_url.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': json_url, |
|
'filename': os.path.basename(json_url.split('?')[0]), |
|
'size': await self.get_file_size(json_url), |
|
'metadata': {}, |
|
'download_url': json_url |
|
}) |
|
except: |
|
pass |
|
|
|
|
|
hidden_elements = await self.page.evaluate(""" |
|
() => { |
|
const results = []; |
|
|
|
// Check for hidden forms with download actions |
|
const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); |
|
for (const form of forms) { |
|
const action = form.getAttribute('action') || ''; |
|
results.push({ |
|
type: 'form', |
|
action: action, |
|
inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { |
|
return {name: input.name, value: input.value}; |
|
}) |
|
}); |
|
} |
|
|
|
// Check for hidden download links/buttons |
|
const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { |
|
const style = window.getComputedStyle(a); |
|
return (style.display === 'none' || style.visibility === 'hidden') && |
|
(a.href.includes('download') || a.href.includes('file')); |
|
}); |
|
|
|
for (const link of hiddenLinks) { |
|
results.push({ |
|
type: 'link', |
|
href: link.href, |
|
text: link.innerText || link.textContent |
|
}); |
|
} |
|
|
|
return results; |
|
} |
|
""") |
|
|
|
|
|
for elem in hidden_elements: |
|
if elem['type'] == 'link' and 'href' in elem: |
|
href = elem['href'] |
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': href, |
|
'filename': os.path.basename(href.split('?')[0]), |
|
'size': await self.get_file_size(href), |
|
'metadata': {}, |
|
'download_url': href |
|
}) |
|
|
|
|
|
hidden_links = await self.discover_hidden_links(self.page) |
|
for link in hidden_links: |
|
if any(link.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': link, |
|
'filename': os.path.basename(link.split('?')[0]), |
|
'size': await self.get_file_size(link), |
|
'metadata': {}, |
|
'download_url': link |
|
}) |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in found_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
return unique_files |
|
except Exception as e: |
|
logger.error(f"Error extracting files from {url}: {e}") |
|
traceback.print_exc() |
|
return [] |
|
|
|
async def download_file(self, file_info, save_dir, referer): |
|
file_url = file_info.get('download_url', file_info['url']) |
|
fname = file_info['filename'] |
|
path = os.path.join(save_dir, fname) |
|
base, ext = os.path.splitext(fname) |
|
counter = 1 |
|
while os.path.exists(path): |
|
path = os.path.join(save_dir, f"{base}_{counter}{ext}") |
|
counter += 1 |
|
os.makedirs(save_dir, exist_ok=True) |
|
|
|
|
|
if file_url in self.downloaded_files: |
|
logger.info(f"File already downloaded: {file_url}") |
|
return None |
|
|
|
try: |
|
|
|
if "drive.google.com" in file_url or "docs.google.com" in file_url: |
|
|
|
is_view_only = file_info.get('metadata', {}).get('view_only', False) |
|
|
|
|
|
if is_view_only: |
|
logger.info(f"Attempting to download view-only file: {file_url}") |
|
result_path = await self._force_download_viewonly(file_info, path) |
|
if result_path: |
|
self.downloaded_files.add(file_url) |
|
return result_path |
|
|
|
|
|
logger.info("Primary method failed, trying fallback methods") |
|
|
|
|
|
success = await self._download_from_google_drive(file_url, path) |
|
if success: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
|
|
|
|
logger.warning("All standard methods failed, attempting force download") |
|
result_path = await self._force_download_viewonly(file_info, path) |
|
if result_path: |
|
self.downloaded_files.add(file_url) |
|
return result_path if result_path else None |
|
|
|
|
|
if 'Action=downloadfile' in file_url or 'fname=' in file_url: |
|
logger.info(f"Using browser download approach for complex URL: {file_url}") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
|
|
download_promise = page.wait_for_event("download") |
|
|
|
|
|
await page.goto(file_url, timeout=60000) |
|
|
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(path) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except Exception as e: |
|
logger.error(f"Browser download failed: {e}") |
|
|
|
|
|
download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]') |
|
for button in download_buttons: |
|
try: |
|
await button.click() |
|
try: |
|
download = await download_promise |
|
await download.save_as(path) |
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except: |
|
pass |
|
except: |
|
continue |
|
|
|
|
|
logger.info("Browser approach failed, trying direct request") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
try: |
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer, |
|
'DNT': '1' |
|
} |
|
|
|
with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: |
|
if response.status_code == 200: |
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if 'text/html' in content_type and not file_url.endswith('.html'): |
|
logger.warning(f"Received HTML instead of expected file: {file_url}") |
|
else: |
|
with open(path, 'wb') as f: |
|
for chunk in response.iter_content(chunk_size=8192): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except Exception as e: |
|
logger.warning(f"Direct download failed: {e}, trying browser approach") |
|
|
|
|
|
async with self.context.new_page() as page: |
|
headers = { |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer |
|
} |
|
|
|
|
|
try: |
|
response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) |
|
if response.status == 200: |
|
content = await response.body() |
|
with open(path, 'wb') as f: |
|
f.write(content) |
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
else: |
|
logger.error(f"Download failed with status {response.status}: {file_url}") |
|
|
|
|
|
error_info = await response.text() |
|
logger.debug(f"Error response: {error_info[:200]}...") |
|
|
|
|
|
if detect_captcha(error_info): |
|
logger.warning("Captcha detected during download") |
|
|
|
|
|
except PlaywrightTimeoutError: |
|
logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") |
|
|
|
|
|
try: |
|
logger.info("Trying browser download manager approach") |
|
download_promise = page.wait_for_event("download") |
|
await page.goto(file_url, timeout=60000) |
|
|
|
|
|
download = await download_promise |
|
await download.save_as(path) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except Exception as e: |
|
logger.error(f"Browser download manager approach failed: {e}") |
|
|
|
return None |
|
except Exception as e: |
|
logger.error(f"Error downloading {file_url}: {e}") |
|
return None |
|
|
|
|
|
async def _force_download_viewonly(self, file_info, save_path): |
|
"""Main method to handle view-only files, now simplified""" |
|
|
|
file_id = self._extract_drive_file_id(file_info) |
|
if not file_id: |
|
logger.error("Could not extract file ID") |
|
return None |
|
|
|
|
|
file_type = file_info.get('metadata', {}).get('file_type', 'pdf') |
|
base, ext = os.path.splitext(save_path) |
|
if not ext: |
|
save_path = f"{base}.{file_type}" |
|
|
|
logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") |
|
|
|
|
|
browser = await self._create_stealth_browser() |
|
|
|
try: |
|
|
|
page = await browser.new_page() |
|
|
|
|
|
logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
content = await page.content() |
|
if "the owner has not granted you permission to" in content: |
|
logger.warning("Permission denied error detected") |
|
return None |
|
|
|
|
|
await page.wait_for_timeout(random.randint(3000, 7000)) |
|
|
|
|
|
temp_dir = tempfile.mkdtemp() |
|
|
|
|
|
if file_type.lower() == 'pdf': |
|
return await self._download_viewonly_pdf(page, file_id, save_path, temp_dir) |
|
else: |
|
return await self._download_viewonly_other(page, file_id, file_type, save_path, temp_dir) |
|
|
|
except Exception as e: |
|
logger.error(f"Error during force download: {e}") |
|
return None |
|
finally: |
|
await browser.close() |
|
|
|
def _extract_drive_file_id(self, file_info): |
|
"""Extract Google Drive file ID from file info""" |
|
|
|
file_id = file_info.get('metadata', {}).get('file_id') |
|
if file_id: |
|
return file_id |
|
|
|
|
|
url = file_info.get('url', '') |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, url) |
|
if match: |
|
return match.group(1) |
|
|
|
return None |
|
|
|
async def _create_stealth_browser(self): |
|
"""Create a stealth browser instance for handling sensitive downloads""" |
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-site-isolation-trials', |
|
'--disable-blink-features=AutomationControlled' |
|
] |
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=browser_args |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
device_scale_factor=2.0, |
|
accept_downloads=True |
|
) |
|
|
|
|
|
await context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change plugins |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
// Handle languages |
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
|
|
// Modify hardware concurrency |
|
Object.defineProperty(navigator, 'hardwareConcurrency', { |
|
get: () => 4 |
|
}); |
|
} |
|
""") |
|
|
|
return browser |
|
|
|
async def _download_viewonly_pdf(self, page, file_id, save_path, temp_dir): |
|
"""Handle downloading view-only PDF files""" |
|
try: |
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Method 1: Check page counter text |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// Method 2: Check actual page elements |
|
const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pageElements.length > 0) return pageElements.length; |
|
|
|
// Method 3: Look for page thumbnails |
|
const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); |
|
if (thumbnails.length > 0) return thumbnails.length; |
|
|
|
// Fallback: conservative guess |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated {estimated_pages} pages in PDF") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
logger.info("Scrolling page by page...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
prev_blob_count = 0 |
|
|
|
while attempt < max_attempts: |
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): |
|
logger.info("All pages appear to be loaded.") |
|
break |
|
|
|
|
|
if attempt % 3 == 0: |
|
await page.keyboard.press("End") |
|
else: |
|
await page.keyboard.press("PageDown") |
|
|
|
|
|
await page.wait_for_timeout(random.randint(1500, 3000)) |
|
|
|
|
|
if attempt % 4 == 0: |
|
await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) |
|
|
|
prev_blob_count = blob_count |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
download_promise = page.wait_for_event("download") |
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = Array.from(document.getElementsByTagName("img")) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${imgs.length} valid page images to add to PDF`); |
|
|
|
let added = 0; |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success', False): |
|
logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") |
|
|
|
|
|
logger.info("Trying fallback screenshot method...") |
|
return await self._pdf_screenshot_fallback(page, estimated_pages, save_path, temp_dir) |
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
await download.save_as(save_path) |
|
|
|
|
|
try: |
|
os.rmdir(temp_dir) |
|
except: |
|
pass |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: |
|
logger.info(f"Successfully downloaded PDF to {save_path}") |
|
return save_path |
|
else: |
|
logger.error(f"Generated file is too small or missing: {save_path}") |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Error in PDF download: {e}") |
|
return None |
|
|
|
async def _pdf_screenshot_fallback(self, page, estimated_pages, save_path, temp_dir): |
|
"""Fallback method using screenshots for PDF creation""" |
|
try: |
|
|
|
await page.evaluate(""" |
|
() => { |
|
// Find and click the "first page" button if available |
|
const buttons = Array.from(document.querySelectorAll('button')); |
|
const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); |
|
if (firstPageBtn) firstPageBtn.click(); |
|
} |
|
""") |
|
await page.wait_for_timeout(1000); |
|
|
|
|
|
screenshots = [] |
|
current_page = 1 |
|
max_pages = estimated_pages |
|
|
|
|
|
while current_page <= max_pages: |
|
screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") |
|
|
|
|
|
page_elem = await page.query_selector('.drive-viewer-paginated-page') |
|
if page_elem: |
|
await page_elem.screenshot(path=screenshot_path) |
|
else: |
|
|
|
await page.screenshot(path=screenshot_path) |
|
|
|
screenshots.append(screenshot_path) |
|
|
|
|
|
next_btn = await page.query_selector('button[aria-label="Next page"]') |
|
if next_btn: |
|
is_disabled = await next_btn.get_attribute('disabled') |
|
if is_disabled: |
|
logger.info(f"Reached end of document at page {current_page}") |
|
break |
|
|
|
await next_btn.click() |
|
await page.wait_for_timeout(1000) |
|
current_page += 1 |
|
else: |
|
break |
|
|
|
|
|
if screenshots: |
|
first_img = Image.open(screenshots[0]) |
|
width, height = first_img.size |
|
|
|
c = canvas.Canvas(save_path, pagesize=(width, height)) |
|
for screenshot in screenshots: |
|
img = Image.open(screenshot) |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
os.remove(screenshot) |
|
|
|
return save_path |
|
|
|
return None |
|
except Exception as e: |
|
logger.error(f"Error in screenshot fallback: {e}") |
|
return None |
|
|
|
async def _download_viewonly_other(self, page, file_id, file_type, save_path, temp_dir): |
|
"""Handle downloading non-PDF view-only files""" |
|
try: |
|
|
|
screenshot_path = os.path.join(temp_dir, "file.png") |
|
await page.screenshot(path=screenshot_path) |
|
|
|
if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: |
|
|
|
success = await self._export_google_doc(file_id, file_type, save_path) |
|
if success: |
|
os.remove(screenshot_path) |
|
return save_path |
|
|
|
|
|
logger.warning(f"Export failed, falling back to screenshot for {file_type}") |
|
|
|
|
|
shutil.copy(screenshot_path, save_path) |
|
os.remove(screenshot_path) |
|
|
|
return save_path if os.path.exists(save_path) else None |
|
|
|
except Exception as e: |
|
logger.error(f"Error in non-PDF download: {e}") |
|
return None |
|
|
|
async def _download_from_google_drive(self, url, save_path): |
|
"""Enhanced method to download from Google Drive with multiple fallback approaches""" |
|
|
|
file_id = self._extract_drive_file_id({"url": url}) |
|
if not file_id: |
|
logger.error(f"Could not extract file ID from URL: {url}") |
|
return False |
|
|
|
|
|
file_type, is_view_only = await self._get_google_drive_file_info(file_id) |
|
logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") |
|
|
|
base, ext = os.path.splitext(save_path) |
|
if not ext and file_type: |
|
|
|
save_path = f"{base}.{file_type}" |
|
|
|
|
|
if is_view_only: |
|
|
|
if file_type == 'pdf': |
|
success = await self._download_viewonly_pdf_with_js(file_id, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: |
|
success = await self._export_google_doc(file_id, file_type, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
result_path = await self._force_download_viewonly({ |
|
'url': url, |
|
'metadata': {'file_id': file_id, 'file_type': file_type, 'view_only': True} |
|
}, save_path) |
|
|
|
return bool(result_path) |
|
|
|
|
|
try: |
|
|
|
direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" |
|
|
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', |
|
'Accept-Language': 'en-US,en;q=0.9', |
|
'Referer': 'https://drive.google.com/', |
|
'DNT': '1' |
|
} |
|
|
|
|
|
with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: |
|
if r.status_code == 200: |
|
|
|
content_type = r.headers.get('Content-Type', '') |
|
if 'text/html' in content_type and not file_id.endswith('.html'): |
|
logger.warning("Received HTML instead of file, trying with session cookies") |
|
else: |
|
|
|
with open(save_path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
logger.info("Direct download successful") |
|
return True |
|
|
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
download_promise = page.wait_for_event("download") |
|
|
|
|
|
download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') |
|
if download_button: |
|
await download_button.click() |
|
|
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(save_path) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
except Exception as e: |
|
logger.error(f"Error during browser download: {e}") |
|
return False |
|
else: |
|
|
|
await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) |
|
|
|
|
|
download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') |
|
for elem in download_elements: |
|
try: |
|
await elem.click() |
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(save_path) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
except: |
|
pass |
|
except: |
|
continue |
|
except Exception as e: |
|
logger.error(f"Browser-based download attempt failed: {e}") |
|
|
|
logger.warning("All standard download methods failed") |
|
return False |
|
except Exception as e: |
|
logger.error(f"Error in Google Drive download: {e}") |
|
return False |
|
|
|
async def _download_viewonly_pdf_with_js(self, file_id, save_path): |
|
"""Download view-only PDF using blob images and JS""" |
|
try: |
|
|
|
browser = await self._create_stealth_browser() |
|
page = await browser.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) |
|
await page.wait_for_timeout(random.randint(2000, 5000)) |
|
|
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Look for page counter in the interface |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// If we can't find a counter, check actual pages |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length > 0) return pages.length; |
|
|
|
// Default to a reasonable number if we can't determine |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated number of pages: {estimated_pages}") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
await self._natural_scroll_through_document(page, estimated_pages) |
|
|
|
|
|
download_promise = page.wait_for_event("download") |
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = Array.from(document.getElementsByTagName("img")) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${imgs.length} valid page images to add to PDF`); |
|
|
|
let added = 0; |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success'): |
|
logger.error(f"Error in PDF generation: {result.get('error')}") |
|
return False |
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
|
|
|
|
await download.save_as(save_path) |
|
logger.info(f"Successfully saved PDF to {save_path}") |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 |
|
|
|
finally: |
|
await browser.close() |
|
|
|
except Exception as e: |
|
logger.error(f"Error in viewonly PDF download process: {e}") |
|
return False |
|
|
|
async def _natural_scroll_through_document(self, page, estimated_pages): |
|
"""Scroll through document in a natural way to load all pages""" |
|
logger.info("Scrolling through document to load all pages...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
prev_blob_count = 0 |
|
consecutive_same_count = 0 |
|
|
|
while attempt < max_attempts: |
|
|
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
|
|
if blob_count >= estimated_pages: |
|
logger.info(f"All {estimated_pages} pages appear to be loaded.") |
|
break |
|
|
|
if blob_count == prev_blob_count: |
|
consecutive_same_count += 1 |
|
if consecutive_same_count >= 5 and blob_count > 0: |
|
logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") |
|
break |
|
else: |
|
consecutive_same_count = 0 |
|
|
|
|
|
scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) |
|
|
|
if scroll_action == "PageDown": |
|
await page.keyboard.press("PageDown") |
|
elif scroll_action == "End": |
|
await page.keyboard.press("End") |
|
elif scroll_action == "ArrowDown": |
|
|
|
for _ in range(random.randint(5, 15)): |
|
await page.keyboard.press("ArrowDown") |
|
await page.wait_for_timeout(random.randint(50, 150)) |
|
else: |
|
|
|
current_y = random.randint(300, 700) |
|
await page.mouse.move(x=random.randint(300, 800), y=current_y) |
|
await page.mouse.wheel(0, random.randint(300, 800)) |
|
|
|
|
|
await page.wait_for_timeout(random.randint(1000, 3000)) |
|
|
|
prev_blob_count = blob_count |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
async def _export_google_doc(self, file_id, file_type, save_path): |
|
"""Export Google Docs/Sheets/Slides to downloadable formats""" |
|
try: |
|
|
|
export_urls = { |
|
'doc': f"https://docs.google.com/document/d/{file_id}/export?format=doc", |
|
'docx': f"https://docs.google.com/document/d/{file_id}/export?format=docx", |
|
'sheet': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", |
|
'xlsx': f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx", |
|
'ppt': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", |
|
'pptx': f"https://docs.google.com/presentation/d/{file_id}/export/pptx", |
|
'pdf': f"https://docs.google.com/document/d/{file_id}/export?format=pdf" |
|
} |
|
|
|
export_url = export_urls.get(file_type, f"https://docs.google.com/document/d/{file_id}/export?format=pdf") |
|
|
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') |
|
|
|
|
|
response = await page.goto(export_url, wait_until='networkidle') |
|
|
|
if response.status == 200: |
|
content = await response.body() |
|
with open(save_path, 'wb') as f: |
|
f.write(content) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
logger.warning(f"Export failed with status {response.status}") |
|
return False |
|
|
|
except Exception as e: |
|
logger.error(f"Error exporting Google Doc: {e}") |
|
return False |
|
|
|
async def _get_google_drive_file_info(self, file_id): |
|
"""Get file type and view-only status from Google Drive""" |
|
file_type = None |
|
is_view_only = False |
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
|
|
|
|
view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') |
|
is_view_only = view_only_text is not None |
|
|
|
|
|
gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') |
|
gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') |
|
gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') |
|
|
|
if gdocs_viewer: |
|
file_type = 'docx' |
|
elif gsheets_viewer: |
|
file_type = 'xlsx' |
|
elif gslides_viewer: |
|
file_type = 'pptx' |
|
else: |
|
|
|
pdf_viewer = await page.query_selector('embed[type="application/pdf"]') |
|
if pdf_viewer: |
|
file_type = 'pdf' |
|
else: |
|
|
|
img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') |
|
if img_viewer: |
|
|
|
img_src = await img_viewer.get_attribute('src') |
|
if 'jpg' in img_src or 'jpeg' in img_src: |
|
file_type = 'jpg' |
|
elif 'png' in img_src: |
|
file_type = 'png' |
|
else: |
|
file_type = 'jpg' |
|
else: |
|
|
|
file_type = 'pdf' |
|
|
|
|
|
if not file_type: |
|
title_element = await page.query_selector('div[role="heading"]') |
|
if title_element: |
|
title = await title_element.text_content() |
|
if title: |
|
ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) |
|
if ext_match: |
|
file_type = ext_match.group(1).lower() |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting Google Drive file info: {e}") |
|
file_type = 'pdf' |
|
|
|
return file_type, is_view_only |
|
|
|
|
|
async def get_sublinks(self, url, limit=10000): |
|
"""Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" |
|
links = set() |
|
try: |
|
logger.info(f"Fetching sublinks from: {url}") |
|
|
|
|
|
if is_download_link(url): |
|
logger.info(f"URL appears to be a direct download link: {url}") |
|
links.add(url) |
|
return list(links)[:limit] |
|
|
|
|
|
normalized_url = normalize_download_url(url) |
|
if normalized_url in self.visited_urls: |
|
logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") |
|
return list(links)[:limit] |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized exam site sublink extraction") |
|
edu_links = await self.get_edu_exam_links(url) |
|
for link in edu_links: |
|
links.add(link) |
|
|
|
|
|
if len(links) > 5: |
|
logger.info(f"Found {len(links)} sublinks with specialized method") |
|
return list(links)[:limit] |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
try: |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
except Exception as e: |
|
logger.warning(f"Error navigating to URL for sublink extraction: {e}") |
|
|
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
await self.page.evaluate(""" |
|
async () => { |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const height = document.body.scrollHeight; |
|
const step = Math.floor(window.innerHeight / 2); |
|
|
|
for (let i = 0; i < height; i += step) { |
|
window.scrollTo(0, i); |
|
await delay(150); |
|
} |
|
|
|
window.scrollTo(0, 0); |
|
} |
|
""") |
|
await self.page.wait_for_timeout(1000) |
|
|
|
|
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
|
|
if is_aspnet: |
|
logger.info("Detected ASP.NET page, using enhanced extraction method") |
|
|
|
|
|
|
|
dropdowns = await self.page.query_selector_all('select') |
|
buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') |
|
|
|
|
|
for dropdown in dropdowns: |
|
try: |
|
|
|
options = await self.page.evaluate(''' |
|
(dropdown) => { |
|
return Array.from(dropdown.options).map(o => o.value); |
|
} |
|
''', dropdown) |
|
|
|
|
|
for option in options: |
|
if option: |
|
await dropdown.select_option(value=option) |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with dropdown: {e}") |
|
|
|
|
|
safe_buttons = [] |
|
for button in buttons: |
|
button_text = await button.text_content() or "" |
|
button_value = await button.get_attribute("value") or "" |
|
button_id = await button.get_attribute("id") or "" |
|
combined_text = (button_text + button_value + button_id).lower() |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): |
|
continue |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): |
|
safe_buttons.append(button) |
|
|
|
|
|
for button in safe_buttons[:5]: |
|
try: |
|
await button.click() |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
|
|
|
|
grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') |
|
for cell in grid_cells: |
|
try: |
|
href = await cell.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error extracting grid link: {e}") |
|
|
|
|
|
postback_links = await self.page.evaluate(''' |
|
() => { |
|
const results = []; |
|
// Find elements with onclick containing __doPostBack |
|
const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); |
|
for (const el of elements) { |
|
// Extract the postback target |
|
const onclick = el.getAttribute('onclick') || ''; |
|
const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); |
|
if (match && match[1]) { |
|
// Get the visible text to use as description |
|
const text = el.innerText || el.textContent || 'Link'; |
|
results.push({ |
|
id: match[1], |
|
text: text.trim() |
|
}); |
|
} |
|
} |
|
return results; |
|
} |
|
''') |
|
|
|
|
|
for postback in postback_links[:10]: |
|
try: |
|
logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") |
|
await self.page.evaluate(f''' |
|
() => {{ |
|
if (typeof __doPostBack === 'function') {{ |
|
__doPostBack('{postback["id"]}', ''); |
|
}} |
|
}} |
|
''') |
|
await self.page.wait_for_timeout(1500) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error with postback: {e}") |
|
|
|
|
|
pagination_elements = await self.page.query_selector_all( |
|
'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' |
|
) |
|
|
|
|
|
for i in range(min(5, len(pagination_elements))): |
|
try: |
|
|
|
el = pagination_elements[i] |
|
el_text = await el.text_content() or "" |
|
|
|
|
|
if "next" in el_text.lower() or ">" == el_text.strip() or "β" == el_text.strip(): |
|
logger.info(f"Clicking pagination control: {el_text}") |
|
await el.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking pagination: {e}") |
|
|
|
|
|
hidden_links = await self.page.evaluate(""" |
|
() => { |
|
// Try to execute common JavaScript patterns that reveal hidden content |
|
try { |
|
// Common patterns used in websites to initially hide content |
|
const hiddenContainers = document.querySelectorAll( |
|
'.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' |
|
); |
|
|
|
// Attempt to make them visible |
|
hiddenContainers.forEach(el => { |
|
el.style.display = 'block'; |
|
el.style.visibility = 'visible'; |
|
el.classList.remove('hidden', 'hide'); |
|
}); |
|
|
|
// Return any newly visible links |
|
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); |
|
} catch (e) { |
|
return []; |
|
} |
|
} |
|
""") |
|
|
|
|
|
for href in hidden_links: |
|
if href and not href.startswith('javascript:'): |
|
links.add(href) |
|
|
|
|
|
download_links = await self.page.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')) |
|
.filter(a => { |
|
const href = a.href.toLowerCase(); |
|
return href.includes('download') || |
|
href.includes('file') || |
|
href.includes('get') || |
|
href.includes('view.php') || |
|
href.includes('action=') || |
|
href.includes('fname='); |
|
}) |
|
.map(a => a.href); |
|
} |
|
""") |
|
|
|
for download_link in download_links: |
|
links.add(download_link) |
|
|
|
|
|
js_links = await self.discover_hidden_links(self.page) |
|
for link in js_links: |
|
links.add(link) |
|
|
|
logger.info(f"Found {len(links)} sublinks") |
|
|
|
|
|
prioritized_links = [] |
|
normal_links = [] |
|
|
|
for link in links: |
|
if is_download_link(link): |
|
prioritized_links.append(link) |
|
else: |
|
normal_links.append(link) |
|
|
|
|
|
result = prioritized_links + normal_links |
|
return result[:limit] |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting sublinks from {url}: {e}") |
|
return list(links)[:limit] |
|
|
|
async def extract_all_link_types(self, links_set, base_url, path_base): |
|
"""Extract all types of links from the current page""" |
|
|
|
a_links = await self.page.query_selector_all('a[href]') |
|
for a in a_links: |
|
try: |
|
href = await a.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
iframes = await self.page.query_selector_all('iframe[src]') |
|
for iframe in iframes: |
|
try: |
|
src = await iframe.get_attribute('src') |
|
if src and not src.startswith('javascript:') and not src.startswith('about:'): |
|
full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') |
|
for el in onclick_elements: |
|
try: |
|
onclick = await el.get_attribute('onclick') |
|
urls = re.findall(r'(https?://[^\'"]+)', onclick) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
|
|
data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') |
|
for el in data_elements: |
|
for attr in ['data-url', 'data-href', 'data-src']: |
|
try: |
|
value = await el.get_attribute(attr) |
|
if value and not value.startswith('javascript:'): |
|
full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') |
|
for anchor in special_anchors: |
|
try: |
|
href = await anchor.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') |
|
for script in script_elements: |
|
try: |
|
script_content = await script.text_content() |
|
if script_content: |
|
|
|
urls = re.findall(r'(https?://[^\'"]+)', script_content) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
def resolve_relative_url(self, relative_url, base_url, path_base): |
|
"""Properly resolve relative URLs considering multiple formats""" |
|
if relative_url.startswith('/'): |
|
|
|
return f"{base_url}{relative_url}" |
|
elif relative_url.startswith('./'): |
|
|
|
return f"{base_url}{path_base}/{relative_url[2:]}" |
|
elif relative_url.startswith('../'): |
|
|
|
parent_path = '/'.join(path_base.split('/')[:-1]) |
|
return f"{base_url}{parent_path}/{relative_url[3:]}" |
|
else: |
|
|
|
return f"{base_url}{path_base}/{relative_url}" |
|
|
|
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): |
|
"""Perform a deep search for files at the URL and its sublinks""" |
|
import streamlit as st |
|
|
|
if not custom_ext_list: |
|
custom_ext_list = [] |
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
file_count_text = st.empty() |
|
|
|
try: |
|
|
|
self.visited_urls = set() |
|
|
|
progress_text.text("π Analyzing main page...") |
|
|
|
is_aspnet = False |
|
try: |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
except Exception: |
|
pass |
|
|
|
|
|
if is_download_link(url): |
|
progress_text.text("π₯ URL appears to be a direct download. Processing...") |
|
|
|
|
|
normalized_url = normalize_download_url(url) |
|
file_info = { |
|
'url': normalized_url, |
|
'download_url': normalized_url, |
|
'filename': os.path.basename(urlparse(normalized_url).path) or 'download', |
|
'size': 'Unknown Size', |
|
'metadata': {} |
|
} |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
progress_bar.progress(1.0) |
|
return [file_info] |
|
|
|
|
|
progress_text.text("π Extracting files from main page...") |
|
main_files = await self.extract_downloadable_files(url, custom_ext_list) |
|
initial_count = len(main_files) |
|
file_count_text.text(f"Found {initial_count} files on main page") |
|
|
|
|
|
progress_text.text("π Getting sublinks...") |
|
sublinks = await self.get_sublinks(url, sublink_limit) |
|
total_links = len(sublinks) |
|
progress_text.text(f"Found {total_links} sublinks to process") |
|
|
|
|
|
all_files = main_files |
|
|
|
if not sublinks: |
|
progress_bar.progress(1.0) |
|
return all_files |
|
|
|
|
|
for i, sublink in enumerate(sublinks, 1): |
|
progress = i / total_links |
|
progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") |
|
progress_bar.progress(progress) |
|
|
|
try: |
|
|
|
if is_download_link(sublink): |
|
|
|
normalized_url = normalize_download_url(sublink) |
|
|
|
|
|
if normalized_url in self.visited_urls: |
|
continue |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
|
|
|
|
size_str = await self.get_file_size(normalized_url) |
|
|
|
|
|
filename = os.path.basename(urlparse(normalized_url).path) |
|
if not filename or filename == '/' or '?' in filename: |
|
domain = get_domain(normalized_url) |
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']: |
|
if common_ext in normalized_url.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
all_files.append({ |
|
'url': normalized_url, |
|
'download_url': normalized_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': {} |
|
}) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
continue |
|
|
|
|
|
sub_timeout = timeout * 2 if is_aspnet else timeout |
|
|
|
|
|
if sublink in self.visited_urls: |
|
continue |
|
|
|
|
|
sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) |
|
all_files.extend(sub_files) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
except Exception as e: |
|
logger.warning(f"Error processing sublink {sublink}: {e}") |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in all_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
final_count = len(unique_files) |
|
progress_text.text(f"β
Deep search complete!") |
|
file_count_text.text(f"Found {final_count} unique files") |
|
progress_bar.progress(1.0) |
|
return unique_files |
|
|
|
except Exception as e: |
|
logger.error(f"Deep search error: {e}") |
|
progress_text.text(f"β οΈ Error during deep search: {str(e)}") |
|
return [] |
|
|
|
finally: |
|
await asyncio.sleep(2) |
|
if not st.session_state.get('keep_progress', False): |
|
progress_text.empty() |
|
progress_bar.empty() |