Update app.py
app.py
CHANGED
@@ -1,3 +1,4 @@
 import streamlit as st
 import os
 import asyncio
@@ -21,31 +22,32 @@ from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

-# Advanced imports
-from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
-from bs4 import BeautifulSoup
-from PyPDF2 import PdfReader
-import google_auth_oauthlib.flow
-import googleapiclient.discovery
-import google.auth.transport.requests
-import googleapiclient.http
 import requests
-import
-from
-
-
-
-from

 # Configure page and logging
 st.set_page_config(page_title="Advanced File Downloader", layout="wide")
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-#
-celery_app = Celery('file_downloader', broker='redis://localhost:6379/0')
-
-# Configure Google OAuth
 GOOGLE_OAUTH_CONFIG = {
     "web": {
         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -58,7 +60,7 @@ GOOGLE_OAUTH_CONFIG = {
     }
 }

-#
 USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
@@ -66,14 +68,9 @@ USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
     'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
-    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
 ]

-#
-PROXY_POOL = []
-CURRENT_PROXY_INDEX = 0
-
-# -------------------- Network Interception Configuration --------------------
 NETWORK_INTERCEPTOR_CONFIG = {
     "enabled": False,
     "intercept_types": ["xhr", "fetch", "document", "media"],
@@ -81,7 +78,7 @@ NETWORK_INTERCEPTOR_CONFIG = {
     "intercept_folder": "./intercepted_data"
 }

-#
 def get_random_user_agent():
     return random.choice(USER_AGENTS)

@@ -117,8 +114,11 @@ def is_valid_file_url(url, extensions):
     """Check if URL is a valid file URL based on extension"""
     return any(url.lower().endswith(ext) for ext in extensions)

-#
 def get_google_auth_url():
     client_config = GOOGLE_OAUTH_CONFIG["web"]
     flow = google_auth_oauthlib.flow.Flow.from_client_config(
         {"web": client_config},
@@ -133,6 +133,9 @@ def get_google_auth_url():
     return authorization_url

 def exchange_code_for_credentials(auth_code):
     if not auth_code.strip():
         return None, "No code provided."
     try:
@@ -151,6 +154,9 @@ def exchange_code_for_credentials(auth_code):
         return None, f"Error during token exchange: {e}"

 def google_drive_upload(file_path, credentials, folder_id=None):
     try:
         drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
         file_metadata = {'name': os.path.basename(file_path)}
@@ -163,164 +169,59 @@ def google_drive_upload(file_path, credentials, folder_id=None):
        return f"Error uploading to Drive: {str(e)}"

def create_drive_folder(drive_service, name):
    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
    return folder.get('id')

-#
-def
-    """Install required system dependencies"""
    try:
        # Install system dependencies
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        packages = [
            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
-            'redis-server', 'python3-dev', 'build-essential'
        ]
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)

-        # Install
-        subprocess.run(['
-
-        # Install browsers
-        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-        subprocess.run(['python3', '-m', 'pyppeteer', 'install'], check=True)

        st.success("Dependencies installed successfully!")
        return True
    except Exception as e:
        st.error(f"Error installing dependencies: {e}")
-        st.info("You may need to manually install dependencies.
        logger.error(f"Setup error: {e}")
        traceback.print_exc()
        return False

-
-
-
-        # Check Redis for Celery
-        redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG'
-        if not redis_running:
-            # Try to start Redis
-            subprocess.run(['service', 'redis-server', 'start'], check=True)
-
-        # Create directories for intercepted data
-        os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True)
-
-        return True
-    except Exception as e:
-        logger.error(f"Service check error: {e}")
-        return False
-
-# -------------------- Network Interception Classes --------------------
-class NetworkInterceptor:
-    """Class to intercept network traffic using mitmproxy"""
-
-    def __init__(self, intercept_types=None, save_path=None):
-        self.intercept_types = intercept_types or ["xhr", "fetch", "document"]
-        self.save_path = save_path or "./intercepted_data"
-        os.makedirs(self.save_path, exist_ok=True)
-        self.captured_data = []
-
-    def intercept_request(self, flow):
-        """Process intercepted requests"""
-        try:
-            url = flow.request.url
-            method = flow.request.method
-            content_type = flow.request.headers.get("Content-Type", "")
-
-            # Log the request
-            self.captured_data.append({
-                "type": "request",
-                "url": url,
-                "method": method,
-                "headers": dict(flow.request.headers),
-                "timestamp": time.time()
-            })
-
-            logger.info(f"Intercepted {method} request to {url}")
-        except Exception as e:
-            logger.error(f"Error intercepting request: {e}")
-
-    def intercept_response(self, flow):
-        """Process intercepted responses"""
-        try:
-            url = flow.request.url
-            status_code = flow.response.status_code
-            content_type = flow.response.headers.get("Content-Type", "")
-
-            # Only process responses of interest based on content type
-            if any(t in content_type.lower() for t in ["application/pdf", "application/msword",
-                                                       "application/vnd.openxmlformats",
-                                                       "application/zip"]):
-                # Save the file
-                filename = os.path.basename(urlparse(url).path)
-                if not filename or filename == '/':
-                    filename = f"file_{int(time.time())}"
-
-                # Try to add extension based on content type
-                if "pdf" in content_type:
-                    filename += ".pdf"
-                elif "msword" in content_type:
-                    filename += ".doc"
-                elif "openxmlformats" in content_type and "wordprocessingml" in content_type:
-                    filename += ".docx"
-                elif "zip" in content_type:
-                    filename += ".zip"
-
-                file_path = os.path.join(self.save_path, filename)
-                with open(file_path, "wb") as f:
-                    f.write(flow.response.content)
-
-                logger.info(f"Saved intercepted file: {file_path}")
-
-                # Record metadata about the captured file
-                self.captured_data.append({
-                    "type": "file",
-                    "url": url,
-                    "content_type": content_type,
-                    "size": len(flow.response.content),
-                    "path": file_path,
-                    "timestamp": time.time()
-                })
-        except Exception as e:
-            logger.error(f"Error intercepting response: {e}")
-
-    def get_captured_files(self):
-        """Return list of captured files"""
-        return [item for item in self.captured_data if item["type"] == "file"]
-
-# -------------------- Browser Automation Classes --------------------
-class MultiEngineBrowser:
-    """Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)"""
-
-    def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True):
-        self.engine = engine
        self.use_proxy = use_proxy
        self.proxy = proxy
-        self.
        self.browser = None
        self.context = None
        self.page = None
-
-    async def setup(self):
-        """Initialize browser based on selected engine"""
-        if self.engine == "playwright":
-            return await self.setup_playwright()
-        elif self.engine == "pyppeteer":
-            return await self.setup_pyppeteer()
-        elif self.engine == "splash":
-            return await self.setup_splash()
-        else:
-            raise ValueError(f"Unsupported browser engine: {self.engine}")
-
-    async def setup_playwright(self):
-        """Setup Playwright browser"""
-        from playwright.async_api import async_playwright

        self.playwright = await async_playwright().start()
        browser_args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
@@ -329,7 +230,7 @@ class MultiEngineBrowser:
            '--disable-features=IsolateOrigins,site-per-process',
        ]

-        if self.
            browser_args.extend([
                '--disable-blink-features=AutomationControlled',
                '--disable-features=IsolateOrigins'
@@ -343,8 +244,10 @@ class MultiEngineBrowser:
        if self.use_proxy and self.proxy:
            launch_options["proxy"] = {"server": self.proxy}

        self.browser = await self.playwright.chromium.launch(**launch_options)

        context_options = {
            "viewport": {"width": 1920, "height": 1080},
            "user_agent": get_random_user_agent(),
@@ -353,10 +256,10 @@ class MultiEngineBrowser:
            "accept_downloads": True
        }

        self.context = await self.browser.new_context(**context_options)

-
-        if self.stealth:
            await self.context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', { get: () => false });
                Object.defineProperty(navigator, 'plugins', {
@@ -366,221 +269,50 @@ class MultiEngineBrowser:
                window.chrome = { runtime: {} };
            """)

        self.page = await self.context.new_page()
-
-
-
-
-
-
-
-            '
-            '
-            '
-
-        ]
-
-        if self.stealth:
-            browser_args.extend([
-                '--disable-blink-features=AutomationControlled',
-                '--disable-features=IsolateOrigins'
-            ])
-
-        launch_options = {
-            "headless": True,
-            "args": browser_args,
-            "ignoreHTTPSErrors": True,
-            "userDataDir": tempfile.mkdtemp()
-        }
-
-        if self.use_proxy and self.proxy:
-            browser_args.append(f'--proxy-server={self.proxy}')
-
-        self.browser = await launch(launch_options)
-        self.page = await self.browser.newPage()
-
-        # Set user agent
-        await self.page.setUserAgent(get_random_user_agent())
-
-        # Set viewport
-        await self.page.setViewport({"width": 1920, "height": 1080})
-
-        # Apply stealth features
-        if self.stealth:
-            await self.page.evaluateOnNewDocument("""
-                Object.defineProperty(navigator, 'webdriver', { get: () => false });
-                Object.defineProperty(navigator, 'plugins', {
-                    get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
-                });
-                Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
-                window.chrome = { runtime: {} };
-            """)
-
-        return self.page
-
-    async def setup_splash(self):
-        """Setup Splash browser through API"""
-        # Splash is typically used via HTTP API
-        # We'll use requests for this
-        self.splash_url = "http://localhost:8050/render.html"
-        return None  # No actual page object for Splash
-
-    async def goto(self, url, wait_until=None, timeout=30000):
-        """Navigate to a URL"""
-        if self.engine == "playwright":
-            return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout)
-        elif self.engine == "pyppeteer":
-            return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout)
-        elif self.engine == "splash":
-            # Use Splash HTTP API
-            params = {
-                "url": url,
-                "wait": min(timeout/1000, 30),  # Splash uses seconds
-                "timeout": min(timeout/1000, 60),
-                "resource_timeout": min(timeout/1000, 30),
-                "html": 1,
-                "png": 0,
-                "render_all": 1
-            }
-
-            if self.use_proxy and self.proxy:
-                params["proxy"] = self.proxy
-
-            headers = {"User-Agent": get_random_user_agent()}
-            response = requests.get(self.splash_url, params=params, headers=headers)
-            self.last_html = response.text
-            return response
-
-    async def content(self):
-        """Get page content"""
-        if self.engine == "playwright":
-            return await self.page.content()
-        elif self.engine == "pyppeteer":
-            return await self.page.content()
-        elif self.engine == "splash":
-            return self.last_html
-
-    async def close(self):
-        """Close browser"""
-        if self.engine == "playwright":
-            if self.browser:
-                await self.browser.close()
-            if self.playwright:
-                await self.playwright.stop()
-        elif self.engine == "pyppeteer":
-            if self.browser:
-                await self.browser.close()
-        # No cleanup needed for Splash as it's stateless
-
-# -------------------- Download Manager Class --------------------
-class DownloadManager:
-    def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
-        self.browser_engine = browser_engine
-        self.use_proxy = use_proxy
-        self.proxy = proxy
-        self.query = query
-        self.num_results = num_results
-        self.use_stealth = use_stealth
-        self.browser = None
-        self.network_interceptor = None
-
-        # Configure network interception if enabled
-        if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
-            self.network_interceptor = NetworkInterceptor(
-                intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"],
-                save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"]
-            )
-
-    async def __aenter__(self):
-        # Initialize multi-engine browser
-        self.browser = MultiEngineBrowser(
-            engine=self.browser_engine,
-            use_proxy=self.use_proxy,
-            proxy=self.proxy,
-            stealth=self.use_stealth
-        )
-        self.page = await self.browser.setup()
-
-        # Set headers for better stealth
-        if self.browser_engine == "playwright":
-            await self.page.set_extra_http_headers({
-                'Accept-Language': 'en-US,en;q=0.9',
-                'Accept-Encoding': 'gzip, deflate, br',
-                'DNT': '1',
-                'Referer': 'https://www.google.com/',
-                'Sec-Fetch-Dest': 'document',
-                'Sec-Fetch-Mode': 'navigate',
-                'Sec-Fetch-Site': 'cross-site',
-                'Sec-Fetch-User': '?1',
-                'Upgrade-Insecure-Requests': '1'
-            })

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
-

-    async def
-        """Search
        urls = []
        try:
-
-
-
-                search_url = f"https://www.google.com/search?q={self.query}"
-            else:
-                raise ValueError(f"Unsupported search engine: {search_engine}")
-
-            await self.browser.goto(search_url, timeout=30000)

-
-
-
-
-
-
-                        urls.append(href)
-                elif search_engine == "google":
-                    links = await self.page.query_selector_all("div.g a[href^='http']")
-                    for link in links[:self.num_results]:
-                        href = await link.get_attribute('href')
-                        if href:
-                            urls.append(href)
-            elif self.browser_engine == "pyppeteer":
-                if search_engine == "bing":
-                    links = await self.page.querySelectorAll("li.b_algo h2 a")
-                    for link in links[:self.num_results]:
-                        href = await self.page.evaluate('el => el.getAttribute("href")', link)
-                        if href:
-                            urls.append(href)
-                elif search_engine == "google":
-                    links = await self.page.querySelectorAll("div.g a[href^='http']")
-                    for link in links[:self.num_results]:
-                        href = await self.page.evaluate('el => el.getAttribute("href")', link)
-                        if href:
-                            urls.append(href)
-            elif self.browser_engine == "splash":
-                # Parse the HTML with BeautifulSoup
-                soup = BeautifulSoup(self.browser.last_html, 'html.parser')
-                if search_engine == "bing":
-                    links = soup.select("li.b_algo h2 a")
-                    for link in links[:self.num_results]:
-                        href = link.get("href")
-                        if href:
-                            urls.append(href)
-                elif search_engine == "google":
-                    links = soup.select("div.g a[href^='http']")
-                    for link in links[:self.num_results]:
-                        href = link.get("href")
-                        if href:
-                            urls.append(href)

            return urls
        except Exception as e:
-            logger.error(f"Error searching
            return []

    async def get_file_size(self, url):
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15)
@@ -593,6 +325,10 @@ class DownloadManager:
        return "Unknown Size"

    async def get_pdf_metadata(self, url):
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.get(url, headers=headers, timeout=15, stream=True)
@@ -610,6 +346,7 @@ class DownloadManager:
            return {}

    async def extract_real_download_url(self, url):
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
@@ -619,7 +356,7 @@ class DownloadManager:
            return url

    async def get_edu_exam_links(self, url):
-        """Specialized method for educational exam websites
        try:
            logger.info(f"Fetching exam links from {url}")
            links = set()
@@ -630,7 +367,7 @@ class DownloadManager:
            response = requests.get(url, headers=headers, timeout=30)

            if response.status_code == 200:
-                # Parse with BeautifulSoup
                soup = BeautifulSoup(response.text, "html.parser")
                parsed_base = urlparse(url)
                base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -640,26 +377,22 @@ class DownloadManager:
                    href = a["href"]
                    full_url = urljoin(url, href)

-                    #
                    link_text = a.get_text().lower()

-                    #
                    url_patterns = [
                        "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                        "/test/", "/download/", "/files/", "/assignments/",
-                        "paper_", "question_", "exam_", "test_", "past_"
-                        "assignment_", "sample_", "study_material", "notes_",
-                        "/resource/", "/subject/", "/course/", "/material/"
                    ]

                    text_patterns = [
                        "exam", "paper", "test", "question", "past", "download",
-                        "assignment", "sample", "study", "material", "notes"
-                        "subject", "course", "resource", "pdf", "document",
-                        "view", "open", "get", "solution", "answer"
                    ]

-                    # Check
                    if any(pattern in full_url.lower() for pattern in url_patterns) or \
                       any(pattern in link_text for pattern in text_patterns) or \
                       any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
@@ -667,48 +400,74 @@ class DownloadManager:
        except Exception as e:
            logger.warning(f"Request-based extraction failed: {e}")

-        # Use browser
        if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
            logger.info("Using browser for enhanced link extraction")

-            # Navigate to
-            await self.

-            # Get page content
-            content = await self.
            soup = BeautifulSoup(content, "html.parser")
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

-            #
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(url, href)
                link_text = a.get_text().lower()

-                #
                url_patterns = [
                    "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                    "/test/", "/download/", "/files/", "/assignments/",
-                    "paper_", "question_", "exam_", "test_", "past_"
-                    "assignment_", "sample_", "study_material", "notes_",
-                    "/resource/", "/subject/", "/course/", "/material/"
                ]

                text_patterns = [
                    "exam", "paper", "test", "question", "past", "download",
-                    "assignment", "sample", "study", "material", "notes"
-                    "subject", "course", "resource", "pdf", "document",
-                    "view", "open", "get", "solution", "answer"
                ]

-                # Check URL and text patterns
                if any(pattern in full_url.lower() for pattern in url_patterns) or \
                   any(pattern in link_text for pattern in text_patterns) or \
                   any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                    links.add(full_url)

-        # Filter to likely exam documents
        filtered_links = []
        for link in links:
            # Common file extensions
@@ -719,8 +478,7 @@ class DownloadManager:
            # Common paths for exam documents
            if any(pattern in link.lower() for pattern in [
                "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
-                "/pastpapers/", "/questionpapers/", "/tests/"
-                "/resource/", "/material/", "/notes/", "/subjectmaterial/"
            ]):
                filtered_links.append(link)

@@ -732,6 +490,7 @@ class DownloadManager:
            return []

    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            # Special handling for educational exam sites
@@ -765,7 +524,7 @@ class DownloadManager:

                    # Get metadata for PDFs
                    meta = {}
-                    if real_url.lower().endswith('.pdf'):
                        try:
                            meta = await self.get_pdf_metadata(real_url)
                        except Exception:
@@ -776,18 +535,18 @@ class DownloadManager:
                        'filename': filename,
                        'size': size_str,
                        'metadata': meta,
-                        'source_url': url  #
                    })

                # If we found exam files with the specialized method, return them
                if found_files:
                    return found_files

-            # Standard extraction method for
-            await self.

            # Get page content
-            content = await self.
            soup = BeautifulSoup(content, 'html.parser')

            # Define file extensions to look for
@@ -807,7 +566,7 @@ class DownloadManager:

                # Handle PHP and download links separately
                if '.php' in href.lower() or 'download' in href.lower():
-                    full_url = href if href.startswith('http') else urljoin(
                    real_url = await self.extract_real_download_url(full_url)
                    if real_url and real_url != full_url:
                        filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
|

                # Check for direct file extensions
                if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else urljoin(
                    size_str = await self.get_file_size(file_url)
                    meta = {}
-                    if file_url.lower().endswith('.pdf'):
                        meta = await self.get_pdf_metadata(file_url)
                    found_files.append({
                        'url': file_url,
|
                        break

                if file_id:
-                    # Determine if it's
                    is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))

                    filename = f"gdrive_{file_id}"
|
            for elem in soup.find_all(elem_tag):
                src = elem.get('src') or elem.get('data')
                if src and any(src.lower().endswith(ext) for ext in all_exts):
-                    file_url = src if src.startswith('http') else urljoin(
                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
|
            return []

    async def download_file(self, file_info, save_dir, referer=None):
-        """Download a file and
        file_url = file_info['url']
        fname = file_info['filename']
        referer = referer or file_info.get('source_url', 'https://www.google.com')

-        # Create unique filename
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        counter = 1
|
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
-                # For view-only Google Drive files
                is_view_only = file_info.get('metadata', {}).get('view_only', False)
                if is_view_only:
                    result_path = await self.download_viewonly_google_drive(file_info, path)
|
            return None

    async def download_viewonly_google_drive(self, file_info, save_path):
-        """Download view-only Google Drive documents"""
        try:
            # Extract file ID
            file_id = file_info.get('metadata', {}).get('file_id')
@@ -993,173 +752,147 @@ class DownloadManager:

            logger.info(f"Downloading view-only Google Drive file: {file_id}")

-            # Create a dedicated browser
-
-

-
-
-
-
-
-
-
-
-
-
-
-                ]
-            )
-
-
-
-
-
-
-
-
-
-
-            await
-                Object.defineProperty(navigator, 'webdriver', { get: () => false });
-                Object.defineProperty(navigator, 'plugins', {
-                    get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
-                });
-                Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
-                window.chrome = { runtime: {} };
-            """)

-

-
-
-
-

-            #
-            await page.

-
-            temp_dir = tempfile.mkdtemp()

-            #
-
-
-
-
-
-
-
-
-                const pageCounters = document.querySelectorAll('*');
-                for (const el of pageCounters) {
-                    const text = el.textContent || '';
-                    const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
-                    if (match && match[2]) {
-                        return parseInt(match[2]);
-                    }
-                }
-
-                // Look for paginated pages
-                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                if (pages.length > 0) return pages.length;
-
-                // Default if we can't determine
-                return 20;
-            }
-            """)
-
-            logger.info(f"PDF has approximately {total_pages} pages")
-
-            # Take screenshots of each page
-            screenshots = []
-
-            # First try with the page element method
-            for i in range(min(total_pages, 100)):  # Limit to 100 pages for safety
-                try:
-                    # Navigate to specific page
-                    if i > 0:
-                        await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()")
-                        await page.wait_for_timeout(500)
-
-                    # Wait for the page to render
-                    await page.wait_for_timeout(500)
-
-                    # Take screenshot
-                    screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
-
-                    # Try to find the page element
-                    page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})')
-                    if page_element:
-                        await page_element.screenshot(path=screenshot_path)
                    else:
-
-                        await page.screenshot(path=screenshot_path)
-
-                    screenshots.append(screenshot_path)
-
-                    # Check if we should continue to next page
-                    if i < total_pages - 1:
-                        next_button = await page.query_selector('button[aria-label="Next page"]')
-                        if next_button:
-                            # Check if button is disabled
-                            is_disabled = await next_button.get_attribute('disabled')
-                            if is_disabled:
-                                logger.info(f"Reached last page at page {i+1}")
-                                break
-
-                            # Click next page
-                            await next_button.click()
-                            await page.wait_for_timeout(1000)
-                        else:
-                            logger.info("Next page button not found")
-                            break
-                except Exception as e:
-                    logger.error(f"Error capturing page {i+1}: {e}")
-                    continue
-
-            # Create PDF from screenshots
-            if screenshots:
-                # Get dimensions from first screenshot
-                first_img = Image.open(screenshots[0])
-                width, height = first_img.size
-
-                # Create PDF
-                c = canvas.Canvas(save_path, pagesize=(width, height))
-                for screenshot in screenshots:
-                    c.drawImage(screenshot, 0, 0, width, height)
-                    c.showPage()
-                c.save()

-                #
-
-                    os.remove(screenshot)

-                #
-

-
-
-                logger.error("
-
-
-
-

-            #
-

-            # Clean up
-            os.remove(screenshot_path)
            shutil.rmtree(temp_dir, ignore_errors=True)

            return save_path
-
-
-
-
-

            return None
        except Exception as e:
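For context, the view-only downloader removed in the hunk above stitched per-page screenshots into a PDF with Pillow and reportlab. A minimal, self-contained sketch of that assembly step, assuming a list of PNG paths (illustrative only, not the exact removed code):

    # Sketch: combine per-page screenshots into a single PDF.
    from PIL import Image
    from reportlab.pdfgen import canvas

    def screenshots_to_pdf(screenshot_paths, save_path):
        if not screenshot_paths:
            return None
        # Use the first page's pixel size as the PDF page size.
        width, height = Image.open(screenshot_paths[0]).size
        c = canvas.Canvas(save_path, pagesize=(width, height))
        for shot in screenshot_paths:
            c.drawImage(shot, 0, 0, width, height)  # one screenshot per PDF page
            c.showPage()
        c.save()
        return save_path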
@@ -1167,7 +900,7 @@ class DownloadManager:
            return None

    async def get_sublinks(self, url, limit=10000):
-        """Extract all sublinks from a
        links = set()
        try:
            logger.info(f"Extracting sublinks from {url}")
@@ -1183,18 +916,17 @@ class DownloadManager:
                logger.info(f"Found {len(links)} sublinks with specialized method")
                return list(links)[:limit]

-            #
-            await self.

            # Get page content
-            content = await self.
            soup = BeautifulSoup(content, 'html.parser')

-            #
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

-            # Extract all links from the page
            for a in soup.find_all('a', href=True):
                href = a['href']
                if href and not href.startswith('javascript:') and not href.startswith('#'):
@@ -1220,85 +952,12 @@ class DownloadManager:
            logger.error(f"Error extracting sublinks: {e}")
            return list(links)[:limit]

-    @celery_app.task
-    def download_file_task(file_info, save_dir, referer=None):
-        """Celery task for downloading files asynchronously"""
-        # This function runs in a separate worker process
-        file_url = file_info['url']
-        fname = file_info['filename']
-        referer = referer or file_info.get('source_url', 'https://www.google.com')
-
-        # Create unique filename
-        path = os.path.join(save_dir, fname)
-        base, ext = os.path.splitext(fname)
-        counter = 1
-        while os.path.exists(path):
-            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-            counter += 1
-
-        os.makedirs(save_dir, exist_ok=True)
-
-        try:
-            # Handle Google Drive files
-            if "drive.google.com" in file_url or "docs.google.com" in file_url:
-                # Extract file ID
-                file_id = None
-                for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
-                    match = re.search(pattern, file_url)
-                    if match:
-                        file_id = match.group(1)
-                        break
-
-                if file_id:
-                    # Try direct download
-                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
-                    headers = {
-                        'User-Agent': get_random_user_agent(),
-                        'Referer': referer
-                    }
-
-                    with requests.get(download_url, headers=headers, stream=True) as r:
-                        if r.status_code == 200:
-                            with open(path, 'wb') as f:
-                                for chunk in r.iter_content(chunk_size=8192):
-                                    f.write(chunk)
-
-                            # Check if this is HTML (common for Google Drive restrictions)
-                            with open(path, 'rb') as f:
-                                content_start = f.read(100).decode('utf-8', errors='ignore')
-                            if '<html' in content_start.lower():
-                                os.remove(path)
-                                return {'status': 'error', 'message': 'Received HTML instead of file'}
-
-                    return {'status': 'success', 'path': path}
-
-            # Standard download for regular files
-            headers = {
-                'User-Agent': get_random_user_agent(),
-                'Referer': referer,
-                'Accept': '*/*',
-                'Accept-Encoding': 'gzip, deflate, br'
-            }
-
-            with requests.get(file_url, headers=headers, stream=True) as r:
-                if r.status_code == 200:
-                    with open(path, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-
-                    return {'status': 'success', 'path': path}
-                else:
-                    return {'status': 'error', 'message': f"HTTP error: {r.status_code}"}
-
-        except Exception as e:
-            return {'status': 'error', 'message': str(e)}
-
    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
-        """Perform deep search for files on
        if not custom_ext_list:
            custom_ext_list = []

-        #
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
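The Celery task deleted in the hunk above would normally have been dispatched from the Streamlit side with .delay(); a sketch under the old setup's assumptions (a running Redis broker and a separately started Celery worker; this call site is not shown in the diff):

    # Sketch: enqueue the removed download_file_task on the old Celery/Redis setup.
    result = download_file_task.delay(file_info, "./downloads", referer="https://www.google.com")
    outcome = result.get(timeout=300)  # {'status': 'success', 'path': ...} or an error dict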
@@ -1317,22 +976,23 @@ class DownloadManager:
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")

-            #
            all_files = main_files.copy()

-            # Process each sublink
-
-
-
-
-
-
-
-
-
-
-

            # Deduplicate files
            seen_urls = set()
@@ -1360,7 +1020,7 @@ class DownloadManager:
            progress_text.empty()
            progress_bar.empty()

-#
def main():
    st.title("Advanced File Downloader")
@@ -1369,91 +1029,70 @@ def main():
        st.session_state.initialized = True
        st.session_state.discovered_files = []
        st.session_state.current_url = None
-        st.session_state.google_creds = None
        st.session_state.selected_files = []
        st.session_state.do_deep_search = False
        st.session_state.deep_search_url = None
        st.session_state.search_results = []
        st.session_state.download_urls = {}  # For direct download links

-    # Install
-    if "
-        with st.spinner("Setting up
-            st.session_state.
-            check_services()

-    # Sidebar
    with st.sidebar:
-        mode = st.radio("Select Mode", ["Manual URL", "Web Search", "

-        with st.expander("
-            search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine")
-            browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine")
            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
                                              help="Enter extensions like .csv, .txt")
            max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
            sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
-
-        with st.expander("Advanced Options", expanded=False):
            use_proxy = st.checkbox("Use Proxy", key="use_proxy")
            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
            use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
                                      help="Makes browser harder to detect as automated")
-            enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"],
-                                                   key="enable_intercept",
-                                                   help="Intercept network traffic to find additional files")
-            if enable_network_intercept:
-                NETWORK_INTERCEPTOR_CONFIG["enabled"] = True
-                intercept_types = st.multiselect("Intercept Types",
-                                                 ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"],
-                                                 default=["xhr", "fetch", "document", "media"],
-                                                 key="intercept_types")
-                NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types
-            else:
-                NETWORK_INTERCEPTOR_CONFIG["enabled"] = False

-
-
-
-
-
-
-
-
-

    # Main content area
    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")

-
-
-
-            if
-
-

-
-
-
-
-
-
-
-
-                    files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
-                    return files
-
-            # Run the search
-            files = asyncio.run(run_deep_search())
-
-            if files:
-                st.session_state.discovered_files = files
-                st.session_state.current_url = url
-                st.success(f"Found {len(files)} files!")
-            else:
-                st.warning("No files found.")

    # Display and process discovered files
    if st.session_state.discovered_files:
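The truncated guard removed above installed dependencies once per Streamlit session. A minimal sketch of that pattern using the setup helper defined in the new version of this file (the "deps_installed" session-state key is an assumption, not from this diff):

    # Sketch: run the apt/Playwright setup only once per session.
    if "deps_installed" not in st.session_state:
        with st.spinner("Setting up dependencies..."):
            st.session_state.deps_installed = setup_playwright_dependencies()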
@@ -1482,12 +1121,6 @@ def main():
                file_info = f"{filename} ({size})"

            file_options.append((i, file_info))
-
-            # Generate direct download URL for this file
-            if i not in st.session_state.download_urls:
-                # Generate a unique key for this file
-                file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode()
-                st.session_state.download_urls[i] = file_key

        # File selection multiselect
        selected_indices = st.multiselect(
|

        st.session_state.selected_files = selected_indices

-        # Display individual
        if files:
            st.subheader("Available Files")
            for i, file in enumerate(files):
|
                    st.write(f"Source: {file.get('source_url', 'Unknown')}")
                    st.write(f"URL: {file['url']}")

-                    # Download button for this
-                    if st.button(f"Download
                        with st.spinner(f"Downloading {file['filename']}..."):
                            # Create downloads directory
                            download_dir = "./downloads"
|
                            # Download the file
                            async def download_single():
                                async with DownloadManager(
-                                    browser_engine=browser_engine,
                                    use_proxy=use_proxy,
                                    proxy=proxy,
                                    use_stealth=use_stealth
|
        if selected_indices:
            st.subheader("Batch Download Options")

-            col1, col2, col3
            with col1:
                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
            with col2:
                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
            with col3:
                delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
-            with col4:
-                upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")

            if st.button("Download Selected Files", key="batch_download_btn"):
                with st.spinner(f"Downloading {len(selected_indices)} files..."):
|

                    async def download_batch():
                        async with DownloadManager(
-                            browser_engine=browser_engine,
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
|
                            key="download_zip_btn"
                        )

-                        # Upload to Google Drive if requested
-                        if upload_to_drive and st.session_state.google_creds:
-                            with st.spinner("Uploading to Google Drive..."):
-                                drive_service = googleapiclient.discovery.build(
-                                    "drive", "v3", credentials=st.session_state.google_creds
-                                )
-                                folder_id = create_drive_folder(
-                                    drive_service, f"Downloads_{get_domain(url)}"
-                                )
-                                drive_id = google_drive_upload(
-                                    zip_path, st.session_state.google_creds, folder_id
-                                )
-
-                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                else:
-                                    st.error(drive_id)
-
                    # Delete original files if requested
                    if delete_after:
                        for path in downloaded_paths:
|

        if st.button("Search", key="web_search_btn"):
            if query:
-                with st.spinner("Searching
                    async def run_search():
                        async with DownloadManager(
-                            browser_engine=browser_engine,
                            use_proxy=use_proxy,
                            proxy=proxy,
                            query=query,
                            num_results=num_results,
                            use_stealth=use_stealth
                        ) as dm:
-                            urls = await dm.
                            return urls

                    urls = asyncio.run(run_search())
|
                    with st.spinner("Searching for files..."):
                        async def deep_search_result():
                            async with DownloadManager(
-                                browser_engine=browser_engine,
                                use_proxy=use_proxy,
                                proxy=proxy,
                                use_stealth=use_stealth
|
                    else:
                        st.warning("No files found on this page.")

-    elif mode == "
-        st.header("

        # View-only Google Drive download
-
-        st.write("Download protected/view-only Google Drive documents")
-
-        file_id = st.text_input(
-            "Google Drive File ID",
-            placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
-            key="drive_file_id"
-        )
-
-        if st.button("Download Document", key="drive_download_btn") and file_id:
-            with st.spinner("Downloading view-only document... (this may take a minute)"):
-                # Create download directory
-                download_dir = "./downloads"
-                os.makedirs(download_dir, exist_ok=True)
-
-                # Set output path
-                output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
-
-                # Download the file
-                async def download_drive_file():
-                    async with DownloadManager(
-                        browser_engine=browser_engine,
-                        use_proxy=use_proxy,
-                        proxy=proxy,
-                        use_stealth=use_stealth
-                    ) as dm:
-                        file_info = {
-                            'url': f"https://drive.google.com/file/d/{file_id}/view",
-                            'filename': f"gdrive_{file_id}.pdf",
-                            'metadata': {'file_id': file_id, 'view_only': True}
-                        }
-                        return await dm.download_viewonly_google_drive(file_info, output_path)
-
-                result_path = asyncio.run(download_drive_file())
-
-                if result_path:
-                    st.success("Document downloaded successfully!")
-
-                    # Provide download link
-                    with open(result_path, "rb") as f:
-                        file_bytes = f.read()
-
-                    st.download_button(
-                        label="Download PDF",
-                        data=file_bytes,
-                        file_name=os.path.basename(result_path),
-                        mime="application/pdf",
-                        key="drive_pdf_download"
-                    )
-                else:
-                    st.error("Failed to download the document. Please check the file ID and try again.")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            }
-
-            # Download the file
-            async def download_direct_file():
-                async with DownloadManager(
-                    browser_engine=browser_engine,
-                    use_proxy=use_proxy,
-                    proxy=proxy,
-                    use_stealth=use_stealth
-                ) as dm:
-                    return await dm.download_file(file_info, download_dir)

-

-
-
-
-
-
-
-
-
-
-                st.download_button(
-                    label=f"Download {os.path.basename(file_path)}",
-                    data=file_bytes,
-                    file_name=os.path.basename(file_path),
-                    mime=mime_type,
-                    key="direct_file_download"
-                )
-            else:
-                st.error("Failed to download the file. Please check the URL and try again.")

    # Footer
    st.markdown("---")
-    st.markdown("Created by [Euler314](https://github.com/euler314) |

# Run the app
if __name__ == "__main__":
|
|
1 |
+
# app.py
|
2 |
import streamlit as st
|
3 |
import os
|
4 |
import asyncio
|
|
|
22 |
from reportlab.lib.pagesizes import letter
|
23 |
from reportlab.pdfgen import canvas
|
24 |
|
25 |
+
# Advanced imports - only import what's installed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
import requests
|
27 |
+
from bs4 import BeautifulSoup
|
28 |
+
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
29 |
+
|
30 |
+
# Optional imports with fallbacks
|
31 |
+
try:
|
32 |
+
from PyPDF2 import PdfReader
|
33 |
+
except ImportError:
|
34 |
+
PdfReader = None
|
35 |
+
|
36 |
+
try:
|
37 |
+
import google_auth_oauthlib.flow
|
38 |
+
import googleapiclient.discovery
|
39 |
+
import google.auth.transport.requests
|
40 |
+
import googleapiclient.http
|
41 |
+
GOOGLE_DRIVE_AVAILABLE = True
|
42 |
+
except ImportError:
|
43 |
+
GOOGLE_DRIVE_AVAILABLE = False
|
44 |
|
45 |
# Configure page and logging
|
46 |
st.set_page_config(page_title="Advanced File Downloader", layout="wide")
|
47 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
48 |
logger = logging.getLogger(__name__)
|
49 |
|
50 |
+
# Google OAuth Config
|
|
|
|
|
|
|
51 |
GOOGLE_OAUTH_CONFIG = {
|
52 |
"web": {
|
53 |
"client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
|
|
|
60 |
}
|
61 |
}
|
62 |
|
63 |
+
# User Agent Settings
|
64 |
USER_AGENTS = [
|
65 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
|
66 |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
|
|
|
68 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
|
69 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
|
70 |
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
|
|
|
71 |
]
|
72 |
|
73 |
+
# Network Interception Configuration
|
|
|
|
|
|
|
|
|
74 |
NETWORK_INTERCEPTOR_CONFIG = {
|
75 |
"enabled": False,
|
76 |
"intercept_types": ["xhr", "fetch", "document", "media"],
|
|
|
78 |
"intercept_folder": "./intercepted_data"
|
79 |
}
|
80 |
|
81 |
+
# Utility Functions
|
82 |
def get_random_user_agent():
|
83 |
return random.choice(USER_AGENTS)
|
84 |
|
|
|
114 |
"""Check if URL is a valid file URL based on extension"""
|
115 |
return any(url.lower().endswith(ext) for ext in extensions)
|
116 |
|
117 |
+
# Google Drive Functions
|
118 |
def get_google_auth_url():
|
119 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
120 |
+
return None
|
121 |
+
|
122 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
123 |
flow = google_auth_oauthlib.flow.Flow.from_client_config(
|
124 |
{"web": client_config},
|
|
|
133 |
return authorization_url
|
134 |
|
135 |
def exchange_code_for_credentials(auth_code):
|
136 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
137 |
+
return None, "Google Drive API not available. Install google-auth-oauthlib and google-api-python-client."
|
138 |
+
|
139 |
if not auth_code.strip():
|
140 |
return None, "No code provided."
|
141 |
try:
|
|
|
154 |
return None, f"Error during token exchange: {e}"
|
155 |
|
156 |
def google_drive_upload(file_path, credentials, folder_id=None):
|
157 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
158 |
+
return "Google Drive API not available"
|
159 |
+
|
160 |
try:
|
161 |
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
|
162 |
file_metadata = {'name': os.path.basename(file_path)}
|
|
|
169 |
return f"Error uploading to Drive: {str(e)}"
|
170 |
|
171 |
def create_drive_folder(drive_service, name):
|
172 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
173 |
+
return None
|
174 |
+
|
175 |
folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
|
176 |
folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
|
177 |
return folder.get('id')
|
178 |
|
179 |
+
# Setup Playwright
|
180 |
+
def setup_playwright_dependencies():
|
181 |
+
"""Install required system dependencies for Playwright"""
|
182 |
try:
|
183 |
# Install system dependencies
|
184 |
subprocess.run(['apt-get', 'update', '-y'], check=True)
|
185 |
packages = [
|
186 |
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
|
187 |
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
|
188 |
+
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
|
|
|
189 |
]
|
190 |
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
|
191 |
|
192 |
+
# Install Playwright browser
|
193 |
+
subprocess.run(['python', '-m', 'playwright', 'install', 'chromium'], check=True)
|
|
|
|
|
|
|
|
|
194 |
|
195 |
st.success("Dependencies installed successfully!")
|
196 |
return True
|
197 |
except Exception as e:
|
198 |
st.error(f"Error installing dependencies: {e}")
|
199 |
+
st.info("You may need to manually install dependencies.")
|
200 |
logger.error(f"Setup error: {e}")
|
201 |
traceback.print_exc()
|
202 |
return False
|
203 |
|
204 |
+
# Download Manager Class
|
205 |
+
class DownloadManager:
|
206 |
+
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
self.use_proxy = use_proxy
|
208 |
self.proxy = proxy
|
209 |
+
self.query = query
|
210 |
+
self.num_results = num_results
|
211 |
+
self.use_stealth = use_stealth
|
212 |
+
self.playwright = None
|
213 |
self.browser = None
|
214 |
self.context = None
|
215 |
self.page = None

        # Create intercepted data folder if enabled
        if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
            os.makedirs(NETWORK_INTERCEPTOR_CONFIG["intercept_folder"], exist_ok=True)

    async def __aenter__(self):
        self.playwright = await async_playwright().start()

        # Configure browser launch options
        browser_args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            # ... (unchanged lines omitted)
            '--disable-features=IsolateOrigins,site-per-process',
        ]

        if self.use_stealth:
            browser_args.extend([
                '--disable-blink-features=AutomationControlled',
                '--disable-features=IsolateOrigins'
            ])

        # ... (unchanged lines omitted)
        if self.use_proxy and self.proxy:
            launch_options["proxy"] = {"server": self.proxy}

        # Launch browser
        self.browser = await self.playwright.chromium.launch(**launch_options)

        # Configure context options
        context_options = {
            "viewport": {"width": 1920, "height": 1080},
            "user_agent": get_random_user_agent(),
            # ... (unchanged lines omitted)
            "accept_downloads": True
        }

        # Create context and apply stealth features
        self.context = await self.browser.new_context(**context_options)

        if self.use_stealth:
            await self.context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', { get: () => false });
                Object.defineProperty(navigator, 'plugins', {
                    // ... (unchanged lines omitted)
                });
                window.chrome = { runtime: {} };
            """)

        # Create page and set headers
        self.page = await self.context.new_page()
        await self.page.set_extra_http_headers({
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Referer': 'https://www.google.com/',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1'
        })

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
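
    # Usage sketch (illustrative only, mirroring how main() drives this class below):
    #
    #   async def example():
    #       async with DownloadManager(query="past exam papers", num_results=5) as dm:
    #           urls = await dm.search_bing()
    #           return urls
    #
    #   asyncio.run(example())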

    async def search_bing(self):
        """Search Bing for results"""
        urls = []
        try:
            search_url = f"https://www.bing.com/search?q={self.query}"
            await self.page.goto(search_url, timeout=30000)
            await self.page.wait_for_load_state('networkidle')

            # Extract search results
            links = await self.page.query_selector_all("li.b_algo h2 a")
            for link in links[:self.num_results]:
                href = await link.get_attribute('href')
                if href:
                    urls.append(href)

            return urls
        except Exception as e:
            logger.error(f"Error searching Bing: {e}")
            return []

    async def get_file_size(self, url):
        """Get file size by making a HEAD request"""
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15)
            # ... (unchanged lines omitted)
            return "Unknown Size"

    async def get_pdf_metadata(self, url):
        """Extract metadata from PDF files"""
        if not PdfReader:
            return {}

        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.get(url, headers=headers, timeout=15, stream=True)
            # ... (unchanged lines omitted)
            return {}

    async def extract_real_download_url(self, url):
        """Follow redirects to get the final download URL"""
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
            # ... (unchanged lines omitted)
            return url

    async def get_edu_exam_links(self, url):
        """Specialized method for educational exam websites"""
        try:
            logger.info(f"Fetching exam links from {url}")
            links = set()
            # ... (unchanged lines omitted)
            response = requests.get(url, headers=headers, timeout=30)

            if response.status_code == 200:
                # Parse with BeautifulSoup
                soup = BeautifulSoup(response.text, "html.parser")
                parsed_base = urlparse(url)
                base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

                for a in soup.find_all("a", href=True):
                    href = a["href"]
                    full_url = urljoin(url, href)

                    # Get link text
                    link_text = a.get_text().lower()

                    # Define patterns to look for
                    url_patterns = [
                        "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                        "/test/", "/download/", "/files/", "/assignments/",
                        "paper_", "question_", "exam_", "test_", "past_"
                    ]

                    text_patterns = [
                        "exam", "paper", "test", "question", "past", "download",
                        "assignment", "sample", "study", "material", "notes"
                    ]

                    # Check for matches
                    if any(pattern in full_url.lower() for pattern in url_patterns) or \
                       any(pattern in link_text for pattern in text_patterns) or \
                       any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                        links.add(full_url)
        except Exception as e:
            logger.warning(f"Request-based extraction failed: {e}")

        # Use browser if few links were found or for specific sites
        if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
            logger.info("Using browser for enhanced link extraction")

            # Navigate to page
            await self.page.goto(url, timeout=45000)

            # Get page content
            content = await self.page.content()
            soup = BeautifulSoup(content, "html.parser")
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            # Find links in page
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(url, href)
                link_text = a.get_text().lower()

                # Use the same patterns as above
                url_patterns = [
                    "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                    "/test/", "/download/", "/files/", "/assignments/",
                    "paper_", "question_", "exam_", "test_", "past_"
                ]

                text_patterns = [
                    "exam", "paper", "test", "question", "past", "download",
                    "assignment", "sample", "study", "material", "notes"
                ]

                if any(pattern in full_url.lower() for pattern in url_patterns) or \
                   any(pattern in link_text for pattern in text_patterns) or \
                   any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                    links.add(full_url)

            # Try to click on elements that might reveal more links
            try:
                # Find and click buttons that might show more content
                buttons = await self.page.query_selector_all('input[type="button"], button')
                for button in buttons:
                    button_text = await button.text_content() or ""
                    button_value = await button.get_attribute("value") or ""

                    # Only click on promising buttons
                    if any(keyword in (button_text + button_value).lower() for keyword in
                           ["show", "view", "display", "list", "exam", "paper", "test"]):
                        try:
                            await button.click()
                            await self.page.wait_for_timeout(1000)

                            # Get any new links
                            new_content = await self.page.content()
                            new_soup = BeautifulSoup(new_content, "html.parser")
                            for a in new_soup.find_all("a", href=True):
                                href = a["href"]
                                full_url = urljoin(url, href)

                                # Check if it's a file link
                                if any(full_url.lower().endswith(ext) for ext in
                                       ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                                    links.add(full_url)
                        except Exception as e:
                            logger.warning(f"Error clicking button: {e}")
            except Exception as e:
                logger.warning(f"Error with interactive elements: {e}")

        # Filter links to likely contain exam documents
        filtered_links = []
        for link in links:
            # Common file extensions
            # ... (unchanged lines omitted)
            # Common paths for exam documents
            if any(pattern in link.lower() for pattern in [
                "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
                "/pastpapers/", "/questionpapers/", "/tests/"
            ]):
                filtered_links.append(link)
        # ... (unchanged lines omitted)
        return []

    async def extract_downloadable_files(self, url, custom_ext_list):
        """Extract all downloadable files from a webpage"""
        found_files = []
        try:
            # Special handling for educational exam sites
            # ... (unchanged lines omitted)

                # Get metadata for PDFs
                meta = {}
                if real_url.lower().endswith('.pdf') and PdfReader:
                    try:
                        meta = await self.get_pdf_metadata(real_url)
                    except Exception:
                        # ... (unchanged lines omitted)
                    'filename': filename,
                    'size': size_str,
                    'metadata': meta,
                    'source_url': url  # Keep track of source page
                })

            # If we found exam files with the specialized method, return them
            if found_files:
                return found_files

            # Standard extraction method for regular websites
            await self.page.goto(url, timeout=30000, wait_until='networkidle')

            # Get page content
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Define file extensions to look for
            # ... (unchanged lines omitted)

                # Handle PHP and download links separately
                if '.php' in href.lower() or 'download' in href.lower():
                    full_url = href if href.startswith('http') else urljoin(url, href)
                    real_url = await self.extract_real_download_url(full_url)
                    if real_url and real_url != full_url:
                        filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
                        # ... (unchanged lines omitted)

                # Check for direct file extensions
                if any(href.lower().endswith(ext) for ext in all_exts):
                    file_url = href if href.startswith('http') else urljoin(url, href)
                    size_str = await self.get_file_size(file_url)
                    meta = {}
                    if file_url.lower().endswith('.pdf') and PdfReader:
                        meta = await self.get_pdf_metadata(file_url)
                    found_files.append({
                        'url': file_url,
                        # ... (unchanged lines omitted)
                    break

            if file_id:
                # Determine if it's view-only
                is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))

                filename = f"gdrive_{file_id}"
                # ... (unchanged lines omitted)

            for elem in soup.find_all(elem_tag):
                src = elem.get('src') or elem.get('data')
                if src and any(src.lower().endswith(ext) for ext in all_exts):
                    file_url = src if src.startswith('http') else urljoin(url, src)
                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        # ... (unchanged lines omitted)
            return []

    async def download_file(self, file_info, save_dir, referer=None):
        """Download a file and save it to disk"""
        file_url = file_info['url']
        fname = file_info['filename']
        referer = referer or file_info.get('source_url', 'https://www.google.com')

        # Create unique filename
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        counter = 1
        # ... (unchanged lines omitted)
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
                # For view-only Google Drive files
                is_view_only = file_info.get('metadata', {}).get('view_only', False)
                if is_view_only:
                    result_path = await self.download_viewonly_google_drive(file_info, path)
                    # ... (unchanged lines omitted)
            return None
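
    # The middle of download_file is collapsed in this diff. As a hedged, generic
    # sketch (not the author's exact code), a plain streamed HTTP download with the
    # referer applied would look roughly like:
    #
    #   headers = {'User-Agent': get_random_user_agent(), 'Referer': referer}
    #   with requests.get(file_url, headers=headers, stream=True, timeout=30) as r:
    #       r.raise_for_status()
    #       with open(path, 'wb') as f:
    #           for chunk in r.iter_content(chunk_size=8192):
    #               f.write(chunk)
    #   return path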

    async def download_viewonly_google_drive(self, file_info, save_path):
        """Download view-only Google Drive documents using Playwright"""
        try:
            # Extract file ID
            file_id = file_info.get('metadata', {}).get('file_id')
            # ... (unchanged lines omitted)

            logger.info(f"Downloading view-only Google Drive file: {file_id}")

            # Create a dedicated browser instance for this operation
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=IsolateOrigins,site-per-process',
                        '--disable-blink-features=AutomationControlled'
                    ]
                )

                # Create context
                context = await browser.new_context(
                    viewport={'width': 1600, 'height': 1200},
                    user_agent=get_random_user_agent(),
                    accept_downloads=True
                )

                # Add stealth script
                await context.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', { get: () => false });
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
                    });
                    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
                    window.chrome = { runtime: {} };
                """)

                page = await context.new_page()

                try:
                    # Navigate to the file
                    await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
                    await page.wait_for_load_state('networkidle')
                    await page.wait_for_timeout(5000)  # Wait for rendering

                    # Create temp directory
                    temp_dir = tempfile.mkdtemp()

                    # For PDF files, take screenshots of each page
                    if file_type == 'pdf':
                        # Create directory for screenshots
                        screenshots_dir = os.path.join(temp_dir, "screenshots")
                        os.makedirs(screenshots_dir, exist_ok=True)

                        # Get page count estimation
                        total_pages = await page.evaluate("""
                            () => {
                                // Look for page counters
                                const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
                                    const text = el.textContent || '';
                                    return /\\d+\\s*\\/\\s*\\d+/.test(text);
                                });

                                if (pageCounters.length > 0) {
                                    const text = pageCounters[0].textContent || '';
                                    const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
                                    if (match && match[2]) return parseInt(match[2]);
                                }

                                // Look for page elements
                                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                                if (pages.length > 0) return pages.length;

                                // Default
                                return 20;
                            }
                        """)

                        logger.info(f"PDF has approximately {total_pages} pages")

                        # Capture screenshots page by page
                        screenshots = []
                        for i in range(min(total_pages, 100)):  # Limit to 100 pages
                            try:
                                # Go to specific page
                                if i > 0:
                                    next_button = await page.query_selector('button[aria-label="Next page"]')
                                    if next_button:
                                        await next_button.click()
                                        await page.wait_for_timeout(1000)
                                    else:
                                        break

                                # Take screenshot
                                screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")

                                # Try to find page element
                                page_element = await page.query_selector('.drive-viewer-paginated-page')
                                if page_element:
                                    await page_element.screenshot(path=screenshot_path)
                                else:
                                    await page.screenshot(path=screenshot_path)

                                screenshots.append(screenshot_path)
                            except Exception as e:
                                logger.error(f"Error capturing page {i+1}: {e}")
                                continue

                        # Create PDF from screenshots
                        if screenshots:
                            # Get dimensions from first screenshot
                            first_img = Image.open(screenshots[0])
                            width, height = first_img.size

                            # Create PDF
                            c = canvas.Canvas(save_path, pagesize=(width, height))
                            for screenshot in screenshots:
                                c.drawImage(screenshot, 0, 0, width, height)
                                c.showPage()
                            c.save()

                            # Clean up screenshots
                            for screenshot in screenshots:
                                os.remove(screenshot)

                            # Clean up temp directory
                            shutil.rmtree(temp_dir, ignore_errors=True)

                            return save_path
                        else:
                            logger.error("No screenshots captured")
                    else:
                        # For non-PDF files, just take a screenshot
                        screenshot_path = os.path.join(temp_dir, "file.png")
                        await page.screenshot(path=screenshot_path)

                        # Copy to destination
                        shutil.copy(screenshot_path, save_path)

                        # Clean up
                        os.remove(screenshot_path)
                        shutil.rmtree(temp_dir, ignore_errors=True)

                        return save_path

                finally:
                    await browser.close()

            return None
        except Exception as e:
            # ... (unchanged lines omitted)
            return None
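
    # A possible sanity check (sketch only, not part of the original flow): since
    # PdfReader is already imported at module level, a caller could verify the
    # stitched PDF, e.g. (exact API depends on the installed PyPDF2 version):
    #
    #   reader = PdfReader(result_path)
    #   logger.info(f"Reconstructed PDF has {len(reader.pages)} pages")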

    async def get_sublinks(self, url, limit=10000):
        """Extract all sublinks from a webpage"""
        links = set()
        try:
            logger.info(f"Extracting sublinks from {url}")
            # ... (unchanged lines omitted)
                logger.info(f"Found {len(links)} sublinks with specialized method")
                return list(links)[:limit]

            # Navigate to the page
            await self.page.goto(url, timeout=30000)

            # Get page content
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Extract all links from the page
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            for a in soup.find_all('a', href=True):
                href = a['href']
                if href and not href.startswith('javascript:') and not href.startswith('#'):
                    # ... (unchanged lines omitted)
        except Exception as e:
            logger.error(f"Error extracting sublinks: {e}")
            return list(links)[:limit]

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        """Perform deep search for files on website and its subpages"""
        if not custom_ext_list:
            custom_ext_list = []

        # Set up progress indicators
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
        # ... (unchanged lines omitted)
        total_links = len(sublinks)
        progress_text.text(f"Found {total_links} sublinks to process")

        # Always include main page files
        all_files = main_files.copy()

        # Process each sublink if there are any
        if sublinks:
            for i, sublink in enumerate(sublinks, 1):
                progress = i / max(total_links, 1)  # Avoid division by zero
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)

                try:
                    # Extract files from sublink
                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                    all_files.extend(sub_files)
                    file_count_text.text(f"Found {len(all_files)} total files")
                except Exception as e:
                    logger.warning(f"Error processing sublink {sublink}: {e}")

        # Deduplicate files
        seen_urls = set()
        # ... (unchanged lines omitted)
        progress_text.empty()
        progress_bar.empty()

# Main App
def main():
    st.title("Advanced File Downloader")
    # ... (unchanged lines omitted)
        st.session_state.initialized = True
        st.session_state.discovered_files = []
        st.session_state.current_url = None
        st.session_state.selected_files = []
        st.session_state.do_deep_search = False
        st.session_state.deep_search_url = None
        st.session_state.search_results = []
        st.session_state.download_urls = {}  # For direct download links

    # Install Playwright if needed
    if "playwright_installed" not in st.session_state:
        with st.spinner("Setting up Playwright. This may take a minute..."):
            st.session_state.playwright_installed = setup_playwright_dependencies()

    # Sidebar settings
    with st.sidebar:
        mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Google Drive"], key="mode_select")

        with st.expander("Advanced Options", expanded=True):
            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
                                              help="Enter extensions like .csv, .txt")
            max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
            sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
            use_proxy = st.checkbox("Use Proxy", key="use_proxy")
            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
            use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
                                      help="Makes browser harder to detect as automated")

        if GOOGLE_DRIVE_AVAILABLE:
            with st.expander("Google Drive Integration", expanded=False):
                if st.button("Start Google Sign-In", key="google_signin_btn"):
                    auth_url = get_google_auth_url()
                    st.markdown(f"[Click here to authorize]({auth_url})")
                auth_code = st.text_input("Enter authorization code", key="auth_code_input")
                if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
                    creds, msg = exchange_code_for_credentials(auth_code)
                    st.session_state.google_creds = creds
                    st.write(msg)

    # Main content area
    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")

        if st.button("Deep Search", key="deep_search_btn"):
            if url:
                # Process custom extensions
                custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]

                with st.spinner("Searching for files..."):
                    async def run_deep_search():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
                        ) as dm:
                            files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
                            return files

                    files = asyncio.run(run_deep_search())

                    if files:
                        st.session_state.discovered_files = files
                        st.session_state.current_url = url
                        st.success(f"Found {len(files)} files!")
                    else:
                        st.warning("No files found.")

    # Display and process discovered files
    if st.session_state.discovered_files:
        # ... (unchanged lines omitted)
            file_info = f"{filename} ({size})"

            file_options.append((i, file_info))

        # File selection multiselect
        selected_indices = st.multiselect(
            # ... (unchanged lines omitted)
        )

        st.session_state.selected_files = selected_indices

        # Display individual download buttons
        if files:
            st.subheader("Available Files")
            for i, file in enumerate(files):
                # ... (unchanged lines omitted)
                    st.write(f"Source: {file.get('source_url', 'Unknown')}")
                    st.write(f"URL: {file['url']}")

                    # Download button for this file
                    if st.button(f"Download", key=f"download_single_{i}"):
                        with st.spinner(f"Downloading {file['filename']}..."):
                            # Create downloads directory
                            download_dir = "./downloads"
                            # ... (unchanged lines omitted)

                            # Download the file
                            async def download_single():
                                async with DownloadManager(
                                    use_proxy=use_proxy,
                                    proxy=proxy,
                                    use_stealth=use_stealth
                                ) as dm:
                                    # ... (unchanged lines omitted)

        if selected_indices:
            st.subheader("Batch Download Options")

            col1, col2, col3 = st.columns(3)
            with col1:
                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
            with col2:
                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
            with col3:
                delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")

            if st.button("Download Selected Files", key="batch_download_btn"):
                with st.spinner(f"Downloading {len(selected_indices)} files..."):
                    # ... (unchanged lines omitted)

                    async def download_batch():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
                        ) as dm:
                            # ... (unchanged lines omitted)
                        key="download_zip_btn"
                    )

                    # Delete original files if requested
                    if delete_after:
                        for path in downloaded_paths:
                            # ... (unchanged lines omitted)

    elif mode == "Web Search":
        # ... (unchanged lines omitted)

        if st.button("Search", key="web_search_btn"):
            if query:
                with st.spinner("Searching..."):
                    async def run_search():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            query=query,
                            num_results=num_results,
                            use_stealth=use_stealth
                        ) as dm:
                            urls = await dm.search_bing()
                            return urls

                    urls = asyncio.run(run_search())
                    # ... (unchanged lines omitted)

                with st.spinner("Searching for files..."):
                    async def deep_search_result():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
                        ) as dm:
                            # ... (unchanged lines omitted)
                    else:
                        st.warning("No files found on this page.")

    elif mode == "Google Drive" and GOOGLE_DRIVE_AVAILABLE:
        st.header("Google Drive Download")

        # View-only Google Drive download
        st.write("Download protected/view-only Google Drive documents")

        file_id = st.text_input(
            "Google Drive File ID",
            placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
            key="drive_file_id"
        )

        if st.button("Download Document", key="drive_download_btn") and file_id:
            with st.spinner("Downloading view-only document... (this may take a minute)"):
                # Create download directory
                download_dir = "./downloads"
                os.makedirs(download_dir, exist_ok=True)

                # Set output path
                output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")

                # Download the file
                async def download_drive_file():
                    async with DownloadManager(
                        use_proxy=use_proxy,
                        proxy=proxy,
                        use_stealth=use_stealth
                    ) as dm:
                        file_info = {
                            'url': f"https://drive.google.com/file/d/{file_id}/view",
                            'filename': f"gdrive_{file_id}.pdf",
                            'metadata': {'file_id': file_id, 'view_only': True}
                        }
                        return await dm.download_viewonly_google_drive(file_info, output_path)

                result_path = asyncio.run(download_drive_file())

                if result_path:
                    st.success("Document downloaded successfully!")

                    # Provide download link
                    with open(result_path, "rb") as f:
                        file_bytes = f.read()

                    st.download_button(
                        label="Download PDF",
                        data=file_bytes,
                        file_name=os.path.basename(result_path),
                        mime="application/pdf",
                        key="drive_pdf_download"
                    )
                else:
                    st.error("Failed to download the document. Please check the file ID and try again.")

    # Footer
    st.markdown("---")
    st.markdown("Created by [Euler314](https://github.com/euler314) | Advanced File Downloader")

# Run the app
if __name__ == "__main__":