Update app.py
app.py
CHANGED
@@ -1,4 +1,3 @@
-# app.py
 import streamlit as st
 import os
 import asyncio
@@ -22,32 +21,31 @@ from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

-# Advanced imports
-import requests
-from bs4 import BeautifulSoup
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
-
-
-
-
-
-
-
-
-
-
-
-
-
-except ImportError:
-    GOOGLE_DRIVE_AVAILABLE = False

 # Configure page and logging
 st.set_page_config(page_title="Advanced File Downloader", layout="wide")
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-#
 GOOGLE_OAUTH_CONFIG = {
     "web": {
         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -60,7 +58,7 @@ GOOGLE_OAUTH_CONFIG = {
     }
 }

-# User Agent Settings
 USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
@@ -68,9 +66,14 @@ USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
     'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
 ]

-#
 NETWORK_INTERCEPTOR_CONFIG = {
     "enabled": False,
     "intercept_types": ["xhr", "fetch", "document", "media"],
@@ -78,7 +81,7 @@ NETWORK_INTERCEPTOR_CONFIG = {
     "intercept_folder": "./intercepted_data"
 }

-# Utility Functions
 def get_random_user_agent():
     return random.choice(USER_AGENTS)

@@ -114,11 +117,8 @@ def is_valid_file_url(url, extensions):
     """Check if URL is a valid file URL based on extension"""
     return any(url.lower().endswith(ext) for ext in extensions)

-# Google Drive Functions
 def get_google_auth_url():
-    if not GOOGLE_DRIVE_AVAILABLE:
-        return None
-
     client_config = GOOGLE_OAUTH_CONFIG["web"]
     flow = google_auth_oauthlib.flow.Flow.from_client_config(
         {"web": client_config},
@@ -133,9 +133,6 @@ def get_google_auth_url():
     return authorization_url

 def exchange_code_for_credentials(auth_code):
-    if not GOOGLE_DRIVE_AVAILABLE:
-        return None, "Google Drive API not available. Install google-auth-oauthlib and google-api-python-client."
-
     if not auth_code.strip():
         return None, "No code provided."
     try:
@@ -154,9 +151,6 @@ def exchange_code_for_credentials(auth_code):
         return None, f"Error during token exchange: {e}"

 def google_drive_upload(file_path, credentials, folder_id=None):
-    if not GOOGLE_DRIVE_AVAILABLE:
-        return "Google Drive API not available"
-
     try:
         drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
         file_metadata = {'name': os.path.basename(file_path)}
@@ -169,59 +163,164 @@ def google_drive_upload(file_path, credentials, folder_id=None):
         return f"Error uploading to Drive: {str(e)}"

 def create_drive_folder(drive_service, name):
-    if not GOOGLE_DRIVE_AVAILABLE:
-        return None
-
     folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
     folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
     return folder.get('id')

-# Setup
-def
-    """Install required system dependencies
     try:
         # Install system dependencies
         subprocess.run(['apt-get', 'update', '-y'], check=True)
         packages = [
             'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
             'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
         ]
         subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)

-        # Install
-        subprocess.run(['

         st.success("Dependencies installed successfully!")
         return True
     except Exception as e:
         st.error(f"Error installing dependencies: {e}")
-        st.info("You may need to manually install dependencies.")
         logger.error(f"Setup error: {e}")
         traceback.print_exc()
         return False

-
-
-
|
|
|
207 |
self.use_proxy = use_proxy
|
208 |
self.proxy = proxy
|
209 |
-
self.
|
210 |
-
self.num_results = num_results
|
211 |
-
self.use_stealth = use_stealth
|
212 |
-
self.playwright = None
|
213 |
self.browser = None
|
214 |
self.context = None
|
215 |
self.page = None
|
|
|
|
|
216 |
|
217 |
-
# Create intercepted data folder if enabled
|
218 |
-
if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
|
219 |
-
os.makedirs(NETWORK_INTERCEPTOR_CONFIG["intercept_folder"], exist_ok=True)
|
220 |
-
|
221 |
-
async def __aenter__(self):
|
222 |
self.playwright = await async_playwright().start()
|
223 |
-
|
224 |
-
# Configure browser launch options
|
225 |
browser_args = [
|
226 |
'--no-sandbox',
|
227 |
'--disable-setuid-sandbox',
|
@@ -230,7 +329,7 @@ class DownloadManager:
|
|
230 |
'--disable-features=IsolateOrigins,site-per-process',
|
231 |
]
|
232 |
|
233 |
-
if self.
|
234 |
browser_args.extend([
|
235 |
'--disable-blink-features=AutomationControlled',
|
236 |
'--disable-features=IsolateOrigins'
|
@@ -244,10 +343,8 @@ class DownloadManager:
|
|
244 |
if self.use_proxy and self.proxy:
|
245 |
launch_options["proxy"] = {"server": self.proxy}
|
246 |
|
247 |
-
# Launch browser
|
248 |
self.browser = await self.playwright.chromium.launch(**launch_options)
|
249 |
|
250 |
-
# Configure context options
|
251 |
context_options = {
|
252 |
"viewport": {"width": 1920, "height": 1080},
|
253 |
"user_agent": get_random_user_agent(),
|
@@ -256,10 +353,10 @@ class DownloadManager:
|
|
256 |
"accept_downloads": True
|
257 |
}
|
258 |
|
259 |
-
# Create context and apply stealth features
|
260 |
self.context = await self.browser.new_context(**context_options)
|
261 |
|
262 |
-
|
|
|
263 |
await self.context.add_init_script("""
|
264 |
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
265 |
Object.defineProperty(navigator, 'plugins', {
|
@@ -269,50 +366,221 @@ class DownloadManager:
|
|
269 |
window.chrome = { runtime: {} };
|
270 |
""")
|
271 |
|
272 |
-
# Create page and set headers
|
273 |
self.page = await self.context.new_page()
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
'
|
282 |
-
'
|
283 |
-
'
|
284 |
-
|
|
|
|
|
285 |
|
286 |
return self
|
287 |
|
288 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
289 |
-
|
290 |
-
await self.browser.close()
|
291 |
-
if self.playwright:
|
292 |
-
await self.playwright.stop()
|
293 |
|
294 |
-
async def
|
295 |
-
"""Search
|
296 |
urls = []
|
297 |
try:
|
298 |
-
|
299 |
-
|
300 |
-
|
|
|
|
|
|
|
|
|
|
|
301 |
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
|
|
|
308 |
|
309 |
return urls
|
310 |
except Exception as e:
|
311 |
-
logger.error(f"Error searching
|
312 |
return []
|
313 |
|
314 |
async def get_file_size(self, url):
|
315 |
-
"""Get file size by making a HEAD request"""
|
316 |
try:
|
317 |
headers = {'User-Agent': get_random_user_agent()}
|
318 |
response = requests.head(url, headers=headers, timeout=15)
|
@@ -325,10 +593,6 @@ class DownloadManager:
|
|
325 |
return "Unknown Size"
|
326 |
|
327 |
async def get_pdf_metadata(self, url):
|
328 |
-
"""Extract metadata from PDF files"""
|
329 |
-
if not PdfReader:
|
330 |
-
return {}
|
331 |
-
|
332 |
try:
|
333 |
headers = {'User-Agent': get_random_user_agent()}
|
334 |
response = requests.get(url, headers=headers, timeout=15, stream=True)
|
@@ -346,7 +610,6 @@ class DownloadManager:
|
|
346 |
return {}
|
347 |
|
348 |
async def extract_real_download_url(self, url):
|
349 |
-
"""Follow redirects to get the final download URL"""
|
350 |
try:
|
351 |
headers = {'User-Agent': get_random_user_agent()}
|
352 |
response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
|
@@ -356,7 +619,7 @@ class DownloadManager:
|
|
356 |
return url
|
357 |
|
358 |
async def get_edu_exam_links(self, url):
|
359 |
-
"""Specialized method for educational exam websites"""
|
360 |
try:
|
361 |
logger.info(f"Fetching exam links from {url}")
|
362 |
links = set()
|
@@ -367,7 +630,7 @@ class DownloadManager:
|
|
367 |
response = requests.get(url, headers=headers, timeout=30)
|
368 |
|
369 |
if response.status_code == 200:
|
370 |
-
# Parse with BeautifulSoup
|
371 |
soup = BeautifulSoup(response.text, "html.parser")
|
372 |
parsed_base = urlparse(url)
|
373 |
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
@@ -377,22 +640,26 @@ class DownloadManager:
|
|
377 |
href = a["href"]
|
378 |
full_url = urljoin(url, href)
|
379 |
|
380 |
-
#
|
381 |
link_text = a.get_text().lower()
|
382 |
|
383 |
-
#
|
384 |
url_patterns = [
|
385 |
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
386 |
"/test/", "/download/", "/files/", "/assignments/",
|
387 |
-
"paper_", "question_", "exam_", "test_", "past_"
|
|
|
|
|
388 |
]
|
389 |
|
390 |
text_patterns = [
|
391 |
"exam", "paper", "test", "question", "past", "download",
|
392 |
-
"assignment", "sample", "study", "material", "notes"
|
|
|
|
|
393 |
]
|
394 |
|
395 |
-
# Check
|
396 |
if any(pattern in full_url.lower() for pattern in url_patterns) or \
|
397 |
any(pattern in link_text for pattern in text_patterns) or \
|
398 |
any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
@@ -400,74 +667,48 @@ class DownloadManager:
|
|
400 |
except Exception as e:
|
401 |
logger.warning(f"Request-based extraction failed: {e}")
|
402 |
|
403 |
-
# Use browser if
|
404 |
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
|
405 |
logger.info("Using browser for enhanced link extraction")
|
406 |
|
407 |
-
# Navigate to page
|
408 |
-
await self.
|
409 |
|
410 |
-
# Get page content
|
411 |
-
content = await self.
|
412 |
soup = BeautifulSoup(content, "html.parser")
|
413 |
parsed_base = urlparse(url)
|
414 |
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
415 |
|
416 |
-
#
|
417 |
for a in soup.find_all("a", href=True):
|
418 |
href = a["href"]
|
419 |
full_url = urljoin(url, href)
|
420 |
link_text = a.get_text().lower()
|
421 |
|
422 |
-
#
|
423 |
url_patterns = [
|
424 |
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
425 |
"/test/", "/download/", "/files/", "/assignments/",
|
426 |
-
"paper_", "question_", "exam_", "test_", "past_"
|
|
|
|
|
427 |
]
|
428 |
|
429 |
text_patterns = [
|
430 |
"exam", "paper", "test", "question", "past", "download",
|
431 |
-
"assignment", "sample", "study", "material", "notes"
|
|
|
|
|
432 |
]
|
433 |
|
|
|
434 |
if any(pattern in full_url.lower() for pattern in url_patterns) or \
|
435 |
any(pattern in link_text for pattern in text_patterns) or \
|
436 |
any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
437 |
links.add(full_url)
|
438 |
-
|
439 |
-
# Try to click on elements that might reveal more links
|
440 |
-
try:
|
441 |
-
# Find and click buttons that might show more content
|
442 |
-
buttons = await self.page.query_selector_all('input[type="button"], button')
|
443 |
-
for button in buttons:
|
444 |
-
button_text = await button.text_content() or ""
|
445 |
-
button_value = await button.get_attribute("value") or ""
|
446 |
-
|
447 |
-
# Only click on promising buttons
|
448 |
-
if any(keyword in (button_text + button_value).lower() for keyword in
|
449 |
-
["show", "view", "display", "list", "exam", "paper", "test"]):
|
450 |
-
try:
|
451 |
-
await button.click()
|
452 |
-
await self.page.wait_for_timeout(1000)
|
453 |
-
|
454 |
-
# Get any new links
|
455 |
-
new_content = await self.page.content()
|
456 |
-
new_soup = BeautifulSoup(new_content, "html.parser")
|
457 |
-
for a in new_soup.find_all("a", href=True):
|
458 |
-
href = a["href"]
|
459 |
-
full_url = urljoin(url, href)
|
460 |
-
|
461 |
-
# Check if it's a file link
|
462 |
-
if any(full_url.lower().endswith(ext) for ext in
|
463 |
-
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
464 |
-
links.add(full_url)
|
465 |
-
except Exception as e:
|
466 |
-
logger.warning(f"Error clicking button: {e}")
|
467 |
-
except Exception as e:
|
468 |
-
logger.warning(f"Error with interactive elements: {e}")
|
469 |
|
470 |
-
# Filter
|
471 |
filtered_links = []
|
472 |
for link in links:
|
473 |
# Common file extensions
|
@@ -478,7 +719,8 @@ class DownloadManager:
|
|
478 |
# Common paths for exam documents
|
479 |
if any(pattern in link.lower() for pattern in [
|
480 |
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
|
481 |
-
"/pastpapers/", "/questionpapers/", "/tests/"
|
|
|
482 |
]):
|
483 |
filtered_links.append(link)
|
484 |
|
@@ -490,7 +732,6 @@ class DownloadManager:
|
|
490 |
return []
|
491 |
|
492 |
async def extract_downloadable_files(self, url, custom_ext_list):
|
493 |
-
"""Extract all downloadable files from a webpage"""
|
494 |
found_files = []
|
495 |
try:
|
496 |
# Special handling for educational exam sites
|
@@ -524,7 +765,7 @@ class DownloadManager:
|
|
524 |
|
525 |
# Get metadata for PDFs
|
526 |
meta = {}
|
527 |
-
if real_url.lower().endswith('.pdf')
|
528 |
try:
|
529 |
meta = await self.get_pdf_metadata(real_url)
|
530 |
except Exception:
|
@@ -535,18 +776,18 @@ class DownloadManager:
|
|
535 |
'filename': filename,
|
536 |
'size': size_str,
|
537 |
'metadata': meta,
|
538 |
-
'source_url': url #
|
539 |
})
|
540 |
|
541 |
# If we found exam files with the specialized method, return them
|
542 |
if found_files:
|
543 |
return found_files
|
544 |
|
545 |
-
# Standard extraction method for
|
546 |
-
await self.
|
547 |
|
548 |
# Get page content
|
549 |
-
content = await self.
|
550 |
soup = BeautifulSoup(content, 'html.parser')
|
551 |
|
552 |
# Define file extensions to look for
|
@@ -566,7 +807,7 @@ class DownloadManager:
|
|
566 |
|
567 |
# Handle PHP and download links separately
|
568 |
if '.php' in href.lower() or 'download' in href.lower():
|
569 |
-
full_url = href if href.startswith('http') else urljoin(
|
570 |
real_url = await self.extract_real_download_url(full_url)
|
571 |
if real_url and real_url != full_url:
|
572 |
filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
|
@@ -581,10 +822,10 @@ class DownloadManager:
|
|
581 |
|
582 |
# Check for direct file extensions
|
583 |
if any(href.lower().endswith(ext) for ext in all_exts):
|
584 |
-
file_url = href if href.startswith('http') else urljoin(
|
585 |
size_str = await self.get_file_size(file_url)
|
586 |
meta = {}
|
587 |
-
if file_url.lower().endswith('.pdf')
|
588 |
meta = await self.get_pdf_metadata(file_url)
|
589 |
found_files.append({
|
590 |
'url': file_url,
|
@@ -604,7 +845,7 @@ class DownloadManager:
|
|
604 |
break
|
605 |
|
606 |
if file_id:
|
607 |
-
# Determine if it's view-only
|
608 |
is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))
|
609 |
|
610 |
filename = f"gdrive_{file_id}"
|
@@ -628,7 +869,7 @@ class DownloadManager:
|
|
628 |
for elem in soup.find_all(elem_tag):
|
629 |
src = elem.get('src') or elem.get('data')
|
630 |
if src and any(src.lower().endswith(ext) for ext in all_exts):
|
631 |
-
file_url = src if src.startswith('http') else urljoin(
|
632 |
found_files.append({
|
633 |
'url': file_url,
|
634 |
'filename': os.path.basename(file_url.split('?')[0]),
|
@@ -652,12 +893,12 @@ class DownloadManager:
|
|
652 |
return []
|
653 |
|
654 |
async def download_file(self, file_info, save_dir, referer=None):
|
655 |
-
"""Download a file and
|
656 |
file_url = file_info['url']
|
657 |
fname = file_info['filename']
|
658 |
referer = referer or file_info.get('source_url', 'https://www.google.com')
|
659 |
|
660 |
-
# Create unique filename
|
661 |
path = os.path.join(save_dir, fname)
|
662 |
base, ext = os.path.splitext(fname)
|
663 |
counter = 1
|
@@ -670,7 +911,7 @@ class DownloadManager:
|
|
670 |
try:
|
671 |
# Special handling for Google Drive files
|
672 |
if "drive.google.com" in file_url or "docs.google.com" in file_url:
|
673 |
-
# For view-only Google Drive files
|
674 |
is_view_only = file_info.get('metadata', {}).get('view_only', False)
|
675 |
if is_view_only:
|
676 |
result_path = await self.download_viewonly_google_drive(file_info, path)
|
@@ -726,7 +967,7 @@ class DownloadManager:
|
|
726 |
return None
|
727 |
|
728 |
async def download_viewonly_google_drive(self, file_info, save_path):
|
729 |
-
"""Download view-only Google Drive documents
|
730 |
try:
|
731 |
# Extract file ID
|
732 |
file_id = file_info.get('metadata', {}).get('file_id')
|
@@ -752,147 +993,173 @@ class DownloadManager:
|
|
752 |
|
753 |
logger.info(f"Downloading view-only Google Drive file: {file_id}")
|
754 |
|
755 |
-
# Create a dedicated browser
|
756 |
-
|
757 |
-
|
758 |
-
headless=True,
|
759 |
-
args=[
|
760 |
-
'--no-sandbox',
|
761 |
-
'--disable-setuid-sandbox',
|
762 |
-
'--disable-dev-shm-usage',
|
763 |
-
'--disable-web-security',
|
764 |
-
'--disable-features=IsolateOrigins,site-per-process',
|
765 |
-
'--disable-blink-features=AutomationControlled'
|
766 |
-
]
|
767 |
-
)
|
768 |
-
|
769 |
-
# Create context
|
770 |
-
context = await browser.new_context(
|
771 |
-
viewport={'width': 1600, 'height': 1200},
|
772 |
-
user_agent=get_random_user_agent(),
|
773 |
-
accept_downloads=True
|
774 |
-
)
|
775 |
|
776 |
-
|
777 |
-
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
|
783 |
-
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
await
|
792 |
-
|
|
|
|
793 |
|
794 |
-
|
795 |
-
temp_dir = tempfile.mkdtemp()
|
796 |
|
797 |
-
|
798 |
-
|
799 |
-
|
800 |
-
|
801 |
-
os.makedirs(screenshots_dir, exist_ok=True)
|
802 |
|
803 |
-
#
|
804 |
-
|
805 |
-
() => {
|
806 |
-
// Look for page counters
|
807 |
-
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
|
808 |
-
const text = el.textContent || '';
|
809 |
-
return /\\d+\\s*\\/\\s*\\d+/.test(text);
|
810 |
-
});
|
811 |
-
|
812 |
-
if (pageCounters.length > 0) {
|
813 |
-
const text = pageCounters[0].textContent || '';
|
814 |
-
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
|
815 |
-
if (match && match[2]) return parseInt(match[2]);
|
816 |
-
}
|
817 |
-
|
818 |
-
// Look for page elements
|
819 |
-
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
|
820 |
-
if (pages.length > 0) return pages.length;
|
821 |
-
|
822 |
-
// Default
|
823 |
-
return 20;
|
824 |
-
}
|
825 |
-
""")
|
826 |
|
827 |
-
|
|
|
828 |
|
829 |
-
#
|
830 |
-
|
831 |
-
|
832 |
-
|
833 |
-
|
834 |
-
|
835 |
-
|
836 |
-
|
837 |
-
|
838 |
-
|
|
|
|
|
|
839 |
else:
|
840 |
-
|
|
|
|
841 |
|
842 |
-
#
|
843 |
-
|
|
|
|
|
|
|
|
|
844 |
|
845 |
-
#
|
846 |
-
|
847 |
-
|
848 |
-
await page_element.screenshot(path=screenshot_path)
|
849 |
-
else:
|
850 |
-
await page.screenshot(path=screenshot_path)
|
851 |
|
852 |
-
|
853 |
-
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
#
|
860 |
-
|
861 |
-
|
862 |
-
|
863 |
-
# Create PDF
|
864 |
-
c = canvas.Canvas(save_path, pagesize=(width, height))
|
865 |
-
for screenshot in screenshots:
|
866 |
-
c.drawImage(screenshot, 0, 0, width, height)
|
867 |
-
c.showPage()
|
868 |
-
c.save()
|
869 |
|
870 |
-
#
|
871 |
-
|
872 |
-
os.remove(screenshot)
|
873 |
|
874 |
-
# Clean up
|
|
|
875 |
shutil.rmtree(temp_dir, ignore_errors=True)
|
876 |
|
877 |
return save_path
|
878 |
-
|
879 |
-
|
880 |
-
|
881 |
-
|
882 |
-
|
883 |
-
await page.screenshot(path=screenshot_path)
|
884 |
-
|
885 |
-
# Copy to destination
|
886 |
-
shutil.copy(screenshot_path, save_path)
|
887 |
-
|
888 |
-
# Clean up
|
889 |
-
os.remove(screenshot_path)
|
890 |
-
shutil.rmtree(temp_dir, ignore_errors=True)
|
891 |
-
|
892 |
-
return save_path
|
893 |
-
|
894 |
-
finally:
|
895 |
-
await browser.close()
|
896 |
|
897 |
return None
|
898 |
except Exception as e:
|
@@ -900,7 +1167,7 @@ class DownloadManager:
|
|
900 |
return None
|
901 |
|
902 |
async def get_sublinks(self, url, limit=10000):
|
903 |
-
"""Extract all sublinks from a
|
904 |
links = set()
|
905 |
try:
|
906 |
logger.info(f"Extracting sublinks from {url}")
|
@@ -916,17 +1183,18 @@ class DownloadManager:
|
|
916 |
logger.info(f"Found {len(links)} sublinks with specialized method")
|
917 |
return list(links)[:limit]
|
918 |
|
919 |
-
#
|
920 |
-
await self.
|
921 |
|
922 |
# Get page content
|
923 |
-
content = await self.
|
924 |
soup = BeautifulSoup(content, 'html.parser')
|
925 |
|
926 |
-
#
|
927 |
parsed_base = urlparse(url)
|
928 |
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
929 |
|
|
|
930 |
for a in soup.find_all('a', href=True):
|
931 |
href = a['href']
|
932 |
if href and not href.startswith('javascript:') and not href.startswith('#'):
|
@@ -952,12 +1220,85 @@ class DownloadManager:
|
|
952 |
logger.error(f"Error extracting sublinks: {e}")
|
953 |
return list(links)[:limit]
|
954 |
|
|
|
|
955 |
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
|
956 |
-
"""Perform deep search for files on website and its subpages"""
|
957 |
if not custom_ext_list:
|
958 |
custom_ext_list = []
|
959 |
|
960 |
-
#
|
961 |
progress_text = st.empty()
|
962 |
progress_bar = st.progress(0)
|
963 |
file_count_text = st.empty()
|
@@ -976,23 +1317,22 @@ class DownloadManager:
|
|
976 |
total_links = len(sublinks)
|
977 |
progress_text.text(f"Found {total_links} sublinks to process")
|
978 |
|
979 |
-
#
|
980 |
all_files = main_files.copy()
|
981 |
|
982 |
-
# Process each sublink
|
983 |
-
|
984 |
-
|
985 |
-
|
986 |
-
|
987 |
-
|
988 |
-
|
989 |
-
|
990 |
-
|
991 |
-
|
992 |
-
|
993 |
-
|
994 |
-
|
995 |
-
logger.warning(f"Error processing sublink {sublink}: {e}")
|
996 |
|
997 |
# Deduplicate files
|
998 |
seen_urls = set()
|
@@ -1020,7 +1360,7 @@ class DownloadManager:
|
|
1020 |
progress_text.empty()
|
1021 |
progress_bar.empty()
|
1022 |
|
1023 |
-
# Main App
|
1024 |
def main():
|
1025 |
st.title("Advanced File Downloader")
|
1026 |
|
@@ -1029,70 +1369,91 @@ def main():
|
|
1029 |
st.session_state.initialized = True
|
1030 |
st.session_state.discovered_files = []
|
1031 |
st.session_state.current_url = None
|
|
|
1032 |
st.session_state.selected_files = []
|
1033 |
st.session_state.do_deep_search = False
|
1034 |
st.session_state.deep_search_url = None
|
1035 |
st.session_state.search_results = []
|
1036 |
st.session_state.download_urls = {} # For direct download links
|
1037 |
|
1038 |
-
# Install
|
1039 |
-
if "
|
1040 |
-
with st.spinner("Setting up
|
1041 |
-
st.session_state.
|
|
|
1042 |
|
1043 |
-
# Sidebar
|
1044 |
with st.sidebar:
|
1045 |
-
mode = st.radio("Select Mode", ["Manual URL", "Web Search", "
|
1046 |
|
1047 |
-
with st.expander("
|
|
|
|
|
1048 |
custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
|
1049 |
help="Enter extensions like .csv, .txt")
|
1050 |
max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
|
1051 |
sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
|
|
|
|
|
1052 |
use_proxy = st.checkbox("Use Proxy", key="use_proxy")
|
1053 |
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
|
1054 |
use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
|
1055 |
help="Makes browser harder to detect as automated")
|
|
|
|
1056 |
|
1057 |
-
|
1058 |
-
|
1059 |
-
|
1060 |
-
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
-
|
1066 |
-
st.write(msg)
|
1067 |
|
1068 |
# Main content area
|
1069 |
if mode == "Manual URL":
|
1070 |
st.header("Manual URL Mode")
|
1071 |
url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")
|
1072 |
|
1073 |
-
|
1074 |
-
|
1075 |
-
|
1076 |
-
|
1077 |
-
|
1078 |
-
|
1079 |
-
async def run_deep_search():
|
1080 |
-
async with DownloadManager(
|
1081 |
-
use_proxy=use_proxy,
|
1082 |
-
proxy=proxy,
|
1083 |
-
use_stealth=use_stealth
|
1084 |
-
) as dm:
|
1085 |
-
files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
|
1086 |
-
return files
|
1087 |
-
|
1088 |
-
files = asyncio.run(run_deep_search())
|
1089 |
|
1090 |
-
|
1091 |
-
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
|
|
|
1096 |
|
1097 |
# Display and process discovered files
|
1098 |
if st.session_state.discovered_files:
|
@@ -1121,6 +1482,12 @@ def main():
|
|
1121 |
file_info = f"{filename} ({size})"
|
1122 |
|
1123 |
file_options.append((i, file_info))
|
|
|
|
|
|
|
|
|
|
|
|
|
1124 |
|
1125 |
# File selection multiselect
|
1126 |
selected_indices = st.multiselect(
|
@@ -1133,7 +1500,7 @@ def main():
|
|
1133 |
|
1134 |
st.session_state.selected_files = selected_indices
|
1135 |
|
1136 |
-
# Display individual download
|
1137 |
if files:
|
1138 |
st.subheader("Available Files")
|
1139 |
for i, file in enumerate(files):
|
@@ -1141,8 +1508,8 @@ def main():
|
|
1141 |
st.write(f"Source: {file.get('source_url', 'Unknown')}")
|
1142 |
st.write(f"URL: {file['url']}")
|
1143 |
|
1144 |
-
# Download button for this file
|
1145 |
-
if st.button(f"Download", key=f"download_single_{i}"):
|
1146 |
with st.spinner(f"Downloading {file['filename']}..."):
|
1147 |
# Create downloads directory
|
1148 |
download_dir = "./downloads"
|
@@ -1151,6 +1518,7 @@ def main():
|
|
1151 |
# Download the file
|
1152 |
async def download_single():
|
1153 |
async with DownloadManager(
|
|
|
1154 |
use_proxy=use_proxy,
|
1155 |
proxy=proxy,
|
1156 |
use_stealth=use_stealth
|
@@ -1183,13 +1551,15 @@ def main():
|
|
1183 |
if selected_indices:
|
1184 |
st.subheader("Batch Download Options")
|
1185 |
|
1186 |
-
col1, col2, col3 = st.columns(
|
1187 |
with col1:
|
1188 |
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
|
1189 |
with col2:
|
1190 |
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
|
1191 |
with col3:
|
1192 |
delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
|
|
|
|
|
1193 |
|
1194 |
if st.button("Download Selected Files", key="batch_download_btn"):
|
1195 |
with st.spinner(f"Downloading {len(selected_indices)} files..."):
|
@@ -1203,6 +1573,7 @@ def main():
|
|
1203 |
|
1204 |
async def download_batch():
|
1205 |
async with DownloadManager(
|
|
|
1206 |
use_proxy=use_proxy,
|
1207 |
proxy=proxy,
|
1208 |
use_stealth=use_stealth
|
@@ -1243,6 +1614,24 @@ def main():
|
|
1243 |
key="download_zip_btn"
|
1244 |
)
|
1245 |
|
|
|
|
1246 |
# Delete original files if requested
|
1247 |
if delete_after:
|
1248 |
for path in downloaded_paths:
|
@@ -1261,16 +1650,17 @@ def main():
|
|
1261 |
|
1262 |
if st.button("Search", key="web_search_btn"):
|
1263 |
if query:
|
1264 |
-
with st.spinner("Searching..."):
|
1265 |
async def run_search():
|
1266 |
async with DownloadManager(
|
|
|
1267 |
use_proxy=use_proxy,
|
1268 |
proxy=proxy,
|
1269 |
query=query,
|
1270 |
num_results=num_results,
|
1271 |
use_stealth=use_stealth
|
1272 |
) as dm:
|
1273 |
-
urls = await dm.
|
1274 |
return urls
|
1275 |
|
1276 |
urls = asyncio.run(run_search())
|
@@ -1303,6 +1693,7 @@ def main():
|
|
1303 |
with st.spinner("Searching for files..."):
|
1304 |
async def deep_search_result():
|
1305 |
async with DownloadManager(
|
|
|
1306 |
use_proxy=use_proxy,
|
1307 |
proxy=proxy,
|
1308 |
use_stealth=use_stealth
|
@@ -1318,63 +1709,131 @@ def main():
|
|
1318 |
else:
|
1319 |
st.warning("No files found on this page.")
|
1320 |
|
1321 |
-
elif mode == "
|
1322 |
-
st.header("
|
1323 |
|
1324 |
# View-only Google Drive download
|
1325 |
-
st.
|
1326 |
-
|
1327 |
-
|
1328 |
-
|
1329 |
-
|
1330 |
-
|
1331 |
-
|
|
|
|
1332 |
|
1333 |
-
|
1334 |
-
|
1335 |
-
|
1336 |
-
|
1337 |
-
|
1338 |
-
|
1339 |
-
|
1340 |
-
|
1341 |
-
|
1342 |
-
|
1343 |
-
|
1344 |
-
|
1345 |
-
|
1346 |
-
|
1347 |
-
|
1348 |
-
|
1349 |
-
|
1350 |
-
|
1351 |
-
|
1352 |
-
|
1353 |
-
|
1354 |
-
return await dm.download_viewonly_google_drive(file_info, output_path)
|
1355 |
-
|
1356 |
-
result_path = asyncio.run(download_drive_file())
|
1357 |
-
|
1358 |
-
if result_path:
|
1359 |
-
st.success("Document downloaded successfully!")
|
1360 |
|
1361 |
-
#
|
1362 |
-
|
1363 |
-
|
|
|
|
|
1364 |
|
1365 |
-
|
1366 |
-
|
1367 |
-
|
1368 |
-
file_name
|
1369 |
-
|
1370 |
-
|
1371 |
-
|
1372 |
-
|
1373 |
-
|
|
|
1374 |
|
1375 |
# Footer
|
1376 |
st.markdown("---")
|
1377 |
-
st.markdown("Created by [Euler314](https://github.com/euler314) |
|
1378 |
|
1379 |
# Run the app
|
1380 |
if __name__ == "__main__":
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
import asyncio
|
|
|
21 |
from reportlab.lib.pagesizes import letter
|
22 |
from reportlab.pdfgen import canvas
|
23 |
|
24 |
+
# Advanced imports
|
|
|
|
|
25 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
26 |
+
from bs4 import BeautifulSoup
|
27 |
+
from PyPDF2 import PdfReader
|
28 |
+
import google_auth_oauthlib.flow
|
29 |
+
import googleapiclient.discovery
|
30 |
+
import google.auth.transport.requests
|
31 |
+
import googleapiclient.http
|
32 |
+
import requests
|
33 |
+
import celery
|
34 |
+
from celery import Celery
|
35 |
+
import splash
|
36 |
+
import pyppeteer
|
37 |
+
import mitmproxy
|
38 |
+
from mitmproxy import http
|
|
|
|
|
39 |
|
40 |
# Configure page and logging
|
41 |
st.set_page_config(page_title="Advanced File Downloader", layout="wide")
|
42 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
43 |
logger = logging.getLogger(__name__)
|
44 |
|
45 |
+
# Initialize Celery for distributed task processing
|
46 |
+
celery_app = Celery('file_downloader', broker='redis://localhost:6379/0')
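The commit only instantiates the Celery app; no tasks are registered in the visible diff. A minimal sketch of how a download could be offloaded to a worker, assuming the Redis broker above is reachable (the task name and body are illustrative, not part of app.py):

@celery_app.task(name="download_file_task")
def download_file_task(url, save_dir="./downloads"):
    # Hypothetical background task: stream one file to disk and return its path.
    import os
    import requests
    os.makedirs(save_dir, exist_ok=True)
    local_path = os.path.join(save_dir, os.path.basename(url.split("?")[0]) or "downloaded_file")
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_path

# Queued from the Streamlit side with: download_file_task.delay(file_url)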
|
47 |
+
|
48 |
+
# Configure Google OAuth
|
49 |
GOOGLE_OAUTH_CONFIG = {
|
50 |
"web": {
|
51 |
"client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
|
|
|
58 |
}
|
59 |
}
|
60 |
|
61 |
+
# -------------------- User Agent Settings --------------------
|
62 |
USER_AGENTS = [
|
63 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
|
64 |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
|
|
|
66 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
|
67 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
|
68 |
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
|
69 |
+
'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
|
70 |
]
|
71 |
|
72 |
+
# -------------------- Proxy Management --------------------
|
73 |
+
PROXY_POOL = []
|
74 |
+
CURRENT_PROXY_INDEX = 0
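PROXY_POOL and CURRENT_PROXY_INDEX are declared but no rotation helper is visible in this diff. A sketch of the kind of helper they imply (the name get_next_proxy is an assumption):

def get_next_proxy():
    # Hypothetical round-robin helper over the module-level pool; not in the diff.
    global CURRENT_PROXY_INDEX
    if not PROXY_POOL:
        return None
    proxy = PROXY_POOL[CURRENT_PROXY_INDEX % len(PROXY_POOL)]
    CURRENT_PROXY_INDEX += 1
    return proxy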
|
75 |
+
|
76 |
+
# -------------------- Network Interception Configuration --------------------
|
77 |
NETWORK_INTERCEPTOR_CONFIG = {
|
78 |
"enabled": False,
|
79 |
"intercept_types": ["xhr", "fetch", "document", "media"],
|
|
|
81 |
"intercept_folder": "./intercepted_data"
|
82 |
}
|
83 |
|
84 |
+
# -------------------- Utility Functions --------------------
|
85 |
def get_random_user_agent():
|
86 |
return random.choice(USER_AGENTS)
|
87 |
|
|
|
117 |
"""Check if URL is a valid file URL based on extension"""
|
118 |
return any(url.lower().endswith(ext) for ext in extensions)
|
119 |
|
120 |
+
# -------------------- Google Drive Functions --------------------
|
121 |
def get_google_auth_url():
|
|
|
|
|
|
|
122 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
123 |
flow = google_auth_oauthlib.flow.Flow.from_client_config(
|
124 |
{"web": client_config},
|
|
|
133 |
return authorization_url
|
134 |
|
135 |
def exchange_code_for_credentials(auth_code):
|
|
|
|
|
|
|
136 |
if not auth_code.strip():
|
137 |
return None, "No code provided."
|
138 |
try:
|
|
|
151 |
return None, f"Error during token exchange: {e}"
|
152 |
|
153 |
def google_drive_upload(file_path, credentials, folder_id=None):
|
|
|
|
|
|
|
154 |
try:
|
155 |
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
|
156 |
file_metadata = {'name': os.path.basename(file_path)}
|
|
|
163 |
return f"Error uploading to Drive: {str(e)}"
|
164 |
|
165 |
def create_drive_folder(drive_service, name):
|
|
|
|
|
|
|
166 |
folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
|
167 |
folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
|
168 |
return folder.get('id')
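The Drive helpers above are unchanged by this commit; for reference they chain together roughly as below. The widget labels and file path are illustrative, and exchange_code_for_credentials is assumed to return a (credentials, message) tuple, as its error paths in the diff suggest:

auth_url = get_google_auth_url()
st.markdown(f"[Authorize Google Drive]({auth_url})")
auth_code = st.text_input("Paste the authorization code here")
if auth_code:
    creds, msg = exchange_code_for_credentials(auth_code)
    st.write(msg)
    if creds:
        st.write(google_drive_upload("./downloads/example.pdf", creds))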
|
169 |
|
170 |
+
# -------------------- Setup Functions --------------------
|
171 |
+
def setup_dependencies():
|
172 |
+
"""Install required system dependencies"""
|
173 |
try:
|
174 |
# Install system dependencies
|
175 |
subprocess.run(['apt-get', 'update', '-y'], check=True)
|
176 |
packages = [
|
177 |
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
|
178 |
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
|
179 |
+
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
|
180 |
+
'redis-server', 'python3-dev', 'build-essential'
|
181 |
]
|
182 |
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
|
183 |
|
184 |
+
# Install Python packages
|
185 |
+
subprocess.run(['pip', 'install', 'playwright', 'pyppeteer', 'splash', 'celery[redis]', 'mitmproxy'], check=True)
|
186 |
+
|
187 |
+
# Install browsers
|
188 |
+
subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
|
189 |
+
subprocess.run(['python3', '-m', 'pyppeteer', 'install'], check=True)
|
190 |
|
191 |
st.success("Dependencies installed successfully!")
|
192 |
return True
|
193 |
except Exception as e:
|
194 |
st.error(f"Error installing dependencies: {e}")
|
195 |
+
st.info("You may need to manually install dependencies. Check console for details.")
|
196 |
logger.error(f"Setup error: {e}")
|
197 |
traceback.print_exc()
|
198 |
return False
|
199 |
|
200 |
+
def check_services():
|
201 |
+
"""Check if required services are running"""
|
202 |
+
try:
|
203 |
+
# Check Redis for Celery
|
204 |
+
redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG'
|
205 |
+
if not redis_running:
|
206 |
+
# Try to start Redis
|
207 |
+
subprocess.run(['service', 'redis-server', 'start'], check=True)
|
208 |
+
|
209 |
+
# Create directories for intercepted data
|
210 |
+
os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True)
|
211 |
+
|
212 |
+
return True
|
213 |
+
except Exception as e:
|
214 |
+
logger.error(f"Service check error: {e}")
|
215 |
+
return False
|
216 |
+
|
217 |
+
# -------------------- Network Interception Classes --------------------
|
218 |
+
class NetworkInterceptor:
|
219 |
+
"""Class to intercept network traffic using mitmproxy"""
|
220 |
+
|
221 |
+
def __init__(self, intercept_types=None, save_path=None):
|
222 |
+
self.intercept_types = intercept_types or ["xhr", "fetch", "document"]
|
223 |
+
self.save_path = save_path or "./intercepted_data"
|
224 |
+
os.makedirs(self.save_path, exist_ok=True)
|
225 |
+
self.captured_data = []
|
226 |
+
|
227 |
+
def intercept_request(self, flow):
|
228 |
+
"""Process intercepted requests"""
|
229 |
+
try:
|
230 |
+
url = flow.request.url
|
231 |
+
method = flow.request.method
|
232 |
+
content_type = flow.request.headers.get("Content-Type", "")
|
233 |
+
|
234 |
+
# Log the request
|
235 |
+
self.captured_data.append({
|
236 |
+
"type": "request",
|
237 |
+
"url": url,
|
238 |
+
"method": method,
|
239 |
+
"headers": dict(flow.request.headers),
|
240 |
+
"timestamp": time.time()
|
241 |
+
})
|
242 |
+
|
243 |
+
logger.info(f"Intercepted {method} request to {url}")
|
244 |
+
except Exception as e:
|
245 |
+
logger.error(f"Error intercepting request: {e}")
|
246 |
+
|
247 |
+
def intercept_response(self, flow):
|
248 |
+
"""Process intercepted responses"""
|
249 |
+
try:
|
250 |
+
url = flow.request.url
|
251 |
+
status_code = flow.response.status_code
|
252 |
+
content_type = flow.response.headers.get("Content-Type", "")
|
253 |
+
|
254 |
+
# Only process responses of interest based on content type
|
255 |
+
if any(t in content_type.lower() for t in ["application/pdf", "application/msword",
|
256 |
+
"application/vnd.openxmlformats",
|
257 |
+
"application/zip"]):
|
258 |
+
# Save the file
|
259 |
+
filename = os.path.basename(urlparse(url).path)
|
260 |
+
if not filename or filename == '/':
|
261 |
+
filename = f"file_{int(time.time())}"
|
262 |
+
|
263 |
+
# Try to add extension based on content type
|
264 |
+
if "pdf" in content_type:
|
265 |
+
filename += ".pdf"
|
266 |
+
elif "msword" in content_type:
|
267 |
+
filename += ".doc"
|
268 |
+
elif "openxmlformats" in content_type and "wordprocessingml" in content_type:
|
269 |
+
filename += ".docx"
|
270 |
+
elif "zip" in content_type:
|
271 |
+
filename += ".zip"
|
272 |
+
|
273 |
+
file_path = os.path.join(self.save_path, filename)
|
274 |
+
with open(file_path, "wb") as f:
|
275 |
+
f.write(flow.response.content)
|
276 |
+
|
277 |
+
logger.info(f"Saved intercepted file: {file_path}")
|
278 |
+
|
279 |
+
# Record metadata about the captured file
|
280 |
+
self.captured_data.append({
|
281 |
+
"type": "file",
|
282 |
+
"url": url,
|
283 |
+
"content_type": content_type,
|
284 |
+
"size": len(flow.response.content),
|
285 |
+
"path": file_path,
|
286 |
+
"timestamp": time.time()
|
287 |
+
})
|
288 |
+
except Exception as e:
|
289 |
+
logger.error(f"Error intercepting response: {e}")
|
290 |
+
|
291 |
+
def get_captured_files(self):
|
292 |
+
"""Return list of captured files"""
|
293 |
+
return [item for item in self.captured_data if item["type"] == "file"]
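Note that mitmproxy discovers addon hooks named request and response, while this class exposes intercept_request and intercept_response, so it would need a thin adapter to run under mitmdump. A sketch of such an adapter (the adapter itself is not part of the diff):

# interceptor_addon.py -- hypothetical adapter; run with: mitmdump -s interceptor_addon.py
class InterceptorAddon:
    def __init__(self):
        self.inner = NetworkInterceptor(save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"])

    def request(self, flow):
        # Called by mitmproxy for each client request.
        self.inner.intercept_request(flow)

    def response(self, flow):
        # Called once the server response is available.
        self.inner.intercept_response(flow)

addons = [InterceptorAddon()]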
|
294 |
+
|
295 |
+
# -------------------- Browser Automation Classes --------------------
|
296 |
+
class MultiEngineBrowser:
|
297 |
+
"""Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)"""
|
298 |
+
|
299 |
+
def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True):
|
300 |
+
self.engine = engine
|
301 |
self.use_proxy = use_proxy
|
302 |
self.proxy = proxy
|
303 |
+
self.stealth = stealth
|
|
|
|
|
|
|
304 |
self.browser = None
|
305 |
self.context = None
|
306 |
self.page = None
|
307 |
+
|
308 |
+
async def setup(self):
|
309 |
+
"""Initialize browser based on selected engine"""
|
310 |
+
if self.engine == "playwright":
|
311 |
+
return await self.setup_playwright()
|
312 |
+
elif self.engine == "pyppeteer":
|
313 |
+
return await self.setup_pyppeteer()
|
314 |
+
elif self.engine == "splash":
|
315 |
+
return await self.setup_splash()
|
316 |
+
else:
|
317 |
+
raise ValueError(f"Unsupported browser engine: {self.engine}")
|
318 |
+
|
319 |
+
async def setup_playwright(self):
|
320 |
+
"""Setup Playwright browser"""
|
321 |
+
from playwright.async_api import async_playwright
|
322 |
|
|
|
|
|
|
|
|
|
|
|
323 |
self.playwright = await async_playwright().start()
|
|
|
|
|
324 |
browser_args = [
|
325 |
'--no-sandbox',
|
326 |
'--disable-setuid-sandbox',
|
|
|
329 |
'--disable-features=IsolateOrigins,site-per-process',
|
330 |
]
|
331 |
|
332 |
+
if self.stealth:
|
333 |
browser_args.extend([
|
334 |
'--disable-blink-features=AutomationControlled',
|
335 |
'--disable-features=IsolateOrigins'
|
|
|
343 |
if self.use_proxy and self.proxy:
|
344 |
launch_options["proxy"] = {"server": self.proxy}
|
345 |
|
|
|
346 |
self.browser = await self.playwright.chromium.launch(**launch_options)
|
347 |
|
|
|
348 |
context_options = {
|
349 |
"viewport": {"width": 1920, "height": 1080},
|
350 |
"user_agent": get_random_user_agent(),
|
|
|
353 |
"accept_downloads": True
|
354 |
}
|
355 |
|
|
|
356 |
self.context = await self.browser.new_context(**context_options)
|
357 |
|
358 |
+
# Apply stealth features
|
359 |
+
if self.stealth:
|
360 |
await self.context.add_init_script("""
|
361 |
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
362 |
Object.defineProperty(navigator, 'plugins', {
|
|
|
366 |
window.chrome = { runtime: {} };
|
367 |
""")
|
368 |
|
|
|
369 |
self.page = await self.context.new_page()
|
370 |
+
return self.page
|
371 |
+
|
372 |
+
async def setup_pyppeteer(self):
|
373 |
+
"""Setup Pyppeteer browser"""
|
374 |
+
from pyppeteer import launch
|
375 |
+
|
376 |
+
browser_args = [
|
377 |
+
'--no-sandbox',
|
378 |
+
'--disable-setuid-sandbox',
|
379 |
+
'--disable-dev-shm-usage',
|
380 |
+
'--disable-web-security',
|
381 |
+
]
|
382 |
+
|
383 |
+
if self.stealth:
|
384 |
+
browser_args.extend([
|
385 |
+
'--disable-blink-features=AutomationControlled',
|
386 |
+
'--disable-features=IsolateOrigins'
|
387 |
+
])
|
388 |
+
|
389 |
+
launch_options = {
|
390 |
+
"headless": True,
|
391 |
+
"args": browser_args,
|
392 |
+
"ignoreHTTPSErrors": True,
|
393 |
+
"userDataDir": tempfile.mkdtemp()
|
394 |
+
}
|
395 |
+
|
396 |
+
if self.use_proxy and self.proxy:
|
397 |
+
browser_args.append(f'--proxy-server={self.proxy}')
|
398 |
+
|
399 |
+
self.browser = await launch(launch_options)
|
400 |
+
self.page = await self.browser.newPage()
|
401 |
+
|
402 |
+
# Set user agent
|
403 |
+
await self.page.setUserAgent(get_random_user_agent())
|
404 |
+
|
405 |
+
# Set viewport
|
406 |
+
await self.page.setViewport({"width": 1920, "height": 1080})
|
407 |
+
|
408 |
+
# Apply stealth features
|
409 |
+
if self.stealth:
|
410 |
+
await self.page.evaluateOnNewDocument("""
|
411 |
+
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
412 |
+
Object.defineProperty(navigator, 'plugins', {
|
413 |
+
get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
|
414 |
+
});
|
415 |
+
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
|
416 |
+
window.chrome = { runtime: {} };
|
417 |
+
""")
|
418 |
+
|
419 |
+
return self.page
|
420 |
+
|
421 |
+
async def setup_splash(self):
|
422 |
+
"""Setup Splash browser through API"""
|
423 |
+
# Splash is typically used via HTTP API
|
424 |
+
# We'll use requests for this
|
425 |
+
self.splash_url = "http://localhost:8050/render.html"
|
426 |
+
return None # No actual page object for Splash
|
427 |
+
|
428 |
+
async def goto(self, url, wait_until=None, timeout=30000):
|
429 |
+
"""Navigate to a URL"""
|
430 |
+
if self.engine == "playwright":
|
431 |
+
return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout)
|
432 |
+
elif self.engine == "pyppeteer":
|
433 |
+
return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout)
|
434 |
+
elif self.engine == "splash":
|
435 |
+
# Use Splash HTTP API
|
436 |
+
params = {
|
437 |
+
"url": url,
|
438 |
+
"wait": min(timeout/1000, 30), # Splash uses seconds
|
439 |
+
"timeout": min(timeout/1000, 60),
|
440 |
+
"resource_timeout": min(timeout/1000, 30),
|
441 |
+
"html": 1,
|
442 |
+
"png": 0,
|
443 |
+
"render_all": 1
|
444 |
+
}
|
445 |
+
|
446 |
+
if self.use_proxy and self.proxy:
|
447 |
+
params["proxy"] = self.proxy
|
448 |
+
|
449 |
+
headers = {"User-Agent": get_random_user_agent()}
|
450 |
+
response = requests.get(self.splash_url, params=params, headers=headers)
|
451 |
+
self.last_html = response.text
|
452 |
+
return response
|
453 |
+
|
454 |
+
async def content(self):
|
455 |
+
"""Get page content"""
|
456 |
+
if self.engine == "playwright":
|
457 |
+
return await self.page.content()
|
458 |
+
elif self.engine == "pyppeteer":
|
459 |
+
return await self.page.content()
|
460 |
+
elif self.engine == "splash":
|
461 |
+
return self.last_html
|
462 |
+
|
463 |
+
async def close(self):
|
464 |
+
"""Close browser"""
|
465 |
+
if self.engine == "playwright":
|
466 |
+
if self.browser:
|
467 |
+
await self.browser.close()
|
468 |
+
if self.playwright:
|
469 |
+
await self.playwright.stop()
|
470 |
+
elif self.engine == "pyppeteer":
|
471 |
+
if self.browser:
|
472 |
+
await self.browser.close()
|
473 |
+
# No cleanup needed for Splash as it's stateless
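A short usage sketch for MultiEngineBrowser on its own, assuming Playwright's Chromium has been installed (the URL is a placeholder):

import asyncio

async def fetch_html(url):
    browser = MultiEngineBrowser(engine="playwright", stealth=True)
    await browser.setup()
    try:
        await browser.goto(url, timeout=30000)
        return await browser.content()
    finally:
        await browser.close()

html = asyncio.run(fetch_html("https://example.com"))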
|
474 |
+
|
475 |
+
# -------------------- Download Manager Class --------------------
|
476 |
+
class DownloadManager:
|
477 |
+
def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
|
478 |
+
self.browser_engine = browser_engine
|
479 |
+
self.use_proxy = use_proxy
|
480 |
+
self.proxy = proxy
|
481 |
+
self.query = query
|
482 |
+
self.num_results = num_results
|
483 |
+
self.use_stealth = use_stealth
|
484 |
+
self.browser = None
|
485 |
+
self.network_interceptor = None
|
486 |
+
|
487 |
+
# Configure network interception if enabled
|
488 |
+
if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
|
489 |
+
self.network_interceptor = NetworkInterceptor(
|
490 |
+
intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"],
|
491 |
+
save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"]
|
492 |
+
)
|
493 |
+
|
494 |
+
async def __aenter__(self):
|
495 |
+
# Initialize multi-engine browser
|
496 |
+
self.browser = MultiEngineBrowser(
|
497 |
+
engine=self.browser_engine,
|
498 |
+
use_proxy=self.use_proxy,
|
499 |
+
proxy=self.proxy,
|
500 |
+
stealth=self.use_stealth
|
501 |
+
)
|
502 |
+
self.page = await self.browser.setup()
|
503 |
+
|
504 |
+
# Set headers for better stealth
|
505 |
+
if self.browser_engine == "playwright":
|
506 |
+
await self.page.set_extra_http_headers({
|
507 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
508 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
509 |
+
'DNT': '1',
|
510 |
+
'Referer': 'https://www.google.com/',
|
511 |
+
'Sec-Fetch-Dest': 'document',
|
512 |
+
'Sec-Fetch-Mode': 'navigate',
|
513 |
+
'Sec-Fetch-Site': 'cross-site',
|
514 |
+
'Sec-Fetch-User': '?1',
|
515 |
+
'Upgrade-Insecure-Requests': '1'
|
516 |
+
})
|
517 |
|
518 |
return self
|
519 |
|
520 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
521 |
+
await self.browser.close()
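Elsewhere in app.py the manager is used as an async context manager; condensed, the pattern looks like this (the URL and extension list are placeholders, and deep_search drives Streamlit progress widgets, so in practice it runs inside the Streamlit app):

import asyncio

async def run_deep_search(url):
    async with DownloadManager(browser_engine="playwright", use_stealth=True) as dm:
        return await dm.deep_search(url, custom_ext_list=[".pdf"], sublink_limit=100, timeout=30)

files = asyncio.run(run_deep_search("https://example.com/downloads"))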
|
|
|
|
|
|
|
522 |
|
523 |
+
async def search_web(self, search_engine="bing"):
|
524 |
+
"""Search web using specified search engine"""
|
525 |
urls = []
|
526 |
try:
|
527 |
+
if search_engine == "bing":
|
528 |
+
search_url = f"https://www.bing.com/search?q={self.query}"
|
529 |
+
elif search_engine == "google":
|
530 |
+
search_url = f"https://www.google.com/search?q={self.query}"
|
531 |
+
else:
|
532 |
+
raise ValueError(f"Unsupported search engine: {search_engine}")
|
533 |
+
|
534 |
+
await self.browser.goto(search_url, timeout=30000)
|
535 |
|
536 |
+
if self.browser_engine == "playwright":
|
537 |
+
if search_engine == "bing":
|
538 |
+
links = await self.page.query_selector_all("li.b_algo h2 a")
|
539 |
+
for link in links[:self.num_results]:
|
540 |
+
href = await link.get_attribute('href')
|
541 |
+
if href:
|
542 |
+
urls.append(href)
|
543 |
+
elif search_engine == "google":
|
544 |
+
links = await self.page.query_selector_all("div.g a[href^='http']")
|
545 |
+
for link in links[:self.num_results]:
|
546 |
+
href = await link.get_attribute('href')
|
547 |
+
if href:
|
548 |
+
urls.append(href)
|
549 |
+
elif self.browser_engine == "pyppeteer":
|
550 |
+
if search_engine == "bing":
|
551 |
+
links = await self.page.querySelectorAll("li.b_algo h2 a")
|
552 |
+
for link in links[:self.num_results]:
|
553 |
+
href = await self.page.evaluate('el => el.getAttribute("href")', link)
|
554 |
+
if href:
|
555 |
+
urls.append(href)
|
556 |
+
elif search_engine == "google":
|
557 |
+
links = await self.page.querySelectorAll("div.g a[href^='http']")
|
558 |
+
for link in links[:self.num_results]:
|
559 |
+
href = await self.page.evaluate('el => el.getAttribute("href")', link)
|
560 |
+
if href:
|
561 |
+
urls.append(href)
|
562 |
+
elif self.browser_engine == "splash":
|
563 |
+
# Parse the HTML with BeautifulSoup
|
564 |
+
soup = BeautifulSoup(self.browser.last_html, 'html.parser')
|
565 |
+
if search_engine == "bing":
|
566 |
+
links = soup.select("li.b_algo h2 a")
|
567 |
+
for link in links[:self.num_results]:
|
568 |
+
href = link.get("href")
|
569 |
+
if href:
|
570 |
+
urls.append(href)
|
571 |
+
elif search_engine == "google":
|
572 |
+
links = soup.select("div.g a[href^='http']")
|
573 |
+
for link in links[:self.num_results]:
|
574 |
+
href = link.get("href")
|
575 |
+
if href:
|
576 |
+
urls.append(href)
|
577 |
|
578 |
return urls
|
579 |
except Exception as e:
|
580 |
+
logger.error(f"Error searching web: {e}")
|
581 |
return []
|
582 |
|
583 |
async def get_file_size(self, url):
|
|
|
584 |
try:
|
585 |
headers = {'User-Agent': get_random_user_agent()}
|
586 |
response = requests.head(url, headers=headers, timeout=15)
|
|
|
593 |
return "Unknown Size"
|
594 |
|
595 |
async def get_pdf_metadata(self, url):
|
|
|
|
|
|
|
|
|
596 |
try:
|
597 |
headers = {'User-Agent': get_random_user_agent()}
|
598 |
response = requests.get(url, headers=headers, timeout=15, stream=True)
|
|
|
610 |
return {}
|
611 |
|
612 |
async def extract_real_download_url(self, url):
|
|
|
613 |
try:
|
614 |
headers = {'User-Agent': get_random_user_agent()}
|
615 |
response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
|
|
|
619 |
return url
|
620 |
|
621 |
async def get_edu_exam_links(self, url):
|
622 |
+
"""Specialized method for educational exam websites that follows a common pattern."""
|
623 |
try:
|
624 |
logger.info(f"Fetching exam links from {url}")
|
625 |
links = set()
|
|
|
630 |
response = requests.get(url, headers=headers, timeout=30)
|
631 |
|
632 |
if response.status_code == 200:
|
633 |
+
# Parse with BeautifulSoup for efficiency
|
634 |
soup = BeautifulSoup(response.text, "html.parser")
|
635 |
parsed_base = urlparse(url)
|
636 |
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
|
|
640 |
href = a["href"]
|
641 |
full_url = urljoin(url, href)
|
642 |
|
643 |
+
# Look for text clues
|
644 |
link_text = a.get_text().lower()
|
645 |
|
646 |
+
# Special patterns for exam sites (expanded list)
|
647 |
url_patterns = [
|
648 |
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
649 |
"/test/", "/download/", "/files/", "/assignments/",
|
650 |
+
"paper_", "question_", "exam_", "test_", "past_",
|
651 |
+
"assignment_", "sample_", "study_material", "notes_",
|
652 |
+
"/resource/", "/subject/", "/course/", "/material/"
|
653 |
]
|
654 |
|
655 |
text_patterns = [
|
656 |
"exam", "paper", "test", "question", "past", "download",
|
657 |
+
"assignment", "sample", "study", "material", "notes",
|
658 |
+
"subject", "course", "resource", "pdf", "document",
|
659 |
+
"view", "open", "get", "solution", "answer"
|
660 |
]
|
661 |
|
662 |
+
# Check URL and text patterns
|
663 |
if any(pattern in full_url.lower() for pattern in url_patterns) or \
|
664 |
any(pattern in link_text for pattern in text_patterns) or \
|
665 |
any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
|
|
667 |
except Exception as e:
|
668 |
logger.warning(f"Request-based extraction failed: {e}")
|
669 |
|
670 |
+
# Use browser-based approach if needed
|
671 |
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
|
672 |
logger.info("Using browser for enhanced link extraction")
|
673 |
|
674 |
+
# Navigate to the page
|
675 |
+
await self.browser.goto(url, timeout=45000)
|
676 |
|
677 |
+
# Get page content and parse with BeautifulSoup
|
678 |
+
content = await self.browser.content()
|
679 |
soup = BeautifulSoup(content, "html.parser")
|
680 |
parsed_base = urlparse(url)
|
681 |
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
682 |
|
683 |
+
# Process all links on the page
|
684 |
for a in soup.find_all("a", href=True):
|
685 |
href = a["href"]
|
686 |
full_url = urljoin(url, href)
|
687 |
link_text = a.get_text().lower()
|
688 |
|
689 |
+
# Apply the same filtering criteria
|
690 |
url_patterns = [
|
691 |
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
692 |
"/test/", "/download/", "/files/", "/assignments/",
|
693 |
+
"paper_", "question_", "exam_", "test_", "past_",
|
694 |
+
"assignment_", "sample_", "study_material", "notes_",
|
695 |
+
"/resource/", "/subject/", "/course/", "/material/"
|
696 |
]
|
697 |
|
698 |
text_patterns = [
|
699 |
"exam", "paper", "test", "question", "past", "download",
|
700 |
+
"assignment", "sample", "study", "material", "notes",
|
701 |
+
"subject", "course", "resource", "pdf", "document",
|
702 |
+
"view", "open", "get", "solution", "answer"
|
703 |
]
|
704 |
|
705 |
+
# Check URL and text patterns
|
706 |
if any(pattern in full_url.lower() for pattern in url_patterns) or \
|
707 |
any(pattern in link_text for pattern in text_patterns) or \
|
708 |
any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
709 |
links.add(full_url)
|
|
|
|
|
710 |
|
711 |
+
# Filter to likely exam documents
|
712 |
filtered_links = []
|
713 |
for link in links:
|
714 |
# Common file extensions
|
|
|
719 |
# Common paths for exam documents
|
720 |
if any(pattern in link.lower() for pattern in [
|
721 |
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
|
722 |
+
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
|
723 |
+
"/resource/", "/material/", "/notes/", "/subjectmaterial/"
|
724 |
]):
|
725 |
filtered_links.append(link)
|
726 |
|
|
|
732 |
return []
|
733 |
|
 734       async def extract_downloadable_files(self, url, custom_ext_list):
 735           found_files = []
 736           try:
 737               # Special handling for educational exam sites
 765
 766                   # Get metadata for PDFs
 767                   meta = {}
 768 +                 if real_url.lower().endswith('.pdf'):
 769                       try:
 770                           meta = await self.get_pdf_metadata(real_url)
 771                       except Exception:
 776                       'filename': filename,
 777                       'size': size_str,
 778                       'metadata': meta,
 779 +                     'source_url': url  # Add source URL for better tracking
 780                   })
 781
 782               # If we found exam files with the specialized method, return them
 783               if found_files:
 784                   return found_files
 785
 786 +             # Standard extraction method for all pages
 787 +             await self.browser.goto(url, timeout=30000)
 788
 789               # Get page content
 790 +             content = await self.browser.content()
 791               soup = BeautifulSoup(content, 'html.parser')
 792
 793               # Define file extensions to look for
 807
 808                   # Handle PHP and download links separately
 809                   if '.php' in href.lower() or 'download' in href.lower():
 810 +                     full_url = href if href.startswith('http') else urljoin(base_url, href)
 811                       real_url = await self.extract_real_download_url(full_url)
 812                       if real_url and real_url != full_url:
 813                           filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
 822
 823                   # Check for direct file extensions
 824                   if any(href.lower().endswith(ext) for ext in all_exts):
 825 +                     file_url = href if href.startswith('http') else urljoin(base_url, href)
 826                       size_str = await self.get_file_size(file_url)
 827                       meta = {}
 828 +                     if file_url.lower().endswith('.pdf'):
 829                           meta = await self.get_pdf_metadata(file_url)
 830                       found_files.append({
 831                           'url': file_url,
 845                           break
 846
 847                   if file_id:
 848 +                     # Determine if it's a view-only file
 849                       is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))
 850
 851                       filename = f"gdrive_{file_id}"
 869                   for elem in soup.find_all(elem_tag):
 870                       src = elem.get('src') or elem.get('data')
 871                       if src and any(src.lower().endswith(ext) for ext in all_exts):
 872 +                         file_url = src if src.startswith('http') else urljoin(base_url, src)
 873                           found_files.append({
 874                               'url': file_url,
 875                               'filename': os.path.basename(file_url.split('?')[0]),
 893               return []
 894
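Each entry appended to found_files above is a plain dict with the same keys used throughout the class ('url', 'filename', 'size', 'metadata', plus the 'source_url' added in this change). A representative entry, with illustrative values:

# Values are made up; only the keys mirror what extract_downloadable_files produces.
example_entry = {
    'url': 'https://example.edu/pastexam/calculus_2021.pdf',
    'filename': 'calculus_2021.pdf',
    'size': '1.2 MB',
    'metadata': {},                                 # PDF metadata when it could be read
    'source_url': 'https://example.edu/pastexam/',  # page the link was found on
}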
 895       async def download_file(self, file_info, save_dir, referer=None):
 896 +         """Download a file and provide a direct download link"""
 897           file_url = file_info['url']
 898           fname = file_info['filename']
 899           referer = referer or file_info.get('source_url', 'https://www.google.com')
 900
 901 +         # Create unique filename to avoid overwriting
 902           path = os.path.join(save_dir, fname)
 903           base, ext = os.path.splitext(fname)
 904           counter = 1
 911           try:
 912               # Special handling for Google Drive files
 913               if "drive.google.com" in file_url or "docs.google.com" in file_url:
 914 +                 # For view-only Google Drive files, use specialized method
 915                   is_view_only = file_info.get('metadata', {}).get('view_only', False)
 916                   if is_view_only:
 917                       result_path = await self.download_viewonly_google_drive(file_info, path)
 967               return None
 968
 969       async def download_viewonly_google_drive(self, file_info, save_path):
 970 +         """Download view-only Google Drive documents"""
 971           try:
 972               # Extract file ID
 973               file_id = file_info.get('metadata', {}).get('file_id')
 993
 994               logger.info(f"Downloading view-only Google Drive file: {file_id}")
 995
 996 +             # Create a dedicated browser session
 997 +             if self.browser_engine == "playwright":
 998 +                 from playwright.async_api import async_playwright
 999
1000 +                 async with async_playwright() as p:
1001 +                     browser = await p.chromium.launch(
1002 +                         headless=True,
1003 +                         args=[
1004 +                             '--no-sandbox',
1005 +                             '--disable-setuid-sandbox',
1006 +                             '--disable-dev-shm-usage',
1007 +                             '--disable-web-security',
1008 +                             '--disable-features=IsolateOrigins,site-per-process',
1009 +                             '--disable-site-isolation-trials',
1010 +                             '--disable-blink-features=AutomationControlled'
1011 +                         ]
1012 +                     )
1013 +
1014 +                     # Create context with options for better handling
1015 +                     context = await browser.new_context(
1016 +                         viewport={'width': 1600, 'height': 1200},
1017 +                         user_agent=get_random_user_agent(),
1018 +                         accept_downloads=True,
1019 +                         ignore_https_errors=True
1020 +                     )
1021 +
1022 +                     # Add stealth script
1023 +                     await context.add_init_script("""
1024 +                         Object.defineProperty(navigator, 'webdriver', { get: () => false });
1025 +                         Object.defineProperty(navigator, 'plugins', {
1026 +                             get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
1027 +                         });
1028 +                         Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
1029 +                         window.chrome = { runtime: {} };
1030 +                     """)
1031
1032 +                     page = await context.new_page()
1033
1034 +                     try:
1035 +                         # Visit the file
1036 +                         await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
1037 +                         await page.wait_for_load_state('networkidle')
1038
1039 +                         # Wait for content to load
1040 +                         await page.wait_for_timeout(5000)
1041
1042 +                         # Create temporary directory for processing
1043 +                         temp_dir = tempfile.mkdtemp()
1044
1045 +                         # For PDF handling
1046 +                         if file_type == 'pdf':
1047 +                             # Create directory for screenshots
1048 +                             screenshots_dir = os.path.join(temp_dir, "screenshots")
1049 +                             os.makedirs(screenshots_dir, exist_ok=True)
1050 +
1051 +                             # Get page count
1052 +                             total_pages = await page.evaluate("""
1053 +                                 () => {
1054 +                                     // Look for page counters in the interface
1055 +                                     const pageCounters = document.querySelectorAll('*');
1056 +                                     for (const el of pageCounters) {
1057 +                                         const text = el.textContent || '';
1058 +                                         const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
1059 +                                         if (match && match[2]) {
1060 +                                             return parseInt(match[2]);
1061 +                                         }
1062 +                                     }
1063 +
1064 +                                     // Look for paginated pages
1065 +                                     const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1066 +                                     if (pages.length > 0) return pages.length;
1067 +
1068 +                                     // Default if we can't determine
1069 +                                     return 20;
1070 +                                 }
1071 +                             """)
1072 +
1073 +                             logger.info(f"PDF has approximately {total_pages} pages")
1074 +
1075 +                             # Take screenshots of each page
1076 +                             screenshots = []
1077 +
1078 +                             # First try with the page element method
1079 +                             for i in range(min(total_pages, 100)):  # Limit to 100 pages for safety
1080 +                                 try:
1081 +                                     # Navigate to specific page
1082 +                                     if i > 0:
1083 +                                         await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()")
1084 +                                         await page.wait_for_timeout(500)
1085 +
1086 +                                     # Wait for the page to render
1087 +                                     await page.wait_for_timeout(500)
1088 +
1089 +                                     # Take screenshot
1090 +                                     screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
1091 +
1092 +                                     # Try to find the page element
1093 +                                     page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})')
1094 +                                     if page_element:
1095 +                                         await page_element.screenshot(path=screenshot_path)
1096                                       else:
1097 +                                         # Fallback to viewport screenshot
1098 +                                         await page.screenshot(path=screenshot_path)
1099 +
1100 +                                     screenshots.append(screenshot_path)
1101 +
1102 +                                     # Check if we should continue to next page
1103 +                                     if i < total_pages - 1:
1104 +                                         next_button = await page.query_selector('button[aria-label="Next page"]')
1105 +                                         if next_button:
1106 +                                             # Check if button is disabled
1107 +                                             is_disabled = await next_button.get_attribute('disabled')
1108 +                                             if is_disabled:
1109 +                                                 logger.info(f"Reached last page at page {i+1}")
1110 +                                                 break
1111 +
1112 +                                             # Click next page
1113 +                                             await next_button.click()
1114 +                                             await page.wait_for_timeout(1000)
1115 +                                         else:
1116 +                                             logger.info("Next page button not found")
1117 +                                             break
1118 +                                 except Exception as e:
1119 +                                     logger.error(f"Error capturing page {i+1}: {e}")
1120 +                                     continue
1121 +
1122 +                             # Create PDF from screenshots
1123 +                             if screenshots:
1124 +                                 # Get dimensions from first screenshot
1125 +                                 first_img = Image.open(screenshots[0])
1126 +                                 width, height = first_img.size
1127
1128 +                                 # Create PDF
1129 +                                 c = canvas.Canvas(save_path, pagesize=(width, height))
1130 +                                 for screenshot in screenshots:
1131 +                                     c.drawImage(screenshot, 0, 0, width, height)
1132 +                                     c.showPage()
1133 +                                 c.save()
1134
1135 +                                 # Clean up screenshots
1136 +                                 for screenshot in screenshots:
1137 +                                     os.remove(screenshot)
1138
1139 +                                 # Clean up temp directory
1140 +                                 shutil.rmtree(temp_dir, ignore_errors=True)
1141 +
1142 +                                 return save_path
1143 +                             else:
1144 +                                 logger.error("No screenshots captured")
1145 +                         else:
1146 +                             # For non-PDF files, just take a screenshot
1147 +                             screenshot_path = os.path.join(temp_dir, "file.png")
1148 +                             await page.screenshot(path=screenshot_path)
1149
1150 +                             # Copy to destination
1151 +                             shutil.copy(screenshot_path, save_path)
1152
1153 +                             # Clean up
1154 +                             os.remove(screenshot_path)
1155                               shutil.rmtree(temp_dir, ignore_errors=True)
1156
1157                               return save_path
1158 +                     finally:
1159 +                         await browser.close()
1160 +             elif self.browser_engine == "pyppeteer":
1161 +                 # Similar implementation for Pyppeteer
1162 +                 pass
1163
1164               return None
1165           except Exception as e:
1167               return None
1168
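The PDF assembly above draws each screenshot onto a reportlab canvas page sized to the image. If reportlab were not available, the same screenshots-to-PDF step could be done with Pillow alone; a minimal sketch under that assumption (file names hypothetical):

from PIL import Image

def screenshots_to_pdf(screenshot_paths, pdf_path):
    # PDF pages cannot carry an alpha channel, so convert the PNG captures to RGB first.
    pages = [Image.open(p).convert("RGB") for p in screenshot_paths]
    pages[0].save(pdf_path, save_all=True, append_images=pages[1:])

screenshots_to_pdf(["page_1.png", "page_2.png"], "gdrive_document.pdf")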
1169       async def get_sublinks(self, url, limit=10000):
1170 +         """Extract all sublinks from a website"""
1171           links = set()
1172           try:
1173               logger.info(f"Extracting sublinks from {url}")
1183                   logger.info(f"Found {len(links)} sublinks with specialized method")
1184                   return list(links)[:limit]
1185
1186 +             # Standard link extraction for all sites
1187 +             await self.browser.goto(url, timeout=30000)
1188
1189               # Get page content
1190 +             content = await self.browser.content()
1191               soup = BeautifulSoup(content, 'html.parser')
1192
1193 +             # Get base URL for resolving relative links
1194               parsed_base = urlparse(url)
1195               base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1196
1197 +             # Extract all links from the page
1198               for a in soup.find_all('a', href=True):
1199                   href = a['href']
1200                   if href and not href.startswith('javascript:') and not href.startswith('#'):
1220               logger.error(f"Error extracting sublinks: {e}")
1221               return list(links)[:limit]
1222
1223 +     @celery_app.task
1224 +     def download_file_task(file_info, save_dir, referer=None):
1225 +         """Celery task for downloading files asynchronously"""
1226 +         # This function runs in a separate worker process
1227 +         file_url = file_info['url']
1228 +         fname = file_info['filename']
1229 +         referer = referer or file_info.get('source_url', 'https://www.google.com')
1230 +
1231 +         # Create unique filename
1232 +         path = os.path.join(save_dir, fname)
1233 +         base, ext = os.path.splitext(fname)
1234 +         counter = 1
1235 +         while os.path.exists(path):
1236 +             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
1237 +             counter += 1
1238 +
1239 +         os.makedirs(save_dir, exist_ok=True)
1240 +
1241 +         try:
1242 +             # Handle Google Drive files
1243 +             if "drive.google.com" in file_url or "docs.google.com" in file_url:
1244 +                 # Extract file ID
1245 +                 file_id = None
1246 +                 for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
1247 +                     match = re.search(pattern, file_url)
1248 +                     if match:
1249 +                         file_id = match.group(1)
1250 +                         break
1251 +
1252 +                 if file_id:
1253 +                     # Try direct download
1254 +                     download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
1255 +                     headers = {
1256 +                         'User-Agent': get_random_user_agent(),
1257 +                         'Referer': referer
1258 +                     }
1259 +
1260 +                     with requests.get(download_url, headers=headers, stream=True) as r:
1261 +                         if r.status_code == 200:
1262 +                             with open(path, 'wb') as f:
1263 +                                 for chunk in r.iter_content(chunk_size=8192):
1264 +                                     f.write(chunk)
1265 +
1266 +                             # Check if this is HTML (common for Google Drive restrictions)
1267 +                             with open(path, 'rb') as f:
1268 +                                 content_start = f.read(100).decode('utf-8', errors='ignore')
1269 +                             if '<html' in content_start.lower():
1270 +                                 os.remove(path)
1271 +                                 return {'status': 'error', 'message': 'Received HTML instead of file'}
1272 +
1273 +                             return {'status': 'success', 'path': path}
1274 +
1275 +             # Standard download for regular files
1276 +             headers = {
1277 +                 'User-Agent': get_random_user_agent(),
1278 +                 'Referer': referer,
1279 +                 'Accept': '*/*',
1280 +                 'Accept-Encoding': 'gzip, deflate, br'
1281 +             }
1282 +
1283 +             with requests.get(file_url, headers=headers, stream=True) as r:
1284 +                 if r.status_code == 200:
1285 +                     with open(path, 'wb') as f:
1286 +                         for chunk in r.iter_content(chunk_size=8192):
1287 +                             f.write(chunk)
1288 +
1289 +                     return {'status': 'success', 'path': path}
1290 +                 else:
1291 +                     return {'status': 'error', 'message': f"HTTP error: {r.status_code}"}
1292 +
1293 +         except Exception as e:
1294 +             return {'status': 'error', 'message': str(e)}
1295 +
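Unlike DownloadManager.download_file, download_file_task returns a plain status dict, so it can be enqueued on a worker and polled from anywhere. A minimal usage sketch, assuming the celery_app defined earlier in the file is configured with a result backend:

# file_info is illustrative; any dict with 'url' and 'filename' keys works.
file_info = {'url': 'https://example.com/report.pdf', 'filename': 'report.pdf'}
async_result = download_file_task.delay(file_info, './downloads')
outcome = async_result.get(timeout=300)  # blocks until the worker returns its status dict
if outcome['status'] == 'success':
    print('Saved to', outcome['path'])
else:
    print('Download failed:', outcome['message'])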
1296       async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
1297 +         """Perform deep search for files on a website and its subpages"""
1298           if not custom_ext_list:
1299               custom_ext_list = []
1300
1301 +         # Create progress indicators
1302           progress_text = st.empty()
1303           progress_bar = st.progress(0)
1304           file_count_text = st.empty()
1317               total_links = len(sublinks)
1318               progress_text.text(f"Found {total_links} sublinks to process")
1319
1320 +             # Initialize all_files with main_files to ensure they're included
1321               all_files = main_files.copy()
1322
1323 +             # Process each sublink
1324 +             for i, sublink in enumerate(sublinks, 1):
1325 +                 progress = i / max(total_links, 1)  # Avoid division by zero
1326 +                 progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
1327 +                 progress_bar.progress(progress)
1328 +
1329 +                 try:
1330 +                     # Extract files from sublink
1331 +                     sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
1332 +                     all_files.extend(sub_files)
1333 +                     file_count_text.text(f"Found {len(all_files)} total files")
1334 +                 except Exception as e:
1335 +                     logger.warning(f"Error processing sublink {sublink}: {e}")
1336
1337               # Deduplicate files
1338               seen_urls = set()
1360               progress_text.empty()
1361               progress_bar.empty()
1362
1363 + # -------------------- Main App --------------------
1364   def main():
1365       st.title("Advanced File Downloader")
1366
1369           st.session_state.initialized = True
1370           st.session_state.discovered_files = []
1371           st.session_state.current_url = None
1372 +         st.session_state.google_creds = None
1373           st.session_state.selected_files = []
1374           st.session_state.do_deep_search = False
1375           st.session_state.deep_search_url = None
1376           st.session_state.search_results = []
1377           st.session_state.download_urls = {}  # For direct download links
1378
1379 +     # Install dependencies if needed
1380 +     if "dependencies_installed" not in st.session_state:
1381 +         with st.spinner("Setting up dependencies. This may take a minute..."):
1382 +             st.session_state.dependencies_installed = setup_dependencies()
1383 +             check_services()
1384
1385 +     # Sidebar options
1386       with st.sidebar:
1387 +         mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Single File"], key="mode_select")
1388
1389 +         with st.expander("Search Options", expanded=True):
1390 +             search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine")
1391 +             browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine")
1392               custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
1393                                                 help="Enter extensions like .csv, .txt")
1394               max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
1395               sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
1396 +
1397 +         with st.expander("Advanced Options", expanded=False):
1398               use_proxy = st.checkbox("Use Proxy", key="use_proxy")
1399               proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
1400               use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
1401                                         help="Makes browser harder to detect as automated")
1402 +             enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"],
1403 +                                                    key="enable_intercept",
1404 +                                                    help="Intercept network traffic to find additional files")
1405 +             if enable_network_intercept:
1406 +                 NETWORK_INTERCEPTOR_CONFIG["enabled"] = True
1407 +                 intercept_types = st.multiselect("Intercept Types",
1408 +                                                  ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"],
1409 +                                                  default=["xhr", "fetch", "document", "media"],
1410 +                                                  key="intercept_types")
1411 +                 NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types
1412 +             else:
1413 +                 NETWORK_INTERCEPTOR_CONFIG["enabled"] = False
1414
1415 +         with st.expander("Google Drive Integration", expanded=False):
1416 +             if st.button("Start Google Sign-In", key="google_signin_btn"):
1417 +                 auth_url = get_google_auth_url()
1418 +                 st.markdown(f"[Click here to authorize]({auth_url})")
1419 +             auth_code = st.text_input("Enter authorization code", key="auth_code_input")
1420 +             if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
1421 +                 creds, msg = exchange_code_for_credentials(auth_code)
1422 +                 st.session_state.google_creds = creds
1423 +                 st.write(msg)
1424
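The network-interception checkbox above only toggles NETWORK_INTERCEPTOR_CONFIG; the capture itself lives inside DownloadManager and is not part of this hunk. As a rough sketch of how such capture can be wired up with Playwright (the attach_interceptor helper and its save_dir argument are illustrative, not the app's actual implementation):

import os

async def attach_interceptor(page, save_dir, config=NETWORK_INTERCEPTOR_CONFIG):
    """Save intercepted responses of the configured resource types into save_dir."""
    os.makedirs(save_dir, exist_ok=True)

    async def on_response(response):
        if response.request.resource_type not in config["intercept_types"]:
            return
        try:
            body = await response.body()
        except Exception:
            return  # redirects and aborted requests have no body
        max_size = config.get("max_size")  # optional size cap, if the config defines one
        if max_size and len(body) > max_size:
            return
        name = os.path.basename(response.url.split("?")[0]) or "response.bin"
        with open(os.path.join(save_dir, name), "wb") as f:
            f.write(body)

    page.on("response", on_response)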
1425       # Main content area
1426       if mode == "Manual URL":
1427           st.header("Manual URL Mode")
1428           url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")
1429
1430 +         col1, col2 = st.columns([3, 1])
1431 +         with col1:
1432 +             if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
1433 +                 if url:
1434 +                     # Process custom extensions
1435 +                     custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
1436
1437 +                     with st.spinner("Searching for files..."):
1438 +                         async def run_deep_search():
1439 +                             async with DownloadManager(
1440 +                                 browser_engine=browser_engine,
1441 +                                 use_proxy=use_proxy,
1442 +                                 proxy=proxy,
1443 +                                 use_stealth=use_stealth
1444 +                             ) as dm:
1445 +                                 files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
1446 +                                 return files
1447 +
1448 +                         # Run the search
1449 +                         files = asyncio.run(run_deep_search())
1450 +
1451 +                         if files:
1452 +                             st.session_state.discovered_files = files
1453 +                             st.session_state.current_url = url
1454 +                             st.success(f"Found {len(files)} files!")
1455 +                         else:
1456 +                             st.warning("No files found.")
1457
1458           # Display and process discovered files
1459           if st.session_state.discovered_files:
1482                   file_info = f"{filename} ({size})"
1483
1484                   file_options.append((i, file_info))
1485 +
1486 +                 # Generate direct download URL for this file
1487 +                 if i not in st.session_state.download_urls:
1488 +                     # Generate a unique key for this file
1489 +                     file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode()
1490 +                     st.session_state.download_urls[i] = file_key
1491
1492               # File selection multiselect
1493               selected_indices = st.multiselect(
1500
1501               st.session_state.selected_files = selected_indices
1502
1503 +             # Display individual files with direct download links
1504               if files:
1505                   st.subheader("Available Files")
1506                   for i, file in enumerate(files):
1508                           st.write(f"Source: {file.get('source_url', 'Unknown')}")
1509                           st.write(f"URL: {file['url']}")
1510
1511 +                         # Download button for this specific file
1512 +                         if st.button(f"Download this file", key=f"download_single_{i}"):
1513                               with st.spinner(f"Downloading {file['filename']}..."):
1514                                   # Create downloads directory
1515                                   download_dir = "./downloads"
1518                                   # Download the file
1519                                   async def download_single():
1520                                       async with DownloadManager(
1521 +                                         browser_engine=browser_engine,
1522                                           use_proxy=use_proxy,
1523                                           proxy=proxy,
1524                                           use_stealth=use_stealth
1551               if selected_indices:
1552                   st.subheader("Batch Download Options")
1553
1554 +                 col1, col2, col3, col4 = st.columns(4)
1555                   with col1:
1556                       download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
1557                   with col2:
1558                       create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
1559                   with col3:
1560                       delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
1561 +                 with col4:
1562 +                     upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
1563
1564                   if st.button("Download Selected Files", key="batch_download_btn"):
1565                       with st.spinner(f"Downloading {len(selected_indices)} files..."):
1573
1574                           async def download_batch():
1575                               async with DownloadManager(
1576 +                                 browser_engine=browser_engine,
1577                                   use_proxy=use_proxy,
1578                                   proxy=proxy,
1579                                   use_stealth=use_stealth
1614                                   key="download_zip_btn"
1615                               )
1616
1617 +                             # Upload to Google Drive if requested
1618 +                             if upload_to_drive and st.session_state.google_creds:
1619 +                                 with st.spinner("Uploading to Google Drive..."):
1620 +                                     drive_service = googleapiclient.discovery.build(
1621 +                                         "drive", "v3", credentials=st.session_state.google_creds
1622 +                                     )
1623 +                                     folder_id = create_drive_folder(
1624 +                                         drive_service, f"Downloads_{get_domain(url)}"
1625 +                                     )
1626 +                                     drive_id = google_drive_upload(
1627 +                                         zip_path, st.session_state.google_creds, folder_id
1628 +                                     )
1629 +
1630 +                                     if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
1631 +                                         st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
1632 +                                     else:
1633 +                                         st.error(drive_id)
1634 +
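create_drive_folder is defined elsewhere in app.py. For orientation only, a Drive v3 folder-creation helper generally looks like the sketch below (not necessarily the exact helper used here); google_drive_upload then places the ZIP inside the returned folder ID.

def create_drive_folder_sketch(drive_service, folder_name):
    # Folders are ordinary Drive files with a special MIME type.
    metadata = {
        "name": folder_name,
        "mimeType": "application/vnd.google-apps.folder",
    }
    folder = drive_service.files().create(body=metadata, fields="id").execute()
    return folder.get("id")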
1635                               # Delete original files if requested
1636                               if delete_after:
1637                                   for path in downloaded_paths:
1650
1651           if st.button("Search", key="web_search_btn"):
1652               if query:
1653 +                 with st.spinner("Searching the web..."):
1654                       async def run_search():
1655                           async with DownloadManager(
1656 +                             browser_engine=browser_engine,
1657                               use_proxy=use_proxy,
1658                               proxy=proxy,
1659                               query=query,
1660                               num_results=num_results,
1661                               use_stealth=use_stealth
1662                           ) as dm:
1663 +                             urls = await dm.search_web(search_engine)
1664                               return urls
1665
1666                       urls = asyncio.run(run_search())
1693                       with st.spinner("Searching for files..."):
1694                           async def deep_search_result():
1695                               async with DownloadManager(
1696 +                                 browser_engine=browser_engine,
1697                                   use_proxy=use_proxy,
1698                                   proxy=proxy,
1699                                   use_stealth=use_stealth
1709                       else:
1710                           st.warning("No files found on this page.")
1711
1712 +     elif mode == "Single File":
1713 +         st.header("Single File Download")
1714
1715           # View-only Google Drive download
1716 +         with st.expander("Download View-Only Google Drive Document", expanded=True):
1717 +             st.write("Download protected/view-only Google Drive documents")
1718 +
1719 +             file_id = st.text_input(
1720 +                 "Google Drive File ID",
1721 +                 placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
1722 +                 key="drive_file_id"
1723 +             )
1724 +
1725 +             if st.button("Download Document", key="drive_download_btn") and file_id:
1726 +                 with st.spinner("Downloading view-only document... (this may take a minute)"):
1727 +                     # Create download directory
1728 +                     download_dir = "./downloads"
1729 +                     os.makedirs(download_dir, exist_ok=True)
1730 +
1731 +                     # Set output path
1732 +                     output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
1733 +
1734 +                     # Download the file
1735 +                     async def download_drive_file():
1736 +                         async with DownloadManager(
1737 +                             browser_engine=browser_engine,
1738 +                             use_proxy=use_proxy,
1739 +                             proxy=proxy,
1740 +                             use_stealth=use_stealth
1741 +                         ) as dm:
1742 +                             file_info = {
1743 +                                 'url': f"https://drive.google.com/file/d/{file_id}/view",
1744 +                                 'filename': f"gdrive_{file_id}.pdf",
1745 +                                 'metadata': {'file_id': file_id, 'view_only': True}
1746 +                             }
1747 +                             return await dm.download_viewonly_google_drive(file_info, output_path)
1748 +
1749 +                     result_path = asyncio.run(download_drive_file())
1750 +
1751 +                     if result_path:
1752 +                         st.success("Document downloaded successfully!")
1753 +
1754 +                         # Provide download link
1755 +                         with open(result_path, "rb") as f:
1756 +                             file_bytes = f.read()
1757 +
1758 +                         st.download_button(
1759 +                             label="Download PDF",
1760 +                             data=file_bytes,
1761 +                             file_name=os.path.basename(result_path),
1762 +                             mime="application/pdf",
1763 +                             key="drive_pdf_download"
1764 +                         )
1765 +                     else:
1766 +                         st.error("Failed to download the document. Please check the file ID and try again.")
1767
1768 +         # Direct URL download
1769 +         with st.expander("Download from Direct URL", expanded=True):
1770 +             st.write("Download a file from a direct URL")
1771 +
1772 +             file_url = st.text_input(
1773 +                 "File URL",
1774 +                 placeholder="https://example.com/file.pdf",
1775 +                 key="direct_url"
1776 +             )
1777 +
1778 +             file_name = st.text_input(
1779 +                 "Save as (optional)",
1780 +                 placeholder="Leave blank to use original filename",
1781 +                 key="save_filename"
1782 +             )
1783 +
1784 +             if st.button("Download File", key="direct_download_btn") and file_url:
1785 +                 with st.spinner("Downloading file..."):
1786 +                     # Create download directory
1787 +                     download_dir = "./downloads"
1788 +                     os.makedirs(download_dir, exist_ok=True)
1789
1790 +                     # Determine filename
1791 +                     if not file_name:
1792 +                         file_name = os.path.basename(urlparse(file_url).path)
1793 +                         if not file_name or file_name == '/':
1794 +                             file_name = f"downloaded_file_{int(time.time())}{get_file_extension(file_url)}"
1795
1796 +                     # Create file info
1797 +                     file_info = {
1798 +                         'url': file_url,
1799 +                         'filename': file_name,
1800 +                         'metadata': {}
1801 +                     }
1802 +
1803 +                     # Download the file
1804 +                     async def download_direct_file():
1805 +                         async with DownloadManager(
1806 +                             browser_engine=browser_engine,
1807 +                             use_proxy=use_proxy,
1808 +                             proxy=proxy,
1809 +                             use_stealth=use_stealth
1810 +                         ) as dm:
1811 +                             return await dm.download_file(file_info, download_dir)
1812 +
1813 +                     file_path = asyncio.run(download_direct_file())
1814 +
1815 +                     if file_path:
1816 +                         st.success(f"File downloaded successfully to {file_path}")
1817 +
1818 +                         # Provide download link
1819 +                         with open(file_path, "rb") as f:
1820 +                             file_bytes = f.read()
1821 +
1822 +                         mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
1823 +
1824 +                         st.download_button(
1825 +                             label=f"Download {os.path.basename(file_path)}",
1826 +                             data=file_bytes,
1827 +                             file_name=os.path.basename(file_path),
1828 +                             mime=mime_type,
1829 +                             key="direct_file_download"
1830 +                         )
1831 +                     else:
1832 +                         st.error("Failed to download the file. Please check the URL and try again.")
1833
1834       # Footer
1835       st.markdown("---")
1836 +     st.markdown("Created by [Euler314](https://github.com/euler314) | Enhanced with advanced scraping technologies")
1837
1838   # Run the app
1839   if __name__ == "__main__":