euler314 committed (verified)
Commit 18121e0 · Parent(s): e5712d5

Update app.py

Files changed (1):
  1. app.py +403 -862

app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  import os
3
  import asyncio
@@ -21,31 +22,32 @@ from PIL import Image
21
  from reportlab.lib.pagesizes import letter
22
  from reportlab.pdfgen import canvas
23
 
24
- # Advanced imports
25
- from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
26
- from bs4 import BeautifulSoup
27
- from PyPDF2 import PdfReader
28
- import google_auth_oauthlib.flow
29
- import googleapiclient.discovery
30
- import google.auth.transport.requests
31
- import googleapiclient.http
32
  import requests
33
- import celery
34
- from celery import Celery
35
- import splash
36
- import pyppeteer
37
- import mitmproxy
38
 - from mitmproxy import http
39
 
40
  # Configure page and logging
41
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
42
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
43
  logger = logging.getLogger(__name__)
44
 
45
- # Initialize Celery for distributed task processing
46
- celery_app = Celery('file_downloader', broker='redis://localhost:6379/0')
47
-
48
- # Configure Google OAuth
49
  GOOGLE_OAUTH_CONFIG = {
50
  "web": {
51
  "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -58,7 +60,7 @@ GOOGLE_OAUTH_CONFIG = {
58
  }
59
  }
60
 
61
- # -------------------- User Agent Settings --------------------
62
  USER_AGENTS = [
63
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
64
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
@@ -66,14 +68,9 @@ USER_AGENTS = [
66
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
67
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
68
  'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
69
- 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
70
  ]
71
 
72
- # -------------------- Proxy Management --------------------
73
- PROXY_POOL = []
74
- CURRENT_PROXY_INDEX = 0
75
-
76
- # -------------------- Network Interception Configuration --------------------
77
  NETWORK_INTERCEPTOR_CONFIG = {
78
  "enabled": False,
79
  "intercept_types": ["xhr", "fetch", "document", "media"],
@@ -81,7 +78,7 @@ NETWORK_INTERCEPTOR_CONFIG = {
81
  "intercept_folder": "./intercepted_data"
82
  }
83
 
84
- # -------------------- Utility Functions --------------------
85
  def get_random_user_agent():
86
  return random.choice(USER_AGENTS)
87
 
@@ -117,8 +114,11 @@ def is_valid_file_url(url, extensions):
117
  """Check if URL is a valid file URL based on extension"""
118
  return any(url.lower().endswith(ext) for ext in extensions)
119
 
120
- # -------------------- Google Drive Functions --------------------
121
  def get_google_auth_url():
 
 
 
122
  client_config = GOOGLE_OAUTH_CONFIG["web"]
123
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
124
  {"web": client_config},
@@ -133,6 +133,9 @@ def get_google_auth_url():
133
  return authorization_url
134
 
135
  def exchange_code_for_credentials(auth_code):
 
 
 
136
  if not auth_code.strip():
137
  return None, "No code provided."
138
  try:
@@ -151,6 +154,9 @@ def exchange_code_for_credentials(auth_code):
151
  return None, f"Error during token exchange: {e}"
152
 
153
  def google_drive_upload(file_path, credentials, folder_id=None):
 
 
 
154
  try:
155
  drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
156
  file_metadata = {'name': os.path.basename(file_path)}
@@ -163,164 +169,59 @@ def google_drive_upload(file_path, credentials, folder_id=None):
163
  return f"Error uploading to Drive: {str(e)}"
164
 
165
  def create_drive_folder(drive_service, name):
 
 
 
166
  folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
167
  folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
168
  return folder.get('id')
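The Drive helpers above wrap google-auth-oauthlib and the Drive v3 client. For reference, here is a compact sketch of the OAuth exchange they perform; the scope and redirect URI are illustrative assumptions, since the full GOOGLE_OAUTH_CONFIG is collapsed in this view.

import google_auth_oauthlib.flow

def build_auth_flow(client_config: dict, redirect_uri: str):
    # Build the OAuth flow from the in-app client config (same call the
    # functions above use) and produce the URL the user must visit.
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": client_config},
        scopes=["https://www.googleapis.com/auth/drive.file"],  # assumed scope
    )
    flow.redirect_uri = redirect_uri
    auth_url, _ = flow.authorization_url(access_type="offline", prompt="consent")
    return flow, auth_url

# After the user pastes the authorization code back into the app:
#   flow.fetch_token(code=auth_code)
#   credentials = flow.credentials   # pass to googleapiclient.discovery.build()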
169
 
170
- # -------------------- Setup Functions --------------------
171
- def setup_dependencies():
172
- """Install required system dependencies"""
173
  try:
174
  # Install system dependencies
175
  subprocess.run(['apt-get', 'update', '-y'], check=True)
176
  packages = [
177
  'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
178
  'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
179
- 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
180
- 'redis-server', 'python3-dev', 'build-essential'
181
  ]
182
  subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
183
 
184
- # Install Python packages
185
- subprocess.run(['pip', 'install', 'playwright', 'pyppeteer', 'splash', 'celery[redis]', 'mitmproxy'], check=True)
186
-
187
- # Install browsers
188
- subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
189
- subprocess.run(['python3', '-m', 'pyppeteer', 'install'], check=True)
190
 
191
  st.success("Dependencies installed successfully!")
192
  return True
193
  except Exception as e:
194
  st.error(f"Error installing dependencies: {e}")
195
- st.info("You may need to manually install dependencies. Check console for details.")
196
  logger.error(f"Setup error: {e}")
197
  traceback.print_exc()
198
  return False
199
 
200
- def check_services():
201
- """Check if required services are running"""
202
- try:
203
- # Check Redis for Celery
204
- redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG'
205
- if not redis_running:
206
- # Try to start Redis
207
- subprocess.run(['service', 'redis-server', 'start'], check=True)
208
-
209
- # Create directories for intercepted data
210
- os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True)
211
-
212
- return True
213
- except Exception as e:
214
- logger.error(f"Service check error: {e}")
215
- return False
216
-
217
- # -------------------- Network Interception Classes --------------------
218
- class NetworkInterceptor:
219
- """Class to intercept network traffic using mitmproxy"""
220
-
221
- def __init__(self, intercept_types=None, save_path=None):
222
- self.intercept_types = intercept_types or ["xhr", "fetch", "document"]
223
- self.save_path = save_path or "./intercepted_data"
224
- os.makedirs(self.save_path, exist_ok=True)
225
- self.captured_data = []
226
-
227
- def intercept_request(self, flow):
228
- """Process intercepted requests"""
229
- try:
230
- url = flow.request.url
231
- method = flow.request.method
232
- content_type = flow.request.headers.get("Content-Type", "")
233
-
234
- # Log the request
235
- self.captured_data.append({
236
- "type": "request",
237
- "url": url,
238
- "method": method,
239
- "headers": dict(flow.request.headers),
240
- "timestamp": time.time()
241
- })
242
-
243
- logger.info(f"Intercepted {method} request to {url}")
244
- except Exception as e:
245
- logger.error(f"Error intercepting request: {e}")
246
-
247
- def intercept_response(self, flow):
248
- """Process intercepted responses"""
249
- try:
250
- url = flow.request.url
251
- status_code = flow.response.status_code
252
- content_type = flow.response.headers.get("Content-Type", "")
253
-
254
- # Only process responses of interest based on content type
255
- if any(t in content_type.lower() for t in ["application/pdf", "application/msword",
256
- "application/vnd.openxmlformats",
257
- "application/zip"]):
258
- # Save the file
259
- filename = os.path.basename(urlparse(url).path)
260
- if not filename or filename == '/':
261
- filename = f"file_{int(time.time())}"
262
-
263
- # Try to add extension based on content type
264
- if "pdf" in content_type:
265
- filename += ".pdf"
266
- elif "msword" in content_type:
267
- filename += ".doc"
268
- elif "openxmlformats" in content_type and "wordprocessingml" in content_type:
269
- filename += ".docx"
270
- elif "zip" in content_type:
271
- filename += ".zip"
272
-
273
- file_path = os.path.join(self.save_path, filename)
274
- with open(file_path, "wb") as f:
275
- f.write(flow.response.content)
276
-
277
- logger.info(f"Saved intercepted file: {file_path}")
278
-
279
- # Record metadata about the captured file
280
- self.captured_data.append({
281
- "type": "file",
282
- "url": url,
283
- "content_type": content_type,
284
- "size": len(flow.response.content),
285
- "path": file_path,
286
- "timestamp": time.time()
287
- })
288
- except Exception as e:
289
- logger.error(f"Error intercepting response: {e}")
290
-
291
- def get_captured_files(self):
292
- """Return list of captured files"""
293
- return [item for item in self.captured_data if item["type"] == "file"]
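The NetworkInterceptor removed above was never wired into a running proxy in this file. For context, a minimal mitmproxy addon that saves PDF responses would look roughly like this (a hedged sketch, not part of the commit): mitmproxy discovers hook methods named request/response on objects listed in addons, and the script is run with mitmdump -s.

from mitmproxy import http

class SaveDocuments:
    def __init__(self, save_path="./intercepted_data"):
        # Assumes save_path already exists (the removed class created it in __init__).
        self.save_path = save_path

    def response(self, flow: http.HTTPFlow) -> None:
        # Persist interesting responses; mirrors the intercept_response logic above.
        ctype = flow.response.headers.get("Content-Type", "")
        if "application/pdf" in ctype.lower():
            name = f"capture_{flow.request.host}_{flow.request.timestamp_start:.0f}.pdf"
            with open(f"{self.save_path}/{name}", "wb") as f:
                f.write(flow.response.content)

addons = [SaveDocuments()]
# Run with: mitmdump -s intercept_addon.py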
294
-
295
- # -------------------- Browser Automation Classes --------------------
296
- class MultiEngineBrowser:
297
- """Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)"""
298
-
299
- def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True):
300
- self.engine = engine
301
  self.use_proxy = use_proxy
302
  self.proxy = proxy
303
- self.stealth = stealth
 
 
 
304
  self.browser = None
305
  self.context = None
306
  self.page = None
307
-
308
- async def setup(self):
309
- """Initialize browser based on selected engine"""
310
- if self.engine == "playwright":
311
- return await self.setup_playwright()
312
- elif self.engine == "pyppeteer":
313
- return await self.setup_pyppeteer()
314
- elif self.engine == "splash":
315
- return await self.setup_splash()
316
- else:
317
- raise ValueError(f"Unsupported browser engine: {self.engine}")
318
-
319
- async def setup_playwright(self):
320
- """Setup Playwright browser"""
321
- from playwright.async_api import async_playwright
322
323
  self.playwright = await async_playwright().start()
 
 
324
  browser_args = [
325
  '--no-sandbox',
326
  '--disable-setuid-sandbox',
@@ -329,7 +230,7 @@ class MultiEngineBrowser:
329
  '--disable-features=IsolateOrigins,site-per-process',
330
  ]
331
 
332
- if self.stealth:
333
  browser_args.extend([
334
  '--disable-blink-features=AutomationControlled',
335
  '--disable-features=IsolateOrigins'
@@ -343,8 +244,10 @@ class MultiEngineBrowser:
343
  if self.use_proxy and self.proxy:
344
  launch_options["proxy"] = {"server": self.proxy}
345
 
 
346
  self.browser = await self.playwright.chromium.launch(**launch_options)
347
 
 
348
  context_options = {
349
  "viewport": {"width": 1920, "height": 1080},
350
  "user_agent": get_random_user_agent(),
@@ -353,10 +256,10 @@ class MultiEngineBrowser:
353
  "accept_downloads": True
354
  }
355
 
 
356
  self.context = await self.browser.new_context(**context_options)
357
 
358
- # Apply stealth features
359
- if self.stealth:
360
  await self.context.add_init_script("""
361
  Object.defineProperty(navigator, 'webdriver', { get: () => false });
362
  Object.defineProperty(navigator, 'plugins', {
@@ -366,221 +269,50 @@ class MultiEngineBrowser:
366
  window.chrome = { runtime: {} };
367
  """)
368
 
 
369
  self.page = await self.context.new_page()
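The init script above only masks the usual automation tells (navigator.webdriver, plugins, languages). A quick way to sanity-check that it took effect is to evaluate the property on a freshly created page (a small sketch, not part of the diff):

async def verify_stealth(context) -> bool:
    # Returns True when the init script successfully spoofed navigator.webdriver.
    page = await context.new_page()
    try:
        await page.goto("https://example.com", timeout=15000)
        return (await page.evaluate("navigator.webdriver")) is False
    finally:
        await page.close()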
370
- return self.page
371
-
372
- async def setup_pyppeteer(self):
373
- """Setup Pyppeteer browser"""
374
- from pyppeteer import launch
375
-
376
- browser_args = [
377
- '--no-sandbox',
378
- '--disable-setuid-sandbox',
379
- '--disable-dev-shm-usage',
380
- '--disable-web-security',
381
- ]
382
-
383
- if self.stealth:
384
- browser_args.extend([
385
- '--disable-blink-features=AutomationControlled',
386
- '--disable-features=IsolateOrigins'
387
- ])
388
-
389
- launch_options = {
390
- "headless": True,
391
- "args": browser_args,
392
- "ignoreHTTPSErrors": True,
393
- "userDataDir": tempfile.mkdtemp()
394
- }
395
-
396
- if self.use_proxy and self.proxy:
397
- browser_args.append(f'--proxy-server={self.proxy}')
398
-
399
- self.browser = await launch(launch_options)
400
- self.page = await self.browser.newPage()
401
-
402
- # Set user agent
403
- await self.page.setUserAgent(get_random_user_agent())
404
-
405
- # Set viewport
406
- await self.page.setViewport({"width": 1920, "height": 1080})
407
-
408
- # Apply stealth features
409
- if self.stealth:
410
- await self.page.evaluateOnNewDocument("""
411
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
412
- Object.defineProperty(navigator, 'plugins', {
413
- get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
414
- });
415
- Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
416
- window.chrome = { runtime: {} };
417
- """)
418
-
419
- return self.page
420
-
421
- async def setup_splash(self):
422
- """Setup Splash browser through API"""
423
- # Splash is typically used via HTTP API
424
- # We'll use requests for this
425
- self.splash_url = "http://localhost:8050/render.html"
426
- return None # No actual page object for Splash
427
-
428
- async def goto(self, url, wait_until=None, timeout=30000):
429
- """Navigate to a URL"""
430
- if self.engine == "playwright":
431
- return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout)
432
- elif self.engine == "pyppeteer":
433
- return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout)
434
- elif self.engine == "splash":
435
- # Use Splash HTTP API
436
- params = {
437
- "url": url,
438
- "wait": min(timeout/1000, 30), # Splash uses seconds
439
- "timeout": min(timeout/1000, 60),
440
- "resource_timeout": min(timeout/1000, 30),
441
- "html": 1,
442
- "png": 0,
443
- "render_all": 1
444
- }
445
-
446
- if self.use_proxy and self.proxy:
447
- params["proxy"] = self.proxy
448
-
449
- headers = {"User-Agent": get_random_user_agent()}
450
- response = requests.get(self.splash_url, params=params, headers=headers)
451
- self.last_html = response.text
452
- return response
453
-
454
- async def content(self):
455
- """Get page content"""
456
- if self.engine == "playwright":
457
- return await self.page.content()
458
- elif self.engine == "pyppeteer":
459
- return await self.page.content()
460
- elif self.engine == "splash":
461
- return self.last_html
462
-
463
- async def close(self):
464
- """Close browser"""
465
- if self.engine == "playwright":
466
- if self.browser:
467
- await self.browser.close()
468
- if self.playwright:
469
- await self.playwright.stop()
470
- elif self.engine == "pyppeteer":
471
- if self.browser:
472
- await self.browser.close()
473
- # No cleanup needed for Splash as it's stateless
474
-
475
- # -------------------- Download Manager Class --------------------
476
- class DownloadManager:
477
- def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
478
- self.browser_engine = browser_engine
479
- self.use_proxy = use_proxy
480
- self.proxy = proxy
481
- self.query = query
482
- self.num_results = num_results
483
- self.use_stealth = use_stealth
484
- self.browser = None
485
- self.network_interceptor = None
486
-
487
- # Configure network interception if enabled
488
- if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
489
- self.network_interceptor = NetworkInterceptor(
490
- intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"],
491
- save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"]
492
- )
493
-
494
- async def __aenter__(self):
495
- # Initialize multi-engine browser
496
- self.browser = MultiEngineBrowser(
497
- engine=self.browser_engine,
498
- use_proxy=self.use_proxy,
499
- proxy=self.proxy,
500
- stealth=self.use_stealth
501
- )
502
- self.page = await self.browser.setup()
503
-
504
- # Set headers for better stealth
505
- if self.browser_engine == "playwright":
506
- await self.page.set_extra_http_headers({
507
- 'Accept-Language': 'en-US,en;q=0.9',
508
- 'Accept-Encoding': 'gzip, deflate, br',
509
- 'DNT': '1',
510
- 'Referer': 'https://www.google.com/',
511
- 'Sec-Fetch-Dest': 'document',
512
- 'Sec-Fetch-Mode': 'navigate',
513
- 'Sec-Fetch-Site': 'cross-site',
514
- 'Sec-Fetch-User': '?1',
515
- 'Upgrade-Insecure-Requests': '1'
516
- })
517
 
518
  return self
519
 
520
  async def __aexit__(self, exc_type, exc_val, exc_tb):
521
- await self.browser.close()
 
 
 
522
 
523
- async def search_web(self, search_engine="bing"):
524
- """Search web using specified search engine"""
525
  urls = []
526
  try:
527
- if search_engine == "bing":
528
- search_url = f"https://www.bing.com/search?q={self.query}"
529
- elif search_engine == "google":
530
- search_url = f"https://www.google.com/search?q={self.query}"
531
- else:
532
- raise ValueError(f"Unsupported search engine: {search_engine}")
533
-
534
- await self.browser.goto(search_url, timeout=30000)
535
 
536
- if self.browser_engine == "playwright":
537
- if search_engine == "bing":
538
- links = await self.page.query_selector_all("li.b_algo h2 a")
539
- for link in links[:self.num_results]:
540
- href = await link.get_attribute('href')
541
- if href:
542
- urls.append(href)
543
- elif search_engine == "google":
544
- links = await self.page.query_selector_all("div.g a[href^='http']")
545
- for link in links[:self.num_results]:
546
- href = await link.get_attribute('href')
547
- if href:
548
- urls.append(href)
549
- elif self.browser_engine == "pyppeteer":
550
- if search_engine == "bing":
551
- links = await self.page.querySelectorAll("li.b_algo h2 a")
552
- for link in links[:self.num_results]:
553
- href = await self.page.evaluate('el => el.getAttribute("href")', link)
554
- if href:
555
- urls.append(href)
556
- elif search_engine == "google":
557
- links = await self.page.querySelectorAll("div.g a[href^='http']")
558
- for link in links[:self.num_results]:
559
- href = await self.page.evaluate('el => el.getAttribute("href")', link)
560
- if href:
561
- urls.append(href)
562
- elif self.browser_engine == "splash":
563
- # Parse the HTML with BeautifulSoup
564
- soup = BeautifulSoup(self.browser.last_html, 'html.parser')
565
- if search_engine == "bing":
566
- links = soup.select("li.b_algo h2 a")
567
- for link in links[:self.num_results]:
568
- href = link.get("href")
569
- if href:
570
- urls.append(href)
571
- elif search_engine == "google":
572
- links = soup.select("div.g a[href^='http']")
573
- for link in links[:self.num_results]:
574
- href = link.get("href")
575
- if href:
576
- urls.append(href)
577
 
578
  return urls
579
  except Exception as e:
580
- logger.error(f"Error searching web: {e}")
581
  return []
582
 
583
  async def get_file_size(self, url):
 
584
  try:
585
  headers = {'User-Agent': get_random_user_agent()}
586
  response = requests.head(url, headers=headers, timeout=15)
@@ -593,6 +325,10 @@ class DownloadManager:
593
  return "Unknown Size"
594
 
595
  async def get_pdf_metadata(self, url):
 
 
 
 
596
  try:
597
  headers = {'User-Agent': get_random_user_agent()}
598
  response = requests.get(url, headers=headers, timeout=15, stream=True)
@@ -610,6 +346,7 @@ class DownloadManager:
610
  return {}
611
 
612
  async def extract_real_download_url(self, url):
 
613
  try:
614
  headers = {'User-Agent': get_random_user_agent()}
615
  response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
@@ -619,7 +356,7 @@ class DownloadManager:
619
  return url
620
 
621
  async def get_edu_exam_links(self, url):
622
- """Specialized method for educational exam websites that follows a common pattern."""
623
  try:
624
  logger.info(f"Fetching exam links from {url}")
625
  links = set()
@@ -630,7 +367,7 @@ class DownloadManager:
630
  response = requests.get(url, headers=headers, timeout=30)
631
 
632
  if response.status_code == 200:
633
- # Parse with BeautifulSoup for efficiency
634
  soup = BeautifulSoup(response.text, "html.parser")
635
  parsed_base = urlparse(url)
636
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -640,26 +377,22 @@ class DownloadManager:
640
  href = a["href"]
641
  full_url = urljoin(url, href)
642
 
643
- # Look for text clues
644
  link_text = a.get_text().lower()
645
 
646
- # Special patterns for exam sites (expanded list)
647
  url_patterns = [
648
  "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
649
  "/test/", "/download/", "/files/", "/assignments/",
650
- "paper_", "question_", "exam_", "test_", "past_",
651
- "assignment_", "sample_", "study_material", "notes_",
652
- "/resource/", "/subject/", "/course/", "/material/"
653
  ]
654
 
655
  text_patterns = [
656
  "exam", "paper", "test", "question", "past", "download",
657
- "assignment", "sample", "study", "material", "notes",
658
- "subject", "course", "resource", "pdf", "document",
659
- "view", "open", "get", "solution", "answer"
660
  ]
661
 
662
- # Check URL and text patterns
663
  if any(pattern in full_url.lower() for pattern in url_patterns) or \
664
  any(pattern in link_text for pattern in text_patterns) or \
665
  any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
@@ -667,48 +400,74 @@ class DownloadManager:
667
  except Exception as e:
668
  logger.warning(f"Request-based extraction failed: {e}")
669
 
670
- # Use browser-based approach if needed
671
  if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
672
  logger.info("Using browser for enhanced link extraction")
673
 
674
- # Navigate to the page
675
- await self.browser.goto(url, timeout=45000)
676
 
677
- # Get page content and parse with BeautifulSoup
678
- content = await self.browser.content()
679
  soup = BeautifulSoup(content, "html.parser")
680
  parsed_base = urlparse(url)
681
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
682
 
683
- # Process all links on the page
684
  for a in soup.find_all("a", href=True):
685
  href = a["href"]
686
  full_url = urljoin(url, href)
687
  link_text = a.get_text().lower()
688
 
689
- # Apply the same filtering criteria
690
  url_patterns = [
691
  "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
692
  "/test/", "/download/", "/files/", "/assignments/",
693
- "paper_", "question_", "exam_", "test_", "past_",
694
- "assignment_", "sample_", "study_material", "notes_",
695
- "/resource/", "/subject/", "/course/", "/material/"
696
  ]
697
 
698
  text_patterns = [
699
  "exam", "paper", "test", "question", "past", "download",
700
- "assignment", "sample", "study", "material", "notes",
701
- "subject", "course", "resource", "pdf", "document",
702
- "view", "open", "get", "solution", "answer"
703
  ]
704
 
705
- # Check URL and text patterns
706
  if any(pattern in full_url.lower() for pattern in url_patterns) or \
707
  any(pattern in link_text for pattern in text_patterns) or \
708
  any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
709
  links.add(full_url)
710
 
711
- # Filter to likely exam documents
712
  filtered_links = []
713
  for link in links:
714
  # Common file extensions
@@ -719,8 +478,7 @@ class DownloadManager:
719
  # Common paths for exam documents
720
  if any(pattern in link.lower() for pattern in [
721
  "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
722
- "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
723
- "/resource/", "/material/", "/notes/", "/subjectmaterial/"
724
  ]):
725
  filtered_links.append(link)
726
 
@@ -732,6 +490,7 @@ class DownloadManager:
732
  return []
733
 
734
  async def extract_downloadable_files(self, url, custom_ext_list):
 
735
  found_files = []
736
  try:
737
  # Special handling for educational exam sites
@@ -765,7 +524,7 @@ class DownloadManager:
765
 
766
  # Get metadata for PDFs
767
  meta = {}
768
- if real_url.lower().endswith('.pdf'):
769
  try:
770
  meta = await self.get_pdf_metadata(real_url)
771
  except Exception:
@@ -776,18 +535,18 @@ class DownloadManager:
776
  'filename': filename,
777
  'size': size_str,
778
  'metadata': meta,
779
- 'source_url': url # Add source URL for better tracking
780
  })
781
 
782
  # If we found exam files with the specialized method, return them
783
  if found_files:
784
  return found_files
785
 
786
- # Standard extraction method for all pages
787
- await self.browser.goto(url, timeout=30000)
788
 
789
  # Get page content
790
- content = await self.browser.content()
791
  soup = BeautifulSoup(content, 'html.parser')
792
 
793
  # Define file extensions to look for
@@ -807,7 +566,7 @@ class DownloadManager:
807
 
808
  # Handle PHP and download links separately
809
  if '.php' in href.lower() or 'download' in href.lower():
810
- full_url = href if href.startswith('http') else urljoin(base_url, href)
811
  real_url = await self.extract_real_download_url(full_url)
812
  if real_url and real_url != full_url:
813
  filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
@@ -822,10 +581,10 @@ class DownloadManager:
822
 
823
  # Check for direct file extensions
824
  if any(href.lower().endswith(ext) for ext in all_exts):
825
- file_url = href if href.startswith('http') else urljoin(base_url, href)
826
  size_str = await self.get_file_size(file_url)
827
  meta = {}
828
- if file_url.lower().endswith('.pdf'):
829
  meta = await self.get_pdf_metadata(file_url)
830
  found_files.append({
831
  'url': file_url,
@@ -845,7 +604,7 @@ class DownloadManager:
845
  break
846
 
847
  if file_id:
848
- # Determine if it's a view-only file
849
  is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))
850
 
851
  filename = f"gdrive_{file_id}"
@@ -869,7 +628,7 @@ class DownloadManager:
869
  for elem in soup.find_all(elem_tag):
870
  src = elem.get('src') or elem.get('data')
871
  if src and any(src.lower().endswith(ext) for ext in all_exts):
872
- file_url = src if src.startswith('http') else urljoin(base_url, src)
873
  found_files.append({
874
  'url': file_url,
875
  'filename': os.path.basename(file_url.split('?')[0]),
@@ -893,12 +652,12 @@ class DownloadManager:
893
  return []
894
 
895
  async def download_file(self, file_info, save_dir, referer=None):
896
- """Download a file and provide a direct download link"""
897
  file_url = file_info['url']
898
  fname = file_info['filename']
899
  referer = referer or file_info.get('source_url', 'https://www.google.com')
900
 
901
- # Create unique filename to avoid overwriting
902
  path = os.path.join(save_dir, fname)
903
  base, ext = os.path.splitext(fname)
904
  counter = 1
@@ -911,7 +670,7 @@ class DownloadManager:
911
  try:
912
  # Special handling for Google Drive files
913
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
914
- # For view-only Google Drive files, use specialized method
915
  is_view_only = file_info.get('metadata', {}).get('view_only', False)
916
  if is_view_only:
917
  result_path = await self.download_viewonly_google_drive(file_info, path)
@@ -967,7 +726,7 @@ class DownloadManager:
967
  return None
968
 
969
  async def download_viewonly_google_drive(self, file_info, save_path):
970
- """Download view-only Google Drive documents"""
971
  try:
972
  # Extract file ID
973
  file_id = file_info.get('metadata', {}).get('file_id')
@@ -993,173 +752,147 @@ class DownloadManager:
993
 
994
  logger.info(f"Downloading view-only Google Drive file: {file_id}")
995
 
996
- # Create a dedicated browser session
997
- if self.browser_engine == "playwright":
998
 - from playwright.async_api import async_playwright
999
 
1000
- async with async_playwright() as p:
1001
- browser = await p.chromium.launch(
1002
- headless=True,
1003
- args=[
1004
- '--no-sandbox',
1005
- '--disable-setuid-sandbox',
1006
- '--disable-dev-shm-usage',
1007
- '--disable-web-security',
1008
- '--disable-features=IsolateOrigins,site-per-process',
1009
- '--disable-site-isolation-trials',
1010
- '--disable-blink-features=AutomationControlled'
1011
- ]
1012
- )
1013
-
1014
- # Create context with options for better handling
1015
- context = await browser.new_context(
1016
- viewport={'width': 1600, 'height': 1200},
1017
- user_agent=get_random_user_agent(),
1018
- accept_downloads=True,
1019
- ignore_https_errors=True
1020
- )
1021
-
1022
- # Add stealth script
1023
- await context.add_init_script("""
1024
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
1025
- Object.defineProperty(navigator, 'plugins', {
1026
- get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
1027
- });
1028
- Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
1029
- window.chrome = { runtime: {} };
1030
- """)
1031
 
1032
- page = await context.new_page()
 
1033
 
1034
- try:
1035
- # Visit the file
1036
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
1037
- await page.wait_for_load_state('networkidle')
 
1038
 
1039
- # Wait for content to load
1040
 - await page.wait_for_timeout(5000)
1041
 
1042
- # Create temporary directory for processing
1043
- temp_dir = tempfile.mkdtemp()
1044
 
1045
- # For PDF handling
1046
- if file_type == 'pdf':
1047
- # Create directory for screenshots
1048
- screenshots_dir = os.path.join(temp_dir, "screenshots")
1049
- os.makedirs(screenshots_dir, exist_ok=True)
1050
-
1051
- # Get page count
1052
- total_pages = await page.evaluate("""
1053
- () => {
1054
- // Look for page counters in the interface
1055
- const pageCounters = document.querySelectorAll('*');
1056
- for (const el of pageCounters) {
1057
- const text = el.textContent || '';
1058
- const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
1059
- if (match && match[2]) {
1060
- return parseInt(match[2]);
1061
- }
1062
- }
1063
-
1064
- // Look for paginated pages
1065
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1066
- if (pages.length > 0) return pages.length;
1067
-
1068
- // Default if we can't determine
1069
- return 20;
1070
- }
1071
- """)
1072
-
1073
- logger.info(f"PDF has approximately {total_pages} pages")
1074
-
1075
- # Take screenshots of each page
1076
- screenshots = []
1077
-
1078
- # First try with the page element method
1079
- for i in range(min(total_pages, 100)): # Limit to 100 pages for safety
1080
- try:
1081
- # Navigate to specific page
1082
- if i > 0:
1083
- await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()")
1084
- await page.wait_for_timeout(500)
1085
-
1086
- # Wait for the page to render
1087
- await page.wait_for_timeout(500)
1088
-
1089
- # Take screenshot
1090
- screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
1091
-
1092
- # Try to find the page element
1093
- page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})')
1094
- if page_element:
1095
- await page_element.screenshot(path=screenshot_path)
1096
  else:
1097
- # Fallback to viewport screenshot
1098
- await page.screenshot(path=screenshot_path)
1099
-
1100
- screenshots.append(screenshot_path)
1101
-
1102
- # Check if we should continue to next page
1103
- if i < total_pages - 1:
1104
- next_button = await page.query_selector('button[aria-label="Next page"]')
1105
- if next_button:
1106
- # Check if button is disabled
1107
- is_disabled = await next_button.get_attribute('disabled')
1108
- if is_disabled:
1109
- logger.info(f"Reached last page at page {i+1}")
1110
- break
1111
-
1112
- # Click next page
1113
- await next_button.click()
1114
- await page.wait_for_timeout(1000)
1115
- else:
1116
- logger.info("Next page button not found")
1117
- break
1118
- except Exception as e:
1119
- logger.error(f"Error capturing page {i+1}: {e}")
1120
- continue
1121
-
1122
- # Create PDF from screenshots
1123
- if screenshots:
1124
- # Get dimensions from first screenshot
1125
- first_img = Image.open(screenshots[0])
1126
- width, height = first_img.size
1127
-
1128
- # Create PDF
1129
- c = canvas.Canvas(save_path, pagesize=(width, height))
1130
- for screenshot in screenshots:
1131
- c.drawImage(screenshot, 0, 0, width, height)
1132
- c.showPage()
1133
- c.save()
1134
 
1135
- # Clean up screenshots
1136
- for screenshot in screenshots:
1137
- os.remove(screenshot)
1138
 
1139
- # Clean up temp directory
1140
- shutil.rmtree(temp_dir, ignore_errors=True)
 
 
 
 
1141
 
1142
- return save_path
1143
- else:
1144
- logger.error("No screenshots captured")
1145
- else:
1146
- # For non-PDF files, just take a screenshot
1147
- screenshot_path = os.path.join(temp_dir, "file.png")
1148
 - await page.screenshot(path=screenshot_path)
1149
 
1150
- # Copy to destination
1151
- shutil.copy(screenshot_path, save_path)
 
1152
 
1153
- # Clean up
1154
- os.remove(screenshot_path)
1155
  shutil.rmtree(temp_dir, ignore_errors=True)
1156
 
1157
  return save_path
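The screenshots-to-PDF step above (PIL for sizing, a reportlab canvas for page assembly) is easy to factor out. A self-contained sketch of that step, under the same assumptions (one PNG per page, every page sized like the first image):

from PIL import Image
from reportlab.pdfgen import canvas

def images_to_pdf(image_paths, out_path):
    # One PDF page per screenshot, sized to match the first image.
    width, height = Image.open(image_paths[0]).size
    pdf = canvas.Canvas(out_path, pagesize=(width, height))
    for path in image_paths:
        pdf.drawImage(path, 0, 0, width, height)
        pdf.showPage()
    pdf.save()
    return out_path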
1158
- finally:
1159
- await browser.close()
1160
- elif self.browser_engine == "pyppeteer":
1161
- # Similar implementation for Pyppeteer
1162
 - pass
1163
 
1164
  return None
1165
  except Exception as e:
@@ -1167,7 +900,7 @@ class DownloadManager:
1167
  return None
1168
 
1169
  async def get_sublinks(self, url, limit=10000):
1170
- """Extract all sublinks from a website"""
1171
  links = set()
1172
  try:
1173
  logger.info(f"Extracting sublinks from {url}")
@@ -1183,18 +916,17 @@ class DownloadManager:
1183
  logger.info(f"Found {len(links)} sublinks with specialized method")
1184
  return list(links)[:limit]
1185
 
1186
- # Standard link extraction for all sites
1187
- await self.browser.goto(url, timeout=30000)
1188
 
1189
  # Get page content
1190
- content = await self.browser.content()
1191
  soup = BeautifulSoup(content, 'html.parser')
1192
 
1193
- # Get base URL for resolving relative links
1194
  parsed_base = urlparse(url)
1195
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1196
 
1197
- # Extract all links from the page
1198
  for a in soup.find_all('a', href=True):
1199
  href = a['href']
1200
  if href and not href.startswith('javascript:') and not href.startswith('#'):
@@ -1220,85 +952,12 @@ class DownloadManager:
1220
  logger.error(f"Error extracting sublinks: {e}")
1221
  return list(links)[:limit]
1222
 
1223
- @celery_app.task
1224
- def download_file_task(file_info, save_dir, referer=None):
1225
- """Celery task for downloading files asynchronously"""
1226
- # This function runs in a separate worker process
1227
- file_url = file_info['url']
1228
- fname = file_info['filename']
1229
- referer = referer or file_info.get('source_url', 'https://www.google.com')
1230
-
1231
- # Create unique filename
1232
- path = os.path.join(save_dir, fname)
1233
- base, ext = os.path.splitext(fname)
1234
- counter = 1
1235
- while os.path.exists(path):
1236
- path = os.path.join(save_dir, f"{base}_{counter}{ext}")
1237
- counter += 1
1238
-
1239
- os.makedirs(save_dir, exist_ok=True)
1240
-
1241
- try:
1242
- # Handle Google Drive files
1243
- if "drive.google.com" in file_url or "docs.google.com" in file_url:
1244
- # Extract file ID
1245
- file_id = None
1246
- for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
1247
- match = re.search(pattern, file_url)
1248
- if match:
1249
- file_id = match.group(1)
1250
- break
1251
-
1252
- if file_id:
1253
- # Try direct download
1254
- download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
1255
- headers = {
1256
- 'User-Agent': get_random_user_agent(),
1257
- 'Referer': referer
1258
- }
1259
-
1260
- with requests.get(download_url, headers=headers, stream=True) as r:
1261
- if r.status_code == 200:
1262
- with open(path, 'wb') as f:
1263
- for chunk in r.iter_content(chunk_size=8192):
1264
- f.write(chunk)
1265
-
1266
- # Check if this is HTML (common for Google Drive restrictions)
1267
- with open(path, 'rb') as f:
1268
- content_start = f.read(100).decode('utf-8', errors='ignore')
1269
- if '<html' in content_start.lower():
1270
- os.remove(path)
1271
- return {'status': 'error', 'message': 'Received HTML instead of file'}
1272
-
1273
- return {'status': 'success', 'path': path}
1274
-
1275
- # Standard download for regular files
1276
- headers = {
1277
- 'User-Agent': get_random_user_agent(),
1278
- 'Referer': referer,
1279
- 'Accept': '*/*',
1280
- 'Accept-Encoding': 'gzip, deflate, br'
1281
- }
1282
-
1283
- with requests.get(file_url, headers=headers, stream=True) as r:
1284
- if r.status_code == 200:
1285
- with open(path, 'wb') as f:
1286
- for chunk in r.iter_content(chunk_size=8192):
1287
- f.write(chunk)
1288
-
1289
- return {'status': 'success', 'path': path}
1290
- else:
1291
- return {'status': 'error', 'message': f"HTTP error: {r.status_code}"}
1292
-
1293
- except Exception as e:
1294
- return {'status': 'error', 'message': str(e)}
1295
-
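This commit drops the Celery/Redis path entirely. For reference, a minimal sketch of how a download task like the removed download_file_task is declared and enqueued (assuming the same local Redis broker the removed celery_app used; the backend URL is an assumption):

from celery import Celery
import requests

celery_app = Celery("file_downloader",
                    broker="redis://localhost:6379/0",
                    backend="redis://localhost:6379/1")

@celery_app.task
def download_url(url: str, dest: str) -> str:
    # Stream the file to disk inside a worker process.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(dest, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return dest

# Enqueue from the web app and wait (or poll) for the result:
#   result = download_url.delay("https://example.com/a.pdf", "./downloads/a.pdf")
#   path = result.get(timeout=300)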
1296
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
1297
- """Perform deep search for files on a website and its subpages"""
1298
  if not custom_ext_list:
1299
  custom_ext_list = []
1300
 
1301
- # Create progress indicators
1302
  progress_text = st.empty()
1303
  progress_bar = st.progress(0)
1304
  file_count_text = st.empty()
@@ -1317,22 +976,23 @@ class DownloadManager:
1317
  total_links = len(sublinks)
1318
  progress_text.text(f"Found {total_links} sublinks to process")
1319
 
1320
- # Initialize all_files with main_files to ensure they're included
1321
  all_files = main_files.copy()
1322
 
1323
- # Process each sublink
1324
- for i, sublink in enumerate(sublinks, 1):
1325
- progress = i / max(total_links, 1) # Avoid division by zero
1326
- progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
1327
- progress_bar.progress(progress)
1328
-
1329
- try:
1330
- # Extract files from sublink
1331
- sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
1332
- all_files.extend(sub_files)
1333
- file_count_text.text(f"Found {len(all_files)} total files")
1334
- except Exception as e:
1335
- logger.warning(f"Error processing sublink {sublink}: {e}")
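The sequential sublink loop above is removed and its replacement is collapsed in this view. One common way to do the same step with bounded concurrency is sketched below; it assumes extract_downloadable_files is safe to call concurrently (e.g., one page or HTTP session per call), which the shared-page design here does not guarantee.

import asyncio

async def collect_from_sublinks(dm, sublinks, custom_ext_list, max_parallel=5):
    semaphore = asyncio.Semaphore(max_parallel)

    async def worker(link):
        async with semaphore:
            try:
                return await dm.extract_downloadable_files(link, custom_ext_list)
            except Exception:
                return []

    results = await asyncio.gather(*(worker(link) for link in sublinks))
    return [f for found in results for f in found]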
 
1336
 
1337
  # Deduplicate files
1338
  seen_urls = set()
@@ -1360,7 +1020,7 @@ class DownloadManager:
1360
  progress_text.empty()
1361
  progress_bar.empty()
1362
 
1363
- # -------------------- Main App --------------------
1364
  def main():
1365
  st.title("Advanced File Downloader")
1366
 
@@ -1369,91 +1029,70 @@ def main():
1369
  st.session_state.initialized = True
1370
  st.session_state.discovered_files = []
1371
  st.session_state.current_url = None
1372
- st.session_state.google_creds = None
1373
  st.session_state.selected_files = []
1374
  st.session_state.do_deep_search = False
1375
  st.session_state.deep_search_url = None
1376
  st.session_state.search_results = []
1377
  st.session_state.download_urls = {} # For direct download links
1378
 
1379
- # Install dependencies if needed
1380
- if "dependencies_installed" not in st.session_state:
1381
- with st.spinner("Setting up dependencies. This may take a minute..."):
1382
- st.session_state.dependencies_installed = setup_dependencies()
1383
- check_services()
1384
 
1385
- # Sidebar options
1386
  with st.sidebar:
1387
- mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Single File"], key="mode_select")
1388
 
1389
- with st.expander("Search Options", expanded=True):
1390
- search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine")
1391
- browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine")
1392
  custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
1393
  help="Enter extensions like .csv, .txt")
1394
  max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
1395
  sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
1396
-
1397
- with st.expander("Advanced Options", expanded=False):
1398
  use_proxy = st.checkbox("Use Proxy", key="use_proxy")
1399
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
1400
  use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
1401
  help="Makes browser harder to detect as automated")
1402
- enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"],
1403
- key="enable_intercept",
1404
- help="Intercept network traffic to find additional files")
1405
- if enable_network_intercept:
1406
- NETWORK_INTERCEPTOR_CONFIG["enabled"] = True
1407
- intercept_types = st.multiselect("Intercept Types",
1408
- ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"],
1409
- default=["xhr", "fetch", "document", "media"],
1410
- key="intercept_types")
1411
- NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types
1412
- else:
1413
- NETWORK_INTERCEPTOR_CONFIG["enabled"] = False
1414
 
1415
- with st.expander("Google Drive Integration", expanded=False):
1416
- if st.button("Start Google Sign-In", key="google_signin_btn"):
1417
- auth_url = get_google_auth_url()
1418
- st.markdown(f"[Click here to authorize]({auth_url})")
1419
- auth_code = st.text_input("Enter authorization code", key="auth_code_input")
1420
- if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
1421
- creds, msg = exchange_code_for_credentials(auth_code)
1422
- st.session_state.google_creds = creds
1423
- st.write(msg)
 
1424
 
1425
  # Main content area
1426
  if mode == "Manual URL":
1427
  st.header("Manual URL Mode")
1428
  url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")
1429
 
1430
- col1, col2 = st.columns([3, 1])
1431
- with col1:
1432
- if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
1433
- if url:
1434
- # Process custom extensions
1435
 - custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
1436
 
1437
- with st.spinner("Searching for files..."):
1438
- async def run_deep_search():
1439
- async with DownloadManager(
1440
- browser_engine=browser_engine,
1441
- use_proxy=use_proxy,
1442
- proxy=proxy,
1443
- use_stealth=use_stealth
1444
- ) as dm:
1445
- files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
1446
- return files
1447
-
1448
- # Run the search
1449
- files = asyncio.run(run_deep_search())
1450
-
1451
- if files:
1452
- st.session_state.discovered_files = files
1453
- st.session_state.current_url = url
1454
- st.success(f"Found {len(files)} files!")
1455
- else:
1456
- st.warning("No files found.")
1457
 
1458
  # Display and process discovered files
1459
  if st.session_state.discovered_files:
@@ -1482,12 +1121,6 @@ def main():
1482
  file_info = f"{filename} ({size})"
1483
 
1484
  file_options.append((i, file_info))
1485
-
1486
- # Generate direct download URL for this file
1487
- if i not in st.session_state.download_urls:
1488
- # Generate a unique key for this file
1489
- file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode()
1490
- st.session_state.download_urls[i] = file_key
1491
 
1492
  # File selection multiselect
1493
  selected_indices = st.multiselect(
@@ -1500,7 +1133,7 @@ def main():
1500
 
1501
  st.session_state.selected_files = selected_indices
1502
 
1503
- # Display individual files with direct download links
1504
  if files:
1505
  st.subheader("Available Files")
1506
  for i, file in enumerate(files):
@@ -1508,8 +1141,8 @@ def main():
1508
  st.write(f"Source: {file.get('source_url', 'Unknown')}")
1509
  st.write(f"URL: {file['url']}")
1510
 
1511
- # Download button for this specific file
1512
- if st.button(f"Download this file", key=f"download_single_{i}"):
1513
  with st.spinner(f"Downloading {file['filename']}..."):
1514
  # Create downloads directory
1515
  download_dir = "./downloads"
@@ -1518,7 +1151,6 @@ def main():
1518
  # Download the file
1519
  async def download_single():
1520
  async with DownloadManager(
1521
- browser_engine=browser_engine,
1522
  use_proxy=use_proxy,
1523
  proxy=proxy,
1524
  use_stealth=use_stealth
@@ -1551,15 +1183,13 @@ def main():
1551
  if selected_indices:
1552
  st.subheader("Batch Download Options")
1553
 
1554
- col1, col2, col3, col4 = st.columns(4)
1555
  with col1:
1556
  download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
1557
  with col2:
1558
  create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
1559
  with col3:
1560
  delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
1561
- with col4:
1562
- upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
1563
 
1564
  if st.button("Download Selected Files", key="batch_download_btn"):
1565
  with st.spinner(f"Downloading {len(selected_indices)} files..."):
@@ -1573,7 +1203,6 @@ def main():
1573
 
1574
  async def download_batch():
1575
  async with DownloadManager(
1576
- browser_engine=browser_engine,
1577
  use_proxy=use_proxy,
1578
  proxy=proxy,
1579
  use_stealth=use_stealth
@@ -1614,24 +1243,6 @@ def main():
1614
  key="download_zip_btn"
1615
  )
1616
 
1617
- # Upload to Google Drive if requested
1618
- if upload_to_drive and st.session_state.google_creds:
1619
- with st.spinner("Uploading to Google Drive..."):
1620
- drive_service = googleapiclient.discovery.build(
1621
- "drive", "v3", credentials=st.session_state.google_creds
1622
- )
1623
- folder_id = create_drive_folder(
1624
- drive_service, f"Downloads_{get_domain(url)}"
1625
- )
1626
- drive_id = google_drive_upload(
1627
- zip_path, st.session_state.google_creds, folder_id
1628
- )
1629
-
1630
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
1631
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
1632
- else:
1633
- st.error(drive_id)
1634
-
1635
  # Delete original files if requested
1636
  if delete_after:
1637
  for path in downloaded_paths:
@@ -1650,17 +1261,16 @@ def main():
1650
 
1651
  if st.button("Search", key="web_search_btn"):
1652
  if query:
1653
- with st.spinner("Searching the web..."):
1654
  async def run_search():
1655
  async with DownloadManager(
1656
- browser_engine=browser_engine,
1657
  use_proxy=use_proxy,
1658
  proxy=proxy,
1659
  query=query,
1660
  num_results=num_results,
1661
  use_stealth=use_stealth
1662
  ) as dm:
1663
- urls = await dm.search_web(search_engine)
1664
  return urls
1665
 
1666
  urls = asyncio.run(run_search())
@@ -1693,7 +1303,6 @@ def main():
1693
  with st.spinner("Searching for files..."):
1694
  async def deep_search_result():
1695
  async with DownloadManager(
1696
- browser_engine=browser_engine,
1697
  use_proxy=use_proxy,
1698
  proxy=proxy,
1699
  use_stealth=use_stealth
@@ -1709,131 +1318,63 @@ def main():
1709
  else:
1710
  st.warning("No files found on this page.")
1711
 
1712
- elif mode == "Single File":
1713
- st.header("Single File Download")
1714
 
1715
  # View-only Google Drive download
1716
- with st.expander("Download View-Only Google Drive Document", expanded=True):
1717
- st.write("Download protected/view-only Google Drive documents")
1718
-
1719
- file_id = st.text_input(
1720
- "Google Drive File ID",
1721
- placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
1722
- key="drive_file_id"
1723
- )
1724
-
1725
- if st.button("Download Document", key="drive_download_btn") and file_id:
1726
- with st.spinner("Downloading view-only document... (this may take a minute)"):
1727
- # Create download directory
1728
- download_dir = "./downloads"
1729
- os.makedirs(download_dir, exist_ok=True)
1730
-
1731
- # Set output path
1732
- output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
1733
-
1734
- # Download the file
1735
- async def download_drive_file():
1736
- async with DownloadManager(
1737
- browser_engine=browser_engine,
1738
- use_proxy=use_proxy,
1739
- proxy=proxy,
1740
- use_stealth=use_stealth
1741
- ) as dm:
1742
- file_info = {
1743
- 'url': f"https://drive.google.com/file/d/{file_id}/view",
1744
- 'filename': f"gdrive_{file_id}.pdf",
1745
- 'metadata': {'file_id': file_id, 'view_only': True}
1746
- }
1747
- return await dm.download_viewonly_google_drive(file_info, output_path)
1748
-
1749
- result_path = asyncio.run(download_drive_file())
1750
-
1751
- if result_path:
1752
- st.success("Document downloaded successfully!")
1753
-
1754
- # Provide download link
1755
- with open(result_path, "rb") as f:
1756
- file_bytes = f.read()
1757
-
1758
- st.download_button(
1759
- label="Download PDF",
1760
- data=file_bytes,
1761
- file_name=os.path.basename(result_path),
1762
- mime="application/pdf",
1763
- key="drive_pdf_download"
1764
- )
1765
- else:
1766
- st.error("Failed to download the document. Please check the file ID and try again.")
1767
 
1768
- # Direct URL download
1769
- with st.expander("Download from Direct URL", expanded=True):
1770
- st.write("Download a file from a direct URL")
1771
-
1772
- file_url = st.text_input(
1773
- "File URL",
1774
- placeholder="https://example.com/file.pdf",
1775
- key="direct_url"
1776
- )
1777
-
1778
- file_name = st.text_input(
1779
- "Save as (optional)",
1780
- placeholder="Leave blank to use original filename",
1781
- key="save_filename"
1782
- )
1783
-
1784
- if st.button("Download File", key="direct_download_btn") and file_url:
1785
- with st.spinner("Downloading file..."):
1786
- # Create download directory
1787
- download_dir = "./downloads"
1788
- os.makedirs(download_dir, exist_ok=True)
1789
-
1790
- # Determine filename
1791
- if not file_name:
1792
- file_name = os.path.basename(urlparse(file_url).path)
1793
- if not file_name or file_name == '/':
1794
- file_name = f"downloaded_file_{int(time.time())}{get_file_extension(file_url)}"
1795
-
1796
- # Create file info
1797
- file_info = {
1798
- 'url': file_url,
1799
- 'filename': file_name,
1800
- 'metadata': {}
1801
- }
1802
-
1803
- # Download the file
1804
- async def download_direct_file():
1805
- async with DownloadManager(
1806
- browser_engine=browser_engine,
1807
- use_proxy=use_proxy,
1808
- proxy=proxy,
1809
- use_stealth=use_stealth
1810
- ) as dm:
1811
- return await dm.download_file(file_info, download_dir)
1812
 
1813
- file_path = asyncio.run(download_direct_file())
 
 
1814
 
1815
- if file_path:
1816
- st.success(f"File downloaded successfully to {file_path}")
1817
-
1818
- # Provide download link
1819
- with open(file_path, "rb") as f:
1820
- file_bytes = f.read()
1821
-
1822
- mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
1823
-
1824
- st.download_button(
1825
- label=f"Download {os.path.basename(file_path)}",
1826
- data=file_bytes,
1827
- file_name=os.path.basename(file_path),
1828
- mime=mime_type,
1829
- key="direct_file_download"
1830
- )
1831
- else:
1832
- st.error("Failed to download the file. Please check the URL and try again.")
1833
 
1834
  # Footer
1835
  st.markdown("---")
1836
- st.markdown("Created by [Euler314](https://github.com/euler314) | Enhanced with advanced scraping technologies")
1837
 
1838
  # Run the app
1839
  if __name__ == "__main__":
 
1
+ # app.py
2
  import streamlit as st
3
  import os
4
  import asyncio
 
22
  from reportlab.lib.pagesizes import letter
23
  from reportlab.pdfgen import canvas
24
 
25
 + # Advanced imports - only import what's installed
26
  import requests
27
+ from bs4 import BeautifulSoup
28
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
29
+
30
+ # Optional imports with fallbacks
31
+ try:
32
+ from PyPDF2 import PdfReader
33
+ except ImportError:
34
+ PdfReader = None
35
+
36
+ try:
37
+ import google_auth_oauthlib.flow
38
+ import googleapiclient.discovery
39
+ import google.auth.transport.requests
40
+ import googleapiclient.http
41
+ GOOGLE_DRIVE_AVAILABLE = True
42
+ except ImportError:
43
+ GOOGLE_DRIVE_AVAILABLE = False
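With PdfReader falling back to None, downstream callers are expected to degrade gracefully. The committed get_pdf_metadata body is not shown in this hunk; a hedged sketch of the pattern:

from io import BytesIO

def safe_pdf_metadata(raw_bytes: bytes) -> dict:
    # Return an empty dict when PyPDF2 is not installed instead of raising.
    if PdfReader is None:
        return {}
    reader = PdfReader(BytesIO(raw_bytes))
    info = reader.metadata or {}
    return {"pages": len(reader.pages), "title": getattr(info, "title", None)}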
44
 
45
  # Configure page and logging
46
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
47
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
48
  logger = logging.getLogger(__name__)
49
 
50
+ # Google OAuth Config
 
 
 
51
  GOOGLE_OAUTH_CONFIG = {
52
  "web": {
53
  "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
 
60
  }
61
  }
62
 
63
+ # User Agent Settings
64
  USER_AGENTS = [
65
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
66
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
 
68
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
69
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
70
  'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
 
71
  ]
72
 
73
+ # Network Interception Configuration
 
 
 
 
74
  NETWORK_INTERCEPTOR_CONFIG = {
75
  "enabled": False,
76
  "intercept_types": ["xhr", "fetch", "document", "media"],
 
78
  "intercept_folder": "./intercepted_data"
79
  }
80
 
81
+ # Utility Functions
82
  def get_random_user_agent():
83
  return random.choice(USER_AGENTS)
84
 
 
114
  """Check if URL is a valid file URL based on extension"""
115
  return any(url.lower().endswith(ext) for ext in extensions)
116
 
117
+ # Google Drive Functions
118
  def get_google_auth_url():
119
+ if not GOOGLE_DRIVE_AVAILABLE:
120
+ return None
121
+
122
  client_config = GOOGLE_OAUTH_CONFIG["web"]
123
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
124
  {"web": client_config},
 
133
  return authorization_url
134
 
135
  def exchange_code_for_credentials(auth_code):
136
+ if not GOOGLE_DRIVE_AVAILABLE:
137
+ return None, "Google Drive API not available. Install google-auth-oauthlib and google-api-python-client."
138
+
139
  if not auth_code.strip():
140
  return None, "No code provided."
141
  try:
 
154
  return None, f"Error during token exchange: {e}"
155
 
156
  def google_drive_upload(file_path, credentials, folder_id=None):
157
+ if not GOOGLE_DRIVE_AVAILABLE:
158
+ return "Google Drive API not available"
159
+
160
  try:
161
  drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
162
  file_metadata = {'name': os.path.basename(file_path)}
 
169
  return f"Error uploading to Drive: {str(e)}"
170
 
171
  def create_drive_folder(drive_service, name):
172
+ if not GOOGLE_DRIVE_AVAILABLE:
173
+ return None
174
+
175
  folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
176
  folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
177
  return folder.get('id')
178
 
179
+ # Setup Playwright
180
+ def setup_playwright_dependencies():
181
+ """Install required system dependencies for Playwright"""
182
  try:
183
  # Install system dependencies
184
  subprocess.run(['apt-get', 'update', '-y'], check=True)
185
  packages = [
186
  'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
187
  'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
188
+ 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
 
189
  ]
190
  subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
191
 
192
+ # Install Playwright browser
193
+ subprocess.run(['python', '-m', 'playwright', 'install', 'chromium'], check=True)
 
 
 
 
194
 
195
  st.success("Dependencies installed successfully!")
196
  return True
197
  except Exception as e:
198
  st.error(f"Error installing dependencies: {e}")
199
+ st.info("You may need to manually install dependencies.")
200
  logger.error(f"Setup error: {e}")
201
  traceback.print_exc()
202
  return False
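On recent Playwright releases the manual apt-get list above can usually be replaced by letting Playwright install its own system libraries. A hedged alternative sketch (assumes a Debian/Ubuntu base image with root, or passwordless sudo):

import subprocess
import sys

def install_chromium_with_deps() -> bool:
    # `--with-deps` asks Playwright to install the required system packages itself.
    result = subprocess.run(
        [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"],
        capture_output=True, text=True,
    )
    return result.returncode == 0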
203
 
204
+ # Download Manager Class
205
+ class DownloadManager:
206
 + def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
207
  self.use_proxy = use_proxy
208
  self.proxy = proxy
209
+ self.query = query
210
+ self.num_results = num_results
211
+ self.use_stealth = use_stealth
212
+ self.playwright = None
213
  self.browser = None
214
  self.context = None
215
  self.page = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
+ # Create intercepted data folder if enabled
218
+ if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
219
+ os.makedirs(NETWORK_INTERCEPTOR_CONFIG["intercept_folder"], exist_ok=True)
220
+
221
+ async def __aenter__(self):
222
  self.playwright = await async_playwright().start()
223
+
224
+ # Configure browser launch options
225
  browser_args = [
226
  '--no-sandbox',
227
  '--disable-setuid-sandbox',
 
230
  '--disable-features=IsolateOrigins,site-per-process',
231
  ]
232
 
233
+ if self.use_stealth:
234
  browser_args.extend([
235
  '--disable-blink-features=AutomationControlled',
236
  '--disable-features=IsolateOrigins'
 
244
  if self.use_proxy and self.proxy:
245
  launch_options["proxy"] = {"server": self.proxy}
246
 
247
+ # Launch browser
248
  self.browser = await self.playwright.chromium.launch(**launch_options)
249
 
250
+ # Configure context options
251
  context_options = {
252
  "viewport": {"width": 1920, "height": 1080},
253
  "user_agent": get_random_user_agent(),
 
256
  "accept_downloads": True
257
  }
258
 
259
+ # Create context and apply stealth features
260
  self.context = await self.browser.new_context(**context_options)
261
 
262
+ if self.use_stealth:
 
263
  await self.context.add_init_script("""
264
  Object.defineProperty(navigator, 'webdriver', { get: () => false });
265
  Object.defineProperty(navigator, 'plugins', {
 
269
  window.chrome = { runtime: {} };
270
  """)
271
 
272
+ # Create page and set headers
273
  self.page = await self.context.new_page()
274
+ await self.page.set_extra_http_headers({
275
+ 'Accept-Language': 'en-US,en;q=0.9',
276
+ 'Accept-Encoding': 'gzip, deflate, br',
277
+ 'DNT': '1',
278
+ 'Referer': 'https://www.google.com/',
279
+ 'Sec-Fetch-Dest': 'document',
280
+ 'Sec-Fetch-Mode': 'navigate',
281
+ 'Sec-Fetch-Site': 'cross-site',
282
+ 'Sec-Fetch-User': '?1',
283
+ 'Upgrade-Insecure-Requests': '1'
284
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  return self
287
 
288
  async def __aexit__(self, exc_type, exc_val, exc_tb):
289
+ if self.browser:
290
+ await self.browser.close()
291
+ if self.playwright:
292
+ await self.playwright.stop()
+ async def search_bing(self):
+ """Search Bing for results"""
  urls = []
  try:
+ search_url = f"https://www.bing.com/search?q={self.query}"
+ await self.page.goto(search_url, timeout=30000)
+ await self.page.wait_for_load_state('networkidle')
+ # Extract search results
+ links = await self.page.query_selector_all("li.b_algo h2 a")
+ for link in links[:self.num_results]:
+ href = await link.get_attribute('href')
+ if href:
+ urls.append(href)
  return urls
  except Exception as e:
+ logger.error(f"Error searching Bing: {e}")
  return []
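    # Illustrative sketch (hypothetical static helper, not called anywhere): search_bing()
    # interpolates self.query directly, so queries containing spaces or '&' are safer
    # URL-encoded first.
    @staticmethod
    def _example_bing_search_url(query):
        from urllib.parse import quote_plus
        return f"https://www.bing.com/search?q={quote_plus(query)}"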
  async def get_file_size(self, url):
+ """Get file size by making a HEAD request"""
  try:
  headers = {'User-Agent': get_random_user_agent()}
  response = requests.head(url, headers=headers, timeout=15)
...
  return "Unknown Size"
  async def get_pdf_metadata(self, url):
+ """Extract metadata from PDF files"""
+ if not PdfReader:
+ return {}
+
  try:
  headers = {'User-Agent': get_random_user_agent()}
  response = requests.get(url, headers=headers, timeout=15, stream=True)
...
  return {}
  async def extract_real_download_url(self, url):
+ """Follow redirects to get the final download URL"""
  try:
  headers = {'User-Agent': get_random_user_agent()}
  response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
...
  return url
  async def get_edu_exam_links(self, url):
+ """Specialized method for educational exam websites"""
  try:
  logger.info(f"Fetching exam links from {url}")
  links = set()
...
  response = requests.get(url, headers=headers, timeout=30)
  if response.status_code == 200:
+ # Parse with BeautifulSoup
  soup = BeautifulSoup(response.text, "html.parser")
  parsed_base = urlparse(url)
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
...
  href = a["href"]
  full_url = urljoin(url, href)
+ # Get link text
  link_text = a.get_text().lower()
+ # Define patterns to look for
  url_patterns = [
  "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
  "/test/", "/download/", "/files/", "/assignments/",
+ "paper_", "question_", "exam_", "test_", "past_"
  ]
  text_patterns = [
  "exam", "paper", "test", "question", "past", "download",
+ "assignment", "sample", "study", "material", "notes"
  ]
+ # Check for matches
  if any(pattern in full_url.lower() for pattern in url_patterns) or \
  any(pattern in link_text for pattern in text_patterns) or \
  any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
...
  except Exception as e:
  logger.warning(f"Request-based extraction failed: {e}")
+ # Use browser if few links were found or for specific sites
  if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
  logger.info("Using browser for enhanced link extraction")
+ # Navigate to page
+ await self.page.goto(url, timeout=45000)
+ # Get page content
+ content = await self.page.content()
  soup = BeautifulSoup(content, "html.parser")
  parsed_base = urlparse(url)
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+ # Find links in page
  for a in soup.find_all("a", href=True):
  href = a["href"]
  full_url = urljoin(url, href)
  link_text = a.get_text().lower()
+ # Use the same patterns as above
  url_patterns = [
  "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
  "/test/", "/download/", "/files/", "/assignments/",
+ "paper_", "question_", "exam_", "test_", "past_"
  ]
  text_patterns = [
  "exam", "paper", "test", "question", "past", "download",
+ "assignment", "sample", "study", "material", "notes"
  ]
  if any(pattern in full_url.lower() for pattern in url_patterns) or \
  any(pattern in link_text for pattern in text_patterns) or \
  any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
  links.add(full_url)
+
+ # Try to click on elements that might reveal more links
+ try:
+ # Find and click buttons that might show more content
+ buttons = await self.page.query_selector_all('input[type="button"], button')
+ for button in buttons:
+ button_text = await button.text_content() or ""
+ button_value = await button.get_attribute("value") or ""
+
+ # Only click on promising buttons
+ if any(keyword in (button_text + button_value).lower() for keyword in
+ ["show", "view", "display", "list", "exam", "paper", "test"]):
+ try:
+ await button.click()
+ await self.page.wait_for_timeout(1000)
+
+ # Get any new links
+ new_content = await self.page.content()
+ new_soup = BeautifulSoup(new_content, "html.parser")
+ for a in new_soup.find_all("a", href=True):
+ href = a["href"]
+ full_url = urljoin(url, href)
+
+ # Check if it's a file link
+ if any(full_url.lower().endswith(ext) for ext in
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
+ links.add(full_url)
+ except Exception as e:
+ logger.warning(f"Error clicking button: {e}")
+ except Exception as e:
+ logger.warning(f"Error with interactive elements: {e}")
+ # Filter links to likely contain exam documents
  filtered_links = []
  for link in links:
  # Common file extensions
...
  # Common paths for exam documents
  if any(pattern in link.lower() for pattern in [
  "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
+ "/pastpapers/", "/questionpapers/", "/tests/"
  ]):
  filtered_links.append(link)
...
  return []
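    # Illustrative sketch (hypothetical static helper and inputs, not called anywhere):
    # the same keyword test used above, applied to one candidate link at a time.
    @staticmethod
    def _example_looks_like_exam_link(full_url, link_text):
        url_patterns = ["/exam/", "/pastexam/", "/papers/", "paper_", "question_"]
        text_patterns = ["exam", "paper", "past", "question"]
        return (any(p in full_url.lower() for p in url_patterns)
                or any(p in link_text.lower() for p in text_patterns)
                or full_url.lower().endswith(('.pdf', '.doc', '.docx')))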
  async def extract_downloadable_files(self, url, custom_ext_list):
+ """Extract all downloadable files from a webpage"""
  found_files = []
  try:
  # Special handling for educational exam sites
...
  # Get metadata for PDFs
  meta = {}
+ if real_url.lower().endswith('.pdf') and PdfReader:
  try:
  meta = await self.get_pdf_metadata(real_url)
  except Exception:
...
  'filename': filename,
  'size': size_str,
  'metadata': meta,
+ 'source_url': url # Keep track of source page
  })
  # If we found exam files with the specialized method, return them
  if found_files:
  return found_files
+ # Standard extraction method for regular websites
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
  # Get page content
+ content = await self.page.content()
  soup = BeautifulSoup(content, 'html.parser')
  # Define file extensions to look for
...
  # Handle PHP and download links separately
  if '.php' in href.lower() or 'download' in href.lower():
+ full_url = href if href.startswith('http') else urljoin(url, href)
  real_url = await self.extract_real_download_url(full_url)
  if real_url and real_url != full_url:
  filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
...
  # Check for direct file extensions
  if any(href.lower().endswith(ext) for ext in all_exts):
+ file_url = href if href.startswith('http') else urljoin(url, href)
  size_str = await self.get_file_size(file_url)
  meta = {}
+ if file_url.lower().endswith('.pdf') and PdfReader:
  meta = await self.get_pdf_metadata(file_url)
  found_files.append({
  'url': file_url,
...
  break
  if file_id:
+ # Determine if it's view-only
  is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))
  filename = f"gdrive_{file_id}"
...
  for elem in soup.find_all(elem_tag):
  src = elem.get('src') or elem.get('data')
  if src and any(src.lower().endswith(ext) for ext in all_exts):
+ file_url = src if src.startswith('http') else urljoin(url, src)
  found_files.append({
  'url': file_url,
  'filename': os.path.basename(file_url.split('?')[0]),
...
  return []
  async def download_file(self, file_info, save_dir, referer=None):
+ """Download a file and save it to disk"""
  file_url = file_info['url']
  fname = file_info['filename']
  referer = referer or file_info.get('source_url', 'https://www.google.com')
+ # Create unique filename
  path = os.path.join(save_dir, fname)
  base, ext = os.path.splitext(fname)
  counter = 1
...
  try:
  # Special handling for Google Drive files
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
+ # For view-only Google Drive files
  is_view_only = file_info.get('metadata', {}).get('view_only', False)
  if is_view_only:
  result_path = await self.download_viewonly_google_drive(file_info, path)
...
  return None
  async def download_viewonly_google_drive(self, file_info, save_path):
+ """Download view-only Google Drive documents using Playwright"""
  try:
  # Extract file ID
  file_id = file_info.get('metadata', {}).get('file_id')
...
  logger.info(f"Downloading view-only Google Drive file: {file_id}")
+ # Create a dedicated browser instance for this operation
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(
+ headless=True,
+ args=[
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins,site-per-process',
+ '--disable-blink-features=AutomationControlled'
+ ]
+ )
+ # Create context
+ context = await browser.new_context(
+ viewport={'width': 1600, 'height': 1200},
+ user_agent=get_random_user_agent(),
+ accept_downloads=True
+ )
+
+ # Add stealth script
+ await context.add_init_script("""
+ Object.defineProperty(navigator, 'webdriver', { get: () => false });
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
+ });
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
+ window.chrome = { runtime: {} };
+ """)
+
+ page = await context.new_page()
+
+ try:
+ # Navigate to the file
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
+ await page.wait_for_load_state('networkidle')
+ await page.wait_for_timeout(5000) # Wait for rendering
+ # Create temp directory
+ temp_dir = tempfile.mkdtemp()
+ # For PDF files, take screenshots of each page
+ if file_type == 'pdf':
+ # Create directory for screenshots
+ screenshots_dir = os.path.join(temp_dir, "screenshots")
+ os.makedirs(screenshots_dir, exist_ok=True)
+ # Get page count estimation
+ total_pages = await page.evaluate("""
+ () => {
+ // Look for page counters
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
+ const text = el.textContent || '';
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
+ });
+
+ if (pageCounters.length > 0) {
+ const text = pageCounters[0].textContent || '';
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
+ if (match && match[2]) return parseInt(match[2]);
+ }
+
+ // Look for page elements
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+ if (pages.length > 0) return pages.length;
+
+ // Default
+ return 20;
+ }
+ """)
+ logger.info(f"PDF has approximately {total_pages} pages")
+ # Capture screenshots page by page
+ screenshots = []
+ for i in range(min(total_pages, 100)): # Limit to 100 pages
+ try:
+ # Go to specific page
+ if i > 0:
+ next_button = await page.query_selector('button[aria-label="Next page"]')
+ if next_button:
+ await next_button.click()
+ await page.wait_for_timeout(1000)
  else:
+ break
+ # Take screenshot
+ screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
+ # Try to find page element
+ page_element = await page.query_selector('.drive-viewer-paginated-page')
+ if page_element:
+ await page_element.screenshot(path=screenshot_path)
+ else:
+ await page.screenshot(path=screenshot_path)
+ screenshots.append(screenshot_path)
+ except Exception as e:
+ logger.error(f"Error capturing page {i+1}: {e}")
+ continue
+
+ # Create PDF from screenshots
+ if screenshots:
+ # Get dimensions from first screenshot
+ first_img = Image.open(screenshots[0])
+ width, height = first_img.size
+
+ # Create PDF
+ c = canvas.Canvas(save_path, pagesize=(width, height))
+ for screenshot in screenshots:
+ c.drawImage(screenshot, 0, 0, width, height)
+ c.showPage()
+ c.save()
+ # Clean up screenshots
+ for screenshot in screenshots:
+ os.remove(screenshot)
+ # Clean up temp directory
  shutil.rmtree(temp_dir, ignore_errors=True)
  return save_path
+ else:
+ logger.error("No screenshots captured")
+ else:
+ # For non-PDF files, just take a screenshot
+ screenshot_path = os.path.join(temp_dir, "file.png")
+ await page.screenshot(path=screenshot_path)
+
+ # Copy to destination
+ shutil.copy(screenshot_path, save_path)
+
+ # Clean up
+ os.remove(screenshot_path)
+ shutil.rmtree(temp_dir, ignore_errors=True)
+
+ return save_path
+
+ finally:
+ await browser.close()
  return None
  except Exception as e:
...
  return None
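    # Illustrative sketch (hypothetical static helper, not called anywhere): the same
    # screenshots-to-PDF step in isolation, one reportlab page per captured image.
    @staticmethod
    def _example_images_to_pdf(image_paths, pdf_path):
        first = Image.open(image_paths[0])
        width, height = first.size
        c = canvas.Canvas(pdf_path, pagesize=(width, height))
        for img_path in image_paths:
            c.drawImage(img_path, 0, 0, width, height)
            c.showPage()
        c.save()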
  async def get_sublinks(self, url, limit=10000):
+ """Extract all sublinks from a webpage"""
  links = set()
  try:
  logger.info(f"Extracting sublinks from {url}")
...
  logger.info(f"Found {len(links)} sublinks with specialized method")
  return list(links)[:limit]
+ # Navigate to the page
+ await self.page.goto(url, timeout=30000)
  # Get page content
+ content = await self.page.content()
  soup = BeautifulSoup(content, 'html.parser')
+ # Extract all links from the page
  parsed_base = urlparse(url)
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
  for a in soup.find_all('a', href=True):
  href = a['href']
  if href and not href.startswith('javascript:') and not href.startswith('#'):
...
  logger.error(f"Error extracting sublinks: {e}")
  return list(links)[:limit]
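    # Illustrative sketch (hypothetical static helper, not called anywhere): how relative
    # hrefs are resolved against the page URL and restricted to the same host before they
    # are treated as sublinks.
    @staticmethod
    def _example_normalise_links(page_url, hrefs):
        from urllib.parse import urljoin, urlparse
        host = urlparse(page_url).netloc
        absolute = (urljoin(page_url, h) for h in hrefs
                    if h and not h.startswith(('#', 'javascript:')))
        return {u for u in absolute if urlparse(u).netloc == host}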
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
+ """Perform deep search for files on website and its subpages"""
  if not custom_ext_list:
  custom_ext_list = []
+ # Set up progress indicators
  progress_text = st.empty()
  progress_bar = st.progress(0)
  file_count_text = st.empty()
...
  total_links = len(sublinks)
  progress_text.text(f"Found {total_links} sublinks to process")
+ # Always include main page files
  all_files = main_files.copy()
+ # Process each sublink if there are any
+ if sublinks:
+ for i, sublink in enumerate(sublinks, 1):
+ progress = i / max(total_links, 1) # Avoid division by zero
+ progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+ progress_bar.progress(progress)
+
+ try:
+ # Extract files from sublink
+ sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+ all_files.extend(sub_files)
+ file_count_text.text(f"Found {len(all_files)} total files")
+ except Exception as e:
+ logger.warning(f"Error processing sublink {sublink}: {e}")
  # Deduplicate files
  seen_urls = set()
...
  progress_text.empty()
  progress_bar.empty()
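# Illustrative usage sketch (hypothetical URL and function, not called by the app): the
# DownloadManager above is an async context manager, so Playwright startup and teardown
# are handled by `async with`, and asyncio.run() drives it from synchronous Streamlit code.
async def _example_deep_search():
    async with DownloadManager(use_stealth=True) as dm:
        return await dm.deep_search("https://example.com/downloads", ['.pdf'], 50, 30)
# files = asyncio.run(_example_deep_search())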
+ # Main App
  def main():
  st.title("Advanced File Downloader")
...
  st.session_state.initialized = True
  st.session_state.discovered_files = []
  st.session_state.current_url = None
  st.session_state.selected_files = []
  st.session_state.do_deep_search = False
  st.session_state.deep_search_url = None
  st.session_state.search_results = []
  st.session_state.download_urls = {} # For direct download links
+ # Install Playwright if needed
+ if "playwright_installed" not in st.session_state:
+ with st.spinner("Setting up Playwright. This may take a minute..."):
+ st.session_state.playwright_installed = setup_playwright_dependencies()
+ # Sidebar settings
  with st.sidebar:
+ mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Google Drive"], key="mode_select")
+ with st.expander("Advanced Options", expanded=True):
  custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
  help="Enter extensions like .csv, .txt")
  max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
  sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
  use_proxy = st.checkbox("Use Proxy", key="use_proxy")
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
  use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
  help="Makes browser harder to detect as automated")
+ if GOOGLE_DRIVE_AVAILABLE:
+ with st.expander("Google Drive Integration", expanded=False):
+ if st.button("Start Google Sign-In", key="google_signin_btn"):
+ auth_url = get_google_auth_url()
+ st.markdown(f"[Click here to authorize]({auth_url})")
+ auth_code = st.text_input("Enter authorization code", key="auth_code_input")
+ if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
+ creds, msg = exchange_code_for_credentials(auth_code)
+ st.session_state.google_creds = creds
+ st.write(msg)
  # Main content area
  if mode == "Manual URL":
  st.header("Manual URL Mode")
  url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")
+ if st.button("Deep Search", key="deep_search_btn"):
+ if url:
+ # Process custom extensions
+ custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+
+ with st.spinner("Searching for files..."):
+ async def run_deep_search():
+ async with DownloadManager(
+ use_proxy=use_proxy,
+ proxy=proxy,
+ use_stealth=use_stealth
+ ) as dm:
+ files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
+ return files
+ files = asyncio.run(run_deep_search())
+
+ if files:
+ st.session_state.discovered_files = files
+ st.session_state.current_url = url
+ st.success(f"Found {len(files)} files!")
+ else:
+ st.warning("No files found.")
  # Display and process discovered files
  if st.session_state.discovered_files:
...
  file_info = f"{filename} ({size})"
  file_options.append((i, file_info))
  # File selection multiselect
  selected_indices = st.multiselect(
...
  st.session_state.selected_files = selected_indices
+ # Display individual download buttons
  if files:
  st.subheader("Available Files")
  for i, file in enumerate(files):
...
  st.write(f"Source: {file.get('source_url', 'Unknown')}")
  st.write(f"URL: {file['url']}")
+ # Download button for this file
+ if st.button("Download", key=f"download_single_{i}"):
  with st.spinner(f"Downloading {file['filename']}..."):
  # Create downloads directory
  download_dir = "./downloads"
...
  # Download the file
  async def download_single():
  async with DownloadManager(
  use_proxy=use_proxy,
  proxy=proxy,
  use_stealth=use_stealth
...
  if selected_indices:
  st.subheader("Batch Download Options")
+ col1, col2, col3 = st.columns(3)
  with col1:
  download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
  with col2:
  create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
  with col3:
  delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
  if st.button("Download Selected Files", key="batch_download_btn"):
  with st.spinner(f"Downloading {len(selected_indices)} files..."):
...
  async def download_batch():
  async with DownloadManager(
  use_proxy=use_proxy,
  proxy=proxy,
  use_stealth=use_stealth
...
  key="download_zip_btn"
  )
  # Delete original files if requested
  if delete_after:
  for path in downloaded_paths:
...
  if st.button("Search", key="web_search_btn"):
  if query:
+ with st.spinner("Searching..."):
  async def run_search():
  async with DownloadManager(
  use_proxy=use_proxy,
  proxy=proxy,
  query=query,
  num_results=num_results,
  use_stealth=use_stealth
  ) as dm:
+ urls = await dm.search_bing()
  return urls
  urls = asyncio.run(run_search())
...
  with st.spinner("Searching for files..."):
  async def deep_search_result():
  async with DownloadManager(
  use_proxy=use_proxy,
  proxy=proxy,
  use_stealth=use_stealth
...
  else:
  st.warning("No files found on this page.")
+ elif mode == "Google Drive" and GOOGLE_DRIVE_AVAILABLE:
+ st.header("Google Drive Download")
  # View-only Google Drive download
+ st.write("Download protected/view-only Google Drive documents")
+ file_id = st.text_input(
+ "Google Drive File ID",
+ placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
+ key="drive_file_id"
+ )
+
+ if st.button("Download Document", key="drive_download_btn") and file_id:
+ with st.spinner("Downloading view-only document... (this may take a minute)"):
+ # Create download directory
+ download_dir = "./downloads"
+ os.makedirs(download_dir, exist_ok=True)
+
+ # Set output path
+ output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
+
+ # Download the file
+ async def download_drive_file():
+ async with DownloadManager(
+ use_proxy=use_proxy,
+ proxy=proxy,
+ use_stealth=use_stealth
+ ) as dm:
+ file_info = {
+ 'url': f"https://drive.google.com/file/d/{file_id}/view",
+ 'filename': f"gdrive_{file_id}.pdf",
+ 'metadata': {'file_id': file_id, 'view_only': True}
+ }
+ return await dm.download_viewonly_google_drive(file_info, output_path)
+
+ result_path = asyncio.run(download_drive_file())
+
+ if result_path:
+ st.success("Document downloaded successfully!")
+ # Provide download link
+ with open(result_path, "rb") as f:
+ file_bytes = f.read()
+ st.download_button(
+ label="Download PDF",
+ data=file_bytes,
+ file_name=os.path.basename(result_path),
+ mime="application/pdf",
+ key="drive_pdf_download"
+ )
+ else:
+ st.error("Failed to download the document. Please check the file ID and try again.")
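# Illustrative sketch (hypothetical helper, not called anywhere): accept a full Drive URL
# as well as a bare ID by pulling the ID out of the /file/d/<id>/ segment.
def _example_extract_drive_id(text):
    import re
    match = re.search(r'/file/d/([A-Za-z0-9_-]+)', text)
    return match.group(1) if match else text.strip()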
 
 
 
 
 
 
 
 
 
  # Footer
  st.markdown("---")
+ st.markdown("Created by [Euler314](https://github.com/euler314) | Advanced File Downloader")
  # Run the app
  if __name__ == "__main__":