euler314 committed
Commit c5e6558 · verified · 1 Parent(s): baa351a

Update app.py

Files changed (1)
  1. app.py +862 -403
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # app.py
2
  import streamlit as st
3
  import os
4
  import asyncio
@@ -22,32 +21,31 @@ from PIL import Image
22
  from reportlab.lib.pagesizes import letter
23
  from reportlab.pdfgen import canvas
24
 
25
- # Advanced imports - only import what's installed
26
- import requests
27
- from bs4 import BeautifulSoup
28
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
29
-
30
- # Optional imports with fallbacks
31
- try:
32
- from PyPDF2 import PdfReader
33
- except ImportError:
34
- PdfReader = None
35
-
36
- try:
37
- import google_auth_oauthlib.flow
38
- import googleapiclient.discovery
39
- import google.auth.transport.requests
40
- import googleapiclient.http
41
- GOOGLE_DRIVE_AVAILABLE = True
42
- except ImportError:
43
- GOOGLE_DRIVE_AVAILABLE = False
44
 
45
  # Configure page and logging
46
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
47
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
48
  logger = logging.getLogger(__name__)
49
 
50
- # Google OAuth Config
51
  GOOGLE_OAUTH_CONFIG = {
52
  "web": {
53
  "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -60,7 +58,7 @@ GOOGLE_OAUTH_CONFIG = {
60
  }
61
  }
62
 
63
- # User Agent Settings
64
  USER_AGENTS = [
65
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
66
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
@@ -68,9 +66,14 @@ USER_AGENTS = [
68
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
69
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
70
  'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
 
71
  ]
72
 
73
- # Network Interception Configuration
74
  NETWORK_INTERCEPTOR_CONFIG = {
75
  "enabled": False,
76
  "intercept_types": ["xhr", "fetch", "document", "media"],
@@ -78,7 +81,7 @@ NETWORK_INTERCEPTOR_CONFIG = {
78
  "intercept_folder": "./intercepted_data"
79
  }
80
 
81
- # Utility Functions
82
  def get_random_user_agent():
83
  return random.choice(USER_AGENTS)
84
 
@@ -114,11 +117,8 @@ def is_valid_file_url(url, extensions):
114
  """Check if URL is a valid file URL based on extension"""
115
  return any(url.lower().endswith(ext) for ext in extensions)
116
 
117
- # Google Drive Functions
118
  def get_google_auth_url():
119
- if not GOOGLE_DRIVE_AVAILABLE:
120
- return None
121
-
122
  client_config = GOOGLE_OAUTH_CONFIG["web"]
123
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
124
  {"web": client_config},
@@ -133,9 +133,6 @@ def get_google_auth_url():
133
  return authorization_url
134
 
135
  def exchange_code_for_credentials(auth_code):
136
- if not GOOGLE_DRIVE_AVAILABLE:
137
- return None, "Google Drive API not available. Install google-auth-oauthlib and google-api-python-client."
138
-
139
  if not auth_code.strip():
140
  return None, "No code provided."
141
  try:
@@ -154,9 +151,6 @@ def exchange_code_for_credentials(auth_code):
154
  return None, f"Error during token exchange: {e}"
155
 
156
  def google_drive_upload(file_path, credentials, folder_id=None):
157
- if not GOOGLE_DRIVE_AVAILABLE:
158
- return "Google Drive API not available"
159
-
160
  try:
161
  drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
162
  file_metadata = {'name': os.path.basename(file_path)}
@@ -169,59 +163,164 @@ def google_drive_upload(file_path, credentials, folder_id=None):
169
  return f"Error uploading to Drive: {str(e)}"
170
 
171
  def create_drive_folder(drive_service, name):
172
- if not GOOGLE_DRIVE_AVAILABLE:
173
- return None
174
-
175
  folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
176
  folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
177
  return folder.get('id')
178
 
179
- # Setup Playwright
180
- def setup_playwright_dependencies():
181
- """Install required system dependencies for Playwright"""
182
  try:
183
  # Install system dependencies
184
  subprocess.run(['apt-get', 'update', '-y'], check=True)
185
  packages = [
186
  'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
187
  'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
188
- 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
 
189
  ]
190
  subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
191
 
192
- # Install Playwright browser
193
- subprocess.run(['python', '-m', 'playwright', 'install', 'chromium'], check=True)
194
 
195
  st.success("Dependencies installed successfully!")
196
  return True
197
  except Exception as e:
198
  st.error(f"Error installing dependencies: {e}")
199
- st.info("You may need to manually install dependencies.")
200
  logger.error(f"Setup error: {e}")
201
  traceback.print_exc()
202
  return False
203
 
204
- # Download Manager Class
205
- class DownloadManager:
206
- def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
 
 
 
 
 
207
  self.use_proxy = use_proxy
208
  self.proxy = proxy
209
- self.query = query
210
- self.num_results = num_results
211
- self.use_stealth = use_stealth
212
- self.playwright = None
213
  self.browser = None
214
  self.context = None
215
  self.page = None
 
 
 
216
 
217
- # Create intercepted data folder if enabled
218
- if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
219
- os.makedirs(NETWORK_INTERCEPTOR_CONFIG["intercept_folder"], exist_ok=True)
220
-
221
- async def __aenter__(self):
222
  self.playwright = await async_playwright().start()
223
-
224
- # Configure browser launch options
225
  browser_args = [
226
  '--no-sandbox',
227
  '--disable-setuid-sandbox',
@@ -230,7 +329,7 @@ class DownloadManager:
230
  '--disable-features=IsolateOrigins,site-per-process',
231
  ]
232
 
233
- if self.use_stealth:
234
  browser_args.extend([
235
  '--disable-blink-features=AutomationControlled',
236
  '--disable-features=IsolateOrigins'
@@ -244,10 +343,8 @@ class DownloadManager:
244
  if self.use_proxy and self.proxy:
245
  launch_options["proxy"] = {"server": self.proxy}
246
 
247
- # Launch browser
248
  self.browser = await self.playwright.chromium.launch(**launch_options)
249
 
250
- # Configure context options
251
  context_options = {
252
  "viewport": {"width": 1920, "height": 1080},
253
  "user_agent": get_random_user_agent(),
@@ -256,10 +353,10 @@ class DownloadManager:
256
  "accept_downloads": True
257
  }
258
 
259
- # Create context and apply stealth features
260
  self.context = await self.browser.new_context(**context_options)
261
 
262
- if self.use_stealth:
 
263
  await self.context.add_init_script("""
264
  Object.defineProperty(navigator, 'webdriver', { get: () => false });
265
  Object.defineProperty(navigator, 'plugins', {
@@ -269,50 +366,221 @@ class DownloadManager:
269
  window.chrome = { runtime: {} };
270
  """)
271
 
272
- # Create page and set headers
273
  self.page = await self.context.new_page()
274
- await self.page.set_extra_http_headers({
275
- 'Accept-Language': 'en-US,en;q=0.9',
276
- 'Accept-Encoding': 'gzip, deflate, br',
277
- 'DNT': '1',
278
- 'Referer': 'https://www.google.com/',
279
- 'Sec-Fetch-Dest': 'document',
280
- 'Sec-Fetch-Mode': 'navigate',
281
- 'Sec-Fetch-Site': 'cross-site',
282
- 'Sec-Fetch-User': '?1',
283
- 'Upgrade-Insecure-Requests': '1'
284
- })
 
 
 
 
 
 
285
 
286
  return self
287
 
288
  async def __aexit__(self, exc_type, exc_val, exc_tb):
289
- if self.browser:
290
- await self.browser.close()
291
- if self.playwright:
292
- await self.playwright.stop()
293
 
294
- async def search_bing(self):
295
- """Search Bing for results"""
296
  urls = []
297
  try:
298
- search_url = f"https://www.bing.com/search?q={self.query}"
299
- await self.page.goto(search_url, timeout=30000)
300
- await self.page.wait_for_load_state('networkidle')
 
301
 
302
- # Extract search results
303
- links = await self.page.query_selector_all("li.b_algo h2 a")
304
- for link in links[:self.num_results]:
305
- href = await link.get_attribute('href')
306
- if href:
307
- urls.append(href)
 
 
 
 
 
308
 
309
  return urls
310
  except Exception as e:
311
- logger.error(f"Error searching Bing: {e}")
312
  return []
313
 
314
  async def get_file_size(self, url):
315
- """Get file size by making a HEAD request"""
316
  try:
317
  headers = {'User-Agent': get_random_user_agent()}
318
  response = requests.head(url, headers=headers, timeout=15)
@@ -325,10 +593,6 @@ class DownloadManager:
325
  return "Unknown Size"
326
 
327
  async def get_pdf_metadata(self, url):
328
- """Extract metadata from PDF files"""
329
- if not PdfReader:
330
- return {}
331
-
332
  try:
333
  headers = {'User-Agent': get_random_user_agent()}
334
  response = requests.get(url, headers=headers, timeout=15, stream=True)
@@ -346,7 +610,6 @@ class DownloadManager:
346
  return {}
347
 
348
  async def extract_real_download_url(self, url):
349
- """Follow redirects to get the final download URL"""
350
  try:
351
  headers = {'User-Agent': get_random_user_agent()}
352
  response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
@@ -356,7 +619,7 @@ class DownloadManager:
356
  return url
357
 
358
  async def get_edu_exam_links(self, url):
359
- """Specialized method for educational exam websites"""
360
  try:
361
  logger.info(f"Fetching exam links from {url}")
362
  links = set()
@@ -367,7 +630,7 @@ class DownloadManager:
367
  response = requests.get(url, headers=headers, timeout=30)
368
 
369
  if response.status_code == 200:
370
- # Parse with BeautifulSoup
371
  soup = BeautifulSoup(response.text, "html.parser")
372
  parsed_base = urlparse(url)
373
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -377,22 +640,26 @@ class DownloadManager:
377
  href = a["href"]
378
  full_url = urljoin(url, href)
379
 
380
- # Get link text
381
  link_text = a.get_text().lower()
382
 
383
- # Define patterns to look for
384
  url_patterns = [
385
  "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
386
  "/test/", "/download/", "/files/", "/assignments/",
387
- "paper_", "question_", "exam_", "test_", "past_"
 
 
388
  ]
389
 
390
  text_patterns = [
391
  "exam", "paper", "test", "question", "past", "download",
392
- "assignment", "sample", "study", "material", "notes"
 
 
393
  ]
394
 
395
- # Check for matches
396
  if any(pattern in full_url.lower() for pattern in url_patterns) or \
397
  any(pattern in link_text for pattern in text_patterns) or \
398
  any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
@@ -400,74 +667,48 @@ class DownloadManager:
400
  except Exception as e:
401
  logger.warning(f"Request-based extraction failed: {e}")
402
 
403
- # Use browser if few links were found or for specific sites
404
  if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
405
  logger.info("Using browser for enhanced link extraction")
406
 
407
- # Navigate to page
408
- await self.page.goto(url, timeout=45000)
409
 
410
- # Get page content
411
- content = await self.page.content()
412
  soup = BeautifulSoup(content, "html.parser")
413
  parsed_base = urlparse(url)
414
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
415
 
416
- # Find links in page
417
  for a in soup.find_all("a", href=True):
418
  href = a["href"]
419
  full_url = urljoin(url, href)
420
  link_text = a.get_text().lower()
421
 
422
- # Use the same patterns as above
423
  url_patterns = [
424
  "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
425
  "/test/", "/download/", "/files/", "/assignments/",
426
- "paper_", "question_", "exam_", "test_", "past_"
 
 
427
  ]
428
 
429
  text_patterns = [
430
  "exam", "paper", "test", "question", "past", "download",
431
- "assignment", "sample", "study", "material", "notes"
 
 
432
  ]
433
 
 
434
  if any(pattern in full_url.lower() for pattern in url_patterns) or \
435
  any(pattern in link_text for pattern in text_patterns) or \
436
  any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
437
  links.add(full_url)
438
-
439
- # Try to click on elements that might reveal more links
440
- try:
441
- # Find and click buttons that might show more content
442
- buttons = await self.page.query_selector_all('input[type="button"], button')
443
- for button in buttons:
444
- button_text = await button.text_content() or ""
445
- button_value = await button.get_attribute("value") or ""
446
-
447
- # Only click on promising buttons
448
- if any(keyword in (button_text + button_value).lower() for keyword in
449
- ["show", "view", "display", "list", "exam", "paper", "test"]):
450
- try:
451
- await button.click()
452
- await self.page.wait_for_timeout(1000)
453
-
454
- # Get any new links
455
- new_content = await self.page.content()
456
- new_soup = BeautifulSoup(new_content, "html.parser")
457
- for a in new_soup.find_all("a", href=True):
458
- href = a["href"]
459
- full_url = urljoin(url, href)
460
-
461
- # Check if it's a file link
462
- if any(full_url.lower().endswith(ext) for ext in
463
- ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
464
- links.add(full_url)
465
- except Exception as e:
466
- logger.warning(f"Error clicking button: {e}")
467
- except Exception as e:
468
- logger.warning(f"Error with interactive elements: {e}")
469
 
470
- # Filter links to likely contain exam documents
471
  filtered_links = []
472
  for link in links:
473
  # Common file extensions
@@ -478,7 +719,8 @@ class DownloadManager:
478
  # Common paths for exam documents
479
  if any(pattern in link.lower() for pattern in [
480
  "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
481
- "/pastpapers/", "/questionpapers/", "/tests/"
 
482
  ]):
483
  filtered_links.append(link)
484
 
@@ -490,7 +732,6 @@ class DownloadManager:
490
  return []
491
 
492
  async def extract_downloadable_files(self, url, custom_ext_list):
493
- """Extract all downloadable files from a webpage"""
494
  found_files = []
495
  try:
496
  # Special handling for educational exam sites
@@ -524,7 +765,7 @@ class DownloadManager:
524
 
525
  # Get metadata for PDFs
526
  meta = {}
527
- if real_url.lower().endswith('.pdf') and PdfReader:
528
  try:
529
  meta = await self.get_pdf_metadata(real_url)
530
  except Exception:
@@ -535,18 +776,18 @@ class DownloadManager:
535
  'filename': filename,
536
  'size': size_str,
537
  'metadata': meta,
538
- 'source_url': url # Keep track of source page
539
  })
540
 
541
  # If we found exam files with the specialized method, return them
542
  if found_files:
543
  return found_files
544
 
545
- # Standard extraction method for regular websites
546
- await self.page.goto(url, timeout=30000, wait_until='networkidle')
547
 
548
  # Get page content
549
- content = await self.page.content()
550
  soup = BeautifulSoup(content, 'html.parser')
551
 
552
  # Define file extensions to look for
@@ -566,7 +807,7 @@ class DownloadManager:
566
 
567
  # Handle PHP and download links separately
568
  if '.php' in href.lower() or 'download' in href.lower():
569
- full_url = href if href.startswith('http') else urljoin(url, href)
570
  real_url = await self.extract_real_download_url(full_url)
571
  if real_url and real_url != full_url:
572
  filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
@@ -581,10 +822,10 @@ class DownloadManager:
581
 
582
  # Check for direct file extensions
583
  if any(href.lower().endswith(ext) for ext in all_exts):
584
- file_url = href if href.startswith('http') else urljoin(url, href)
585
  size_str = await self.get_file_size(file_url)
586
  meta = {}
587
- if file_url.lower().endswith('.pdf') and PdfReader:
588
  meta = await self.get_pdf_metadata(file_url)
589
  found_files.append({
590
  'url': file_url,
@@ -604,7 +845,7 @@ class DownloadManager:
604
  break
605
 
606
  if file_id:
607
- # Determine if it's view-only
608
  is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))
609
 
610
  filename = f"gdrive_{file_id}"
@@ -628,7 +869,7 @@ class DownloadManager:
628
  for elem in soup.find_all(elem_tag):
629
  src = elem.get('src') or elem.get('data')
630
  if src and any(src.lower().endswith(ext) for ext in all_exts):
631
- file_url = src if src.startswith('http') else urljoin(url, src)
632
  found_files.append({
633
  'url': file_url,
634
  'filename': os.path.basename(file_url.split('?')[0]),
@@ -652,12 +893,12 @@ class DownloadManager:
652
  return []
653
 
654
  async def download_file(self, file_info, save_dir, referer=None):
655
- """Download a file and save it to disk"""
656
  file_url = file_info['url']
657
  fname = file_info['filename']
658
  referer = referer or file_info.get('source_url', 'https://www.google.com')
659
 
660
- # Create unique filename
661
  path = os.path.join(save_dir, fname)
662
  base, ext = os.path.splitext(fname)
663
  counter = 1
@@ -670,7 +911,7 @@ class DownloadManager:
670
  try:
671
  # Special handling for Google Drive files
672
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
673
- # For view-only Google Drive files
674
  is_view_only = file_info.get('metadata', {}).get('view_only', False)
675
  if is_view_only:
676
  result_path = await self.download_viewonly_google_drive(file_info, path)
@@ -726,7 +967,7 @@ class DownloadManager:
726
  return None
727
 
728
  async def download_viewonly_google_drive(self, file_info, save_path):
729
- """Download view-only Google Drive documents using Playwright"""
730
  try:
731
  # Extract file ID
732
  file_id = file_info.get('metadata', {}).get('file_id')
@@ -752,147 +993,173 @@ class DownloadManager:
752
 
753
  logger.info(f"Downloading view-only Google Drive file: {file_id}")
754
 
755
- # Create a dedicated browser instance for this operation
756
- async with async_playwright() as p:
757
- browser = await p.chromium.launch(
758
- headless=True,
759
- args=[
760
- '--no-sandbox',
761
- '--disable-setuid-sandbox',
762
- '--disable-dev-shm-usage',
763
- '--disable-web-security',
764
- '--disable-features=IsolateOrigins,site-per-process',
765
- '--disable-blink-features=AutomationControlled'
766
- ]
767
- )
768
-
769
- # Create context
770
- context = await browser.new_context(
771
- viewport={'width': 1600, 'height': 1200},
772
- user_agent=get_random_user_agent(),
773
- accept_downloads=True
774
- )
775
 
776
- # Add stealth script
777
- await context.add_init_script("""
778
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
779
- Object.defineProperty(navigator, 'plugins', {
780
- get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
781
- });
782
- Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
783
- window.chrome = { runtime: {} };
784
- """)
785
-
786
- page = await context.new_page()
787
-
788
- try:
789
- # Navigate to the file
790
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
791
- await page.wait_for_load_state('networkidle')
792
- await page.wait_for_timeout(5000) # Wait for rendering
 
 
 
793
 
794
- # Create temp directory
795
- temp_dir = tempfile.mkdtemp()
796
 
797
- # For PDF files, take screenshots of each page
798
- if file_type == 'pdf':
799
- # Create directory for screenshots
800
- screenshots_dir = os.path.join(temp_dir, "screenshots")
801
- os.makedirs(screenshots_dir, exist_ok=True)
802
 
803
- # Get page count estimation
804
- total_pages = await page.evaluate("""
805
- () => {
806
- // Look for page counters
807
- const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
808
- const text = el.textContent || '';
809
- return /\\d+\\s*\\/\\s*\\d+/.test(text);
810
- });
811
-
812
- if (pageCounters.length > 0) {
813
- const text = pageCounters[0].textContent || '';
814
- const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
815
- if (match && match[2]) return parseInt(match[2]);
816
- }
817
-
818
- // Look for page elements
819
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
820
- if (pages.length > 0) return pages.length;
821
-
822
- // Default
823
- return 20;
824
- }
825
- """)
826
 
827
- logger.info(f"PDF has approximately {total_pages} pages")
 
828
 
829
- # Capture screenshots page by page
830
- screenshots = []
831
- for i in range(min(total_pages, 100)): # Limit to 100 pages
832
- try:
833
- # Go to specific page
834
- if i > 0:
835
- next_button = await page.query_selector('button[aria-label="Next page"]')
836
- if next_button:
837
- await next_button.click()
838
- await page.wait_for_timeout(1000)
 
 
 
 
 
 
839
  else:
840
- break
 
 
 
 
841
 
842
- # Take screenshot
843
- screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
 
844
 
845
- # Try to find page element
846
- page_element = await page.query_selector('.drive-viewer-paginated-page')
847
- if page_element:
848
- await page_element.screenshot(path=screenshot_path)
849
- else:
850
- await page.screenshot(path=screenshot_path)
851
 
852
- screenshots.append(screenshot_path)
853
- except Exception as e:
854
- logger.error(f"Error capturing page {i+1}: {e}")
855
- continue
856
-
857
- # Create PDF from screenshots
858
- if screenshots:
859
- # Get dimensions from first screenshot
860
- first_img = Image.open(screenshots[0])
861
- width, height = first_img.size
862
-
863
- # Create PDF
864
- c = canvas.Canvas(save_path, pagesize=(width, height))
865
- for screenshot in screenshots:
866
- c.drawImage(screenshot, 0, 0, width, height)
867
- c.showPage()
868
- c.save()
869
 
870
- # Clean up screenshots
871
- for screenshot in screenshots:
872
- os.remove(screenshot)
873
 
874
- # Clean up temp directory
 
875
  shutil.rmtree(temp_dir, ignore_errors=True)
876
 
877
  return save_path
878
- else:
879
- logger.error("No screenshots captured")
880
- else:
881
- # For non-PDF files, just take a screenshot
882
- screenshot_path = os.path.join(temp_dir, "file.png")
883
- await page.screenshot(path=screenshot_path)
884
-
885
- # Copy to destination
886
- shutil.copy(screenshot_path, save_path)
887
-
888
- # Clean up
889
- os.remove(screenshot_path)
890
- shutil.rmtree(temp_dir, ignore_errors=True)
891
-
892
- return save_path
893
-
894
- finally:
895
- await browser.close()
896
 
897
  return None
898
  except Exception as e:
@@ -900,7 +1167,7 @@ class DownloadManager:
900
  return None
901
 
902
  async def get_sublinks(self, url, limit=10000):
903
- """Extract all sublinks from a webpage"""
904
  links = set()
905
  try:
906
  logger.info(f"Extracting sublinks from {url}")
@@ -916,17 +1183,18 @@ class DownloadManager:
916
  logger.info(f"Found {len(links)} sublinks with specialized method")
917
  return list(links)[:limit]
918
 
919
- # Navigate to the page
920
- await self.page.goto(url, timeout=30000)
921
 
922
  # Get page content
923
- content = await self.page.content()
924
  soup = BeautifulSoup(content, 'html.parser')
925
 
926
- # Extract all links from the page
927
  parsed_base = urlparse(url)
928
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
929
 
 
930
  for a in soup.find_all('a', href=True):
931
  href = a['href']
932
  if href and not href.startswith('javascript:') and not href.startswith('#'):
@@ -952,12 +1220,85 @@ class DownloadManager:
952
  logger.error(f"Error extracting sublinks: {e}")
953
  return list(links)[:limit]
 
 
 
 
 
 
 
 
955
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
956
- """Perform deep search for files on website and its subpages"""
957
  if not custom_ext_list:
958
  custom_ext_list = []
959
 
960
- # Set up progress indicators
961
  progress_text = st.empty()
962
  progress_bar = st.progress(0)
963
  file_count_text = st.empty()
@@ -976,23 +1317,22 @@ class DownloadManager:
976
  total_links = len(sublinks)
977
  progress_text.text(f"Found {total_links} sublinks to process")
978
 
979
- # Always include main page files
980
  all_files = main_files.copy()
981
 
982
- # Process each sublink if there are any
983
- if sublinks:
984
- for i, sublink in enumerate(sublinks, 1):
985
- progress = i / max(total_links, 1) # Avoid division by zero
986
- progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
987
- progress_bar.progress(progress)
988
-
989
- try:
990
- # Extract files from sublink
991
- sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
992
- all_files.extend(sub_files)
993
- file_count_text.text(f"Found {len(all_files)} total files")
994
- except Exception as e:
995
- logger.warning(f"Error processing sublink {sublink}: {e}")
996
 
997
  # Deduplicate files
998
  seen_urls = set()
@@ -1020,7 +1360,7 @@ class DownloadManager:
1020
  progress_text.empty()
1021
  progress_bar.empty()
1022
 
1023
- # Main App
1024
  def main():
1025
  st.title("Advanced File Downloader")
1026
 
@@ -1029,70 +1369,91 @@ def main():
1029
  st.session_state.initialized = True
1030
  st.session_state.discovered_files = []
1031
  st.session_state.current_url = None
 
1032
  st.session_state.selected_files = []
1033
  st.session_state.do_deep_search = False
1034
  st.session_state.deep_search_url = None
1035
  st.session_state.search_results = []
1036
  st.session_state.download_urls = {} # For direct download links
1037
 
1038
- # Install Playwright if needed
1039
- if "playwright_installed" not in st.session_state:
1040
- with st.spinner("Setting up Playwright. This may take a minute..."):
1041
- st.session_state.playwright_installed = setup_playwright_dependencies()
 
1042
 
1043
- # Sidebar settings
1044
  with st.sidebar:
1045
- mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Google Drive"], key="mode_select")
1046
 
1047
- with st.expander("Advanced Options", expanded=True):
 
 
1048
  custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
1049
  help="Enter extensions like .csv, .txt")
1050
  max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
1051
  sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
 
 
1052
  use_proxy = st.checkbox("Use Proxy", key="use_proxy")
1053
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
1054
  use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
1055
  help="Makes browser harder to detect as automated")
 
 
 
 
 
 
 
 
 
 
 
 
1056
 
1057
- if GOOGLE_DRIVE_AVAILABLE:
1058
- with st.expander("Google Drive Integration", expanded=False):
1059
- if st.button("Start Google Sign-In", key="google_signin_btn"):
1060
- auth_url = get_google_auth_url()
1061
- st.markdown(f"[Click here to authorize]({auth_url})")
1062
- auth_code = st.text_input("Enter authorization code", key="auth_code_input")
1063
- if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
1064
- creds, msg = exchange_code_for_credentials(auth_code)
1065
- st.session_state.google_creds = creds
1066
- st.write(msg)
1067
 
1068
  # Main content area
1069
  if mode == "Manual URL":
1070
  st.header("Manual URL Mode")
1071
  url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")
1072
 
1073
- if st.button("Deep Search", key="deep_search_btn"):
1074
- if url:
1075
- # Process custom extensions
1076
- custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
1077
-
1078
- with st.spinner("Searching for files..."):
1079
- async def run_deep_search():
1080
- async with DownloadManager(
1081
- use_proxy=use_proxy,
1082
- proxy=proxy,
1083
- use_stealth=use_stealth
1084
- ) as dm:
1085
- files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
1086
- return files
1087
-
1088
- files = asyncio.run(run_deep_search())
1089
 
1090
- if files:
1091
- st.session_state.discovered_files = files
1092
- st.session_state.current_url = url
1093
- st.success(f"Found {len(files)} files!")
1094
- else:
1095
- st.warning("No files found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1096
 
1097
  # Display and process discovered files
1098
  if st.session_state.discovered_files:
@@ -1121,6 +1482,12 @@ def main():
1121
  file_info = f"{filename} ({size})"
1122
 
1123
  file_options.append((i, file_info))
 
 
1124
 
1125
  # File selection multiselect
1126
  selected_indices = st.multiselect(
@@ -1133,7 +1500,7 @@ def main():
1133
 
1134
  st.session_state.selected_files = selected_indices
1135
 
1136
- # Display individual download buttons
1137
  if files:
1138
  st.subheader("Available Files")
1139
  for i, file in enumerate(files):
@@ -1141,8 +1508,8 @@ def main():
1141
  st.write(f"Source: {file.get('source_url', 'Unknown')}")
1142
  st.write(f"URL: {file['url']}")
1143
 
1144
- # Download button for this file
1145
- if st.button(f"Download", key=f"download_single_{i}"):
1146
  with st.spinner(f"Downloading {file['filename']}..."):
1147
  # Create downloads directory
1148
  download_dir = "./downloads"
@@ -1151,6 +1518,7 @@ def main():
1151
  # Download the file
1152
  async def download_single():
1153
  async with DownloadManager(
 
1154
  use_proxy=use_proxy,
1155
  proxy=proxy,
1156
  use_stealth=use_stealth
@@ -1183,13 +1551,15 @@ def main():
1183
  if selected_indices:
1184
  st.subheader("Batch Download Options")
1185
 
1186
- col1, col2, col3 = st.columns(3)
1187
  with col1:
1188
  download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
1189
  with col2:
1190
  create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
1191
  with col3:
1192
  delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
 
 
1193
 
1194
  if st.button("Download Selected Files", key="batch_download_btn"):
1195
  with st.spinner(f"Downloading {len(selected_indices)} files..."):
@@ -1203,6 +1573,7 @@ def main():
1203
 
1204
  async def download_batch():
1205
  async with DownloadManager(
 
1206
  use_proxy=use_proxy,
1207
  proxy=proxy,
1208
  use_stealth=use_stealth
@@ -1243,6 +1614,24 @@ def main():
1243
  key="download_zip_btn"
1244
  )
 
 
 
 
1246
  # Delete original files if requested
1247
  if delete_after:
1248
  for path in downloaded_paths:
@@ -1261,16 +1650,17 @@ def main():
1261
 
1262
  if st.button("Search", key="web_search_btn"):
1263
  if query:
1264
- with st.spinner("Searching..."):
1265
  async def run_search():
1266
  async with DownloadManager(
 
1267
  use_proxy=use_proxy,
1268
  proxy=proxy,
1269
  query=query,
1270
  num_results=num_results,
1271
  use_stealth=use_stealth
1272
  ) as dm:
1273
- urls = await dm.search_bing()
1274
  return urls
1275
 
1276
  urls = asyncio.run(run_search())
@@ -1303,6 +1693,7 @@ def main():
1303
  with st.spinner("Searching for files..."):
1304
  async def deep_search_result():
1305
  async with DownloadManager(
 
1306
  use_proxy=use_proxy,
1307
  proxy=proxy,
1308
  use_stealth=use_stealth
@@ -1318,63 +1709,131 @@ def main():
1318
  else:
1319
  st.warning("No files found on this page.")
1320
 
1321
- elif mode == "Google Drive" and GOOGLE_DRIVE_AVAILABLE:
1322
- st.header("Google Drive Download")
1323
 
1324
  # View-only Google Drive download
1325
- st.write("Download protected/view-only Google Drive documents")
1326
-
1327
- file_id = st.text_input(
1328
- "Google Drive File ID",
1329
- placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
1330
- key="drive_file_id"
1331
- )
 
 
 
 
 
 
 
1332
 
1333
- if st.button("Download Document", key="drive_download_btn") and file_id:
1334
- with st.spinner("Downloading view-only document... (this may take a minute)"):
1335
- # Create download directory
1336
- download_dir = "./downloads"
1337
- os.makedirs(download_dir, exist_ok=True)
1338
-
1339
- # Set output path
1340
- output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
1341
-
1342
- # Download the file
1343
- async def download_drive_file():
1344
- async with DownloadManager(
1345
- use_proxy=use_proxy,
1346
- proxy=proxy,
1347
- use_stealth=use_stealth
1348
- ) as dm:
1349
- file_info = {
1350
- 'url': f"https://drive.google.com/file/d/{file_id}/view",
1351
- 'filename': f"gdrive_{file_id}.pdf",
1352
- 'metadata': {'file_id': file_id, 'view_only': True}
1353
- }
1354
- return await dm.download_viewonly_google_drive(file_info, output_path)
1355
-
1356
- result_path = asyncio.run(download_drive_file())
1357
-
1358
- if result_path:
1359
- st.success("Document downloaded successfully!")
1360
 
1361
- # Provide download link
1362
- with open(result_path, "rb") as f:
1363
- file_bytes = f.read()
 
 
1364
 
1365
- st.download_button(
1366
- label="Download PDF",
1367
- data=file_bytes,
1368
- file_name=os.path.basename(result_path),
1369
- mime="application/pdf",
1370
- key="drive_pdf_download"
1371
- )
1372
- else:
1373
- st.error("Failed to download the document. Please check the file ID and try again.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1374
 
1375
  # Footer
1376
  st.markdown("---")
1377
- st.markdown("Created by [Euler314](https://github.com/euler314) | Advanced File Downloader")
1378
 
1379
  # Run the app
1380
  if __name__ == "__main__":
 
 
1
  import streamlit as st
2
  import os
3
  import asyncio
 
21
  from reportlab.lib.pagesizes import letter
22
  from reportlab.pdfgen import canvas
23
 
24
+ # Advanced imports
 
 
25
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
26
+ from bs4 import BeautifulSoup
27
+ from PyPDF2 import PdfReader
28
+ import google_auth_oauthlib.flow
29
+ import googleapiclient.discovery
30
+ import google.auth.transport.requests
31
+ import googleapiclient.http
32
+ import requests
33
+ import celery
34
+ from celery import Celery
35
+ import splash
36
+ import pyppeteer
37
+ import mitmproxy
38
+ from mitmproxy import http
 
 
39
 
40
  # Configure page and logging
41
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
42
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
43
  logger = logging.getLogger(__name__)
44
 
45
+ # Initialize Celery for distributed task processing
46
+ celery_app = Celery('file_downloader', broker='redis://localhost:6379/0')
47
+
48
+ # Configure Google OAuth
49
  GOOGLE_OAUTH_CONFIG = {
50
  "web": {
51
  "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
 
58
  }
59
  }
60
 
61
+ # -------------------- User Agent Settings --------------------
62
  USER_AGENTS = [
63
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
64
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
 
66
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
67
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
68
  'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
69
+ 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
70
  ]
71
 
72
+ # -------------------- Proxy Management --------------------
73
+ PROXY_POOL = []
74
+ CURRENT_PROXY_INDEX = 0
75
+
76
+ # -------------------- Network Interception Configuration --------------------
77
  NETWORK_INTERCEPTOR_CONFIG = {
78
  "enabled": False,
79
  "intercept_types": ["xhr", "fetch", "document", "media"],
 
81
  "intercept_folder": "./intercepted_data"
82
  }
83
 
84
+ # -------------------- Utility Functions --------------------
85
  def get_random_user_agent():
86
  return random.choice(USER_AGENTS)
87
 
 
117
  """Check if URL is a valid file URL based on extension"""
118
  return any(url.lower().endswith(ext) for ext in extensions)
119
 
120
+ # -------------------- Google Drive Functions --------------------
121
  def get_google_auth_url():
 
 
 
122
  client_config = GOOGLE_OAUTH_CONFIG["web"]
123
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
124
  {"web": client_config},
 
133
  return authorization_url
134
 
135
  def exchange_code_for_credentials(auth_code):
 
 
 
136
  if not auth_code.strip():
137
  return None, "No code provided."
138
  try:
 
151
  return None, f"Error during token exchange: {e}"
152
 
153
  def google_drive_upload(file_path, credentials, folder_id=None):
 
 
 
154
  try:
155
  drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
156
  file_metadata = {'name': os.path.basename(file_path)}
 
163
  return f"Error uploading to Drive: {str(e)}"
164
 
165
  def create_drive_folder(drive_service, name):
 
 
 
166
  folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
167
  folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
168
  return folder.get('id')
169
 
170
+ # -------------------- Setup Functions --------------------
171
+ def setup_dependencies():
172
+ """Install required system dependencies"""
173
  try:
174
  # Install system dependencies
175
  subprocess.run(['apt-get', 'update', '-y'], check=True)
176
  packages = [
177
  'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
178
  'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
179
+ 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
180
+ 'redis-server', 'python3-dev', 'build-essential'
181
  ]
182
  subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
183
 
184
+ # Install Python packages
185
+ subprocess.run(['pip', 'install', 'playwright', 'pyppeteer', 'splash', 'celery[redis]', 'mitmproxy'], check=True)
186
+
187
+ # Install browsers
188
+ subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
189
+ subprocess.run(['python3', '-m', 'pyppeteer', 'install'], check=True)
190
 
191
  st.success("Dependencies installed successfully!")
192
  return True
193
  except Exception as e:
194
  st.error(f"Error installing dependencies: {e}")
195
+ st.info("You may need to manually install dependencies. Check console for details.")
196
  logger.error(f"Setup error: {e}")
197
  traceback.print_exc()
198
  return False
199
 
200
+ def check_services():
201
+ """Check if required services are running"""
202
+ try:
203
+ # Check Redis for Celery
204
+ redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG'
205
+ if not redis_running:
206
+ # Try to start Redis
207
+ subprocess.run(['service', 'redis-server', 'start'], check=True)
208
+
209
+ # Create directories for intercepted data
210
+ os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True)
211
+
212
+ return True
213
+ except Exception as e:
214
+ logger.error(f"Service check error: {e}")
215
+ return False
216
+
217
+ # -------------------- Network Interception Classes --------------------
218
+ class NetworkInterceptor:
219
+ """Class to intercept network traffic using mitmproxy"""
220
+
221
+ def __init__(self, intercept_types=None, save_path=None):
222
+ self.intercept_types = intercept_types or ["xhr", "fetch", "document"]
223
+ self.save_path = save_path or "./intercepted_data"
224
+ os.makedirs(self.save_path, exist_ok=True)
225
+ self.captured_data = []
226
+
227
+ def intercept_request(self, flow):
228
+ """Process intercepted requests"""
229
+ try:
230
+ url = flow.request.url
231
+ method = flow.request.method
232
+ content_type = flow.request.headers.get("Content-Type", "")
233
+
234
+ # Log the request
235
+ self.captured_data.append({
236
+ "type": "request",
237
+ "url": url,
238
+ "method": method,
239
+ "headers": dict(flow.request.headers),
240
+ "timestamp": time.time()
241
+ })
242
+
243
+ logger.info(f"Intercepted {method} request to {url}")
244
+ except Exception as e:
245
+ logger.error(f"Error intercepting request: {e}")
246
+
247
+ def intercept_response(self, flow):
248
+ """Process intercepted responses"""
249
+ try:
250
+ url = flow.request.url
251
+ status_code = flow.response.status_code
252
+ content_type = flow.response.headers.get("Content-Type", "")
253
+
254
+ # Only process responses of interest based on content type
255
+ if any(t in content_type.lower() for t in ["application/pdf", "application/msword",
256
+ "application/vnd.openxmlformats",
257
+ "application/zip"]):
258
+ # Save the file
259
+ filename = os.path.basename(urlparse(url).path)
260
+ if not filename or filename == '/':
261
+ filename = f"file_{int(time.time())}"
262
+
263
+ # Try to add extension based on content type
264
+ if "pdf" in content_type:
265
+ filename += ".pdf"
266
+ elif "msword" in content_type:
267
+ filename += ".doc"
268
+ elif "openxmlformats" in content_type and "wordprocessingml" in content_type:
269
+ filename += ".docx"
270
+ elif "zip" in content_type:
271
+ filename += ".zip"
272
+
273
+ file_path = os.path.join(self.save_path, filename)
274
+ with open(file_path, "wb") as f:
275
+ f.write(flow.response.content)
276
+
277
+ logger.info(f"Saved intercepted file: {file_path}")
278
+
279
+ # Record metadata about the captured file
280
+ self.captured_data.append({
281
+ "type": "file",
282
+ "url": url,
283
+ "content_type": content_type,
284
+ "size": len(flow.response.content),
285
+ "path": file_path,
286
+ "timestamp": time.time()
287
+ })
288
+ except Exception as e:
289
+ logger.error(f"Error intercepting response: {e}")
290
+
291
+ def get_captured_files(self):
292
+ """Return list of captured files"""
293
+ return [item for item in self.captured_data if item["type"] == "file"]
294
+
295
+ # -------------------- Browser Automation Classes --------------------
296
+ class MultiEngineBrowser:
297
+ """Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)"""
298
+
299
+ def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True):
300
+ self.engine = engine
301
  self.use_proxy = use_proxy
302
  self.proxy = proxy
303
+ self.stealth = stealth
 
 
 
304
  self.browser = None
305
  self.context = None
306
  self.page = None
307
+
308
+ async def setup(self):
309
+ """Initialize browser based on selected engine"""
310
+ if self.engine == "playwright":
311
+ return await self.setup_playwright()
312
+ elif self.engine == "pyppeteer":
313
+ return await self.setup_pyppeteer()
314
+ elif self.engine == "splash":
315
+ return await self.setup_splash()
316
+ else:
317
+ raise ValueError(f"Unsupported browser engine: {self.engine}")
318
+
319
+ async def setup_playwright(self):
320
+ """Setup Playwright browser"""
321
+ from playwright.async_api import async_playwright
322
 
 
 
 
 
 
323
  self.playwright = await async_playwright().start()
 
 
324
  browser_args = [
325
  '--no-sandbox',
326
  '--disable-setuid-sandbox',
 
329
  '--disable-features=IsolateOrigins,site-per-process',
330
  ]
331
 
332
+ if self.stealth:
333
  browser_args.extend([
334
  '--disable-blink-features=AutomationControlled',
335
  '--disable-features=IsolateOrigins'
 
343
  if self.use_proxy and self.proxy:
344
  launch_options["proxy"] = {"server": self.proxy}
345
 
 
346
  self.browser = await self.playwright.chromium.launch(**launch_options)
347
 
 
348
  context_options = {
349
  "viewport": {"width": 1920, "height": 1080},
350
  "user_agent": get_random_user_agent(),
 
353
  "accept_downloads": True
354
  }
355
 
 
356
  self.context = await self.browser.new_context(**context_options)
357
 
358
+ # Apply stealth features
359
+ if self.stealth:
360
  await self.context.add_init_script("""
361
  Object.defineProperty(navigator, 'webdriver', { get: () => false });
362
  Object.defineProperty(navigator, 'plugins', {
 
366
  window.chrome = { runtime: {} };
367
  """)
368
 
 
369
  self.page = await self.context.new_page()
370
+ return self.page
371
+
372
+ async def setup_pyppeteer(self):
373
+ """Setup Pyppeteer browser"""
374
+ from pyppeteer import launch
375
+
376
+ browser_args = [
377
+ '--no-sandbox',
378
+ '--disable-setuid-sandbox',
379
+ '--disable-dev-shm-usage',
380
+ '--disable-web-security',
381
+ ]
382
+
383
+ if self.stealth:
384
+ browser_args.extend([
385
+ '--disable-blink-features=AutomationControlled',
386
+ '--disable-features=IsolateOrigins'
387
+ ])
388
+
389
+ launch_options = {
390
+ "headless": True,
391
+ "args": browser_args,
392
+ "ignoreHTTPSErrors": True,
393
+ "userDataDir": tempfile.mkdtemp()
394
+ }
395
+
396
+ if self.use_proxy and self.proxy:
397
+ browser_args.append(f'--proxy-server={self.proxy}')
398
+
399
+ self.browser = await launch(launch_options)
400
+ self.page = await self.browser.newPage()
401
+
402
+ # Set user agent
403
+ await self.page.setUserAgent(get_random_user_agent())
404
+
405
+ # Set viewport
406
+ await self.page.setViewport({"width": 1920, "height": 1080})
407
+
408
+ # Apply stealth features
409
+ if self.stealth:
410
+ await self.page.evaluateOnNewDocument("""
411
+ Object.defineProperty(navigator, 'webdriver', { get: () => false });
412
+ Object.defineProperty(navigator, 'plugins', {
413
+ get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
414
+ });
415
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
416
+ window.chrome = { runtime: {} };
417
+ """)
418
+
419
+ return self.page
420
+
421
+ async def setup_splash(self):
422
+ """Setup Splash browser through API"""
423
+ # Splash is typically used via HTTP API
424
+ # We'll use requests for this
425
+ self.splash_url = "http://localhost:8050/render.html"
426
+ return None # No actual page object for Splash
427
+
428
+ async def goto(self, url, wait_until=None, timeout=30000):
429
+ """Navigate to a URL"""
430
+ if self.engine == "playwright":
431
+ return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout)
432
+ elif self.engine == "pyppeteer":
433
+ return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout)
434
+ elif self.engine == "splash":
435
+ # Use Splash HTTP API
436
+ params = {
437
+ "url": url,
438
+ "wait": min(timeout/1000, 30), # Splash uses seconds
439
+ "timeout": min(timeout/1000, 60),
440
+ "resource_timeout": min(timeout/1000, 30),
441
+ "html": 1,
442
+ "png": 0,
443
+ "render_all": 1
444
+ }
445
+
446
+ if self.use_proxy and self.proxy:
447
+ params["proxy"] = self.proxy
448
+
449
+ headers = {"User-Agent": get_random_user_agent()}
450
+ response = requests.get(self.splash_url, params=params, headers=headers)
451
+ self.last_html = response.text
452
+ return response
453
+
454
+ async def content(self):
455
+ """Get page content"""
456
+ if self.engine == "playwright":
457
+ return await self.page.content()
458
+ elif self.engine == "pyppeteer":
459
+ return await self.page.content()
460
+ elif self.engine == "splash":
461
+ return self.last_html
462
+
463
+ async def close(self):
464
+ """Close browser"""
465
+ if self.engine == "playwright":
466
+ if self.browser:
467
+ await self.browser.close()
468
+ if self.playwright:
469
+ await self.playwright.stop()
470
+ elif self.engine == "pyppeteer":
471
+ if self.browser:
472
+ await self.browser.close()
473
+ # No cleanup needed for Splash as it's stateless
474
+
475
+ # -------------------- Download Manager Class --------------------
476
+ class DownloadManager:
477
+ def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
478
+ self.browser_engine = browser_engine
479
+ self.use_proxy = use_proxy
480
+ self.proxy = proxy
481
+ self.query = query
482
+ self.num_results = num_results
483
+ self.use_stealth = use_stealth
484
+ self.browser = None
485
+ self.network_interceptor = None
486
+
487
+ # Configure network interception if enabled
488
+ if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
489
+ self.network_interceptor = NetworkInterceptor(
490
+ intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"],
491
+ save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"]
492
+ )
493
+
494
+ async def __aenter__(self):
495
+ # Initialize multi-engine browser
496
+ self.browser = MultiEngineBrowser(
497
+ engine=self.browser_engine,
498
+ use_proxy=self.use_proxy,
499
+ proxy=self.proxy,
500
+ stealth=self.use_stealth
501
+ )
502
+ self.page = await self.browser.setup()
503
+
504
+ # Set headers for better stealth
505
+ if self.browser_engine == "playwright":
506
+ await self.page.set_extra_http_headers({
507
+ 'Accept-Language': 'en-US,en;q=0.9',
508
+ 'Accept-Encoding': 'gzip, deflate, br',
509
+ 'DNT': '1',
510
+ 'Referer': 'https://www.google.com/',
511
+ 'Sec-Fetch-Dest': 'document',
512
+ 'Sec-Fetch-Mode': 'navigate',
513
+ 'Sec-Fetch-Site': 'cross-site',
514
+ 'Sec-Fetch-User': '?1',
515
+ 'Upgrade-Insecure-Requests': '1'
516
+ })
517
 
518
  return self
519
 
520
  async def __aexit__(self, exc_type, exc_val, exc_tb):
521
+ await self.browser.close()
 
 
 
522
 
523
+ async def search_web(self, search_engine="bing"):
524
+ """Search web using specified search engine"""
525
  urls = []
526
  try:
527
+ if search_engine == "bing":
528
+ search_url = f"https://www.bing.com/search?q={self.query}"
529
+ elif search_engine == "google":
530
+ search_url = f"https://www.google.com/search?q={self.query}"
531
+ else:
532
+ raise ValueError(f"Unsupported search engine: {search_engine}")
533
+
534
+ await self.browser.goto(search_url, timeout=30000)
535
 
536
+ if self.browser_engine == "playwright":
537
+ if search_engine == "bing":
538
+ links = await self.page.query_selector_all("li.b_algo h2 a")
539
+ for link in links[:self.num_results]:
540
+ href = await link.get_attribute('href')
541
+ if href:
542
+ urls.append(href)
543
+ elif search_engine == "google":
544
+ links = await self.page.query_selector_all("div.g a[href^='http']")
545
+ for link in links[:self.num_results]:
546
+ href = await link.get_attribute('href')
547
+ if href:
548
+ urls.append(href)
549
+ elif self.browser_engine == "pyppeteer":
550
+ if search_engine == "bing":
551
+ links = await self.page.querySelectorAll("li.b_algo h2 a")
552
+ for link in links[:self.num_results]:
553
+ href = await self.page.evaluate('el => el.getAttribute("href")', link)
554
+ if href:
555
+ urls.append(href)
556
+ elif search_engine == "google":
557
+ links = await self.page.querySelectorAll("div.g a[href^='http']")
558
+ for link in links[:self.num_results]:
559
+ href = await self.page.evaluate('el => el.getAttribute("href")', link)
560
+ if href:
561
+ urls.append(href)
562
+ elif self.browser_engine == "splash":
563
+ # Parse the HTML with BeautifulSoup
564
+ soup = BeautifulSoup(self.browser.last_html, 'html.parser')
565
+ if search_engine == "bing":
566
+ links = soup.select("li.b_algo h2 a")
567
+ for link in links[:self.num_results]:
568
+ href = link.get("href")
569
+ if href:
570
+ urls.append(href)
571
+ elif search_engine == "google":
572
+ links = soup.select("div.g a[href^='http']")
573
+ for link in links[:self.num_results]:
574
+ href = link.get("href")
575
+ if href:
576
+ urls.append(href)
577
 
578
  return urls
579
  except Exception as e:
580
+ logger.error(f"Error searching web: {e}")
581
  return []
582
 
583
  async def get_file_size(self, url):
 
584
  try:
585
  headers = {'User-Agent': get_random_user_agent()}
586
  response = requests.head(url, headers=headers, timeout=15)
 
593
  return "Unknown Size"
594
 
595
  async def get_pdf_metadata(self, url):
 
 
 
 
596
  try:
597
  headers = {'User-Agent': get_random_user_agent()}
598
  response = requests.get(url, headers=headers, timeout=15, stream=True)
 
610
  return {}
611
 
612
  async def extract_real_download_url(self, url):
 
613
  try:
614
  headers = {'User-Agent': get_random_user_agent()}
615
  response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
 
619
  return url
620
 
621
     async def get_edu_exam_links(self, url):
+        """Specialized method for educational exam websites that follows a common pattern."""
         try:
             logger.info(f"Fetching exam links from {url}")
             links = set()

             response = requests.get(url, headers=headers, timeout=30)

             if response.status_code == 200:
+                # Parse with BeautifulSoup for efficiency
                 soup = BeautifulSoup(response.text, "html.parser")
                 parsed_base = urlparse(url)
                 base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

                     href = a["href"]
                     full_url = urljoin(url, href)

+                    # Look for text clues
                     link_text = a.get_text().lower()

+                    # Special patterns for exam sites (expanded list)
                     url_patterns = [
                         "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                         "/test/", "/download/", "/files/", "/assignments/",
+                        "paper_", "question_", "exam_", "test_", "past_",
+                        "assignment_", "sample_", "study_material", "notes_",
+                        "/resource/", "/subject/", "/course/", "/material/"
                     ]

                     text_patterns = [
                         "exam", "paper", "test", "question", "past", "download",
+                        "assignment", "sample", "study", "material", "notes",
+                        "subject", "course", "resource", "pdf", "document",
+                        "view", "open", "get", "solution", "answer"
                     ]

+                    # Check URL and text patterns
                     if any(pattern in full_url.lower() for pattern in url_patterns) or \
                        any(pattern in link_text for pattern in text_patterns) or \
                        any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):

         except Exception as e:
             logger.warning(f"Request-based extraction failed: {e}")

+        # Use browser-based approach if needed
         if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
             logger.info("Using browser for enhanced link extraction")

+            # Navigate to the page
+            await self.browser.goto(url, timeout=45000)

+            # Get page content and parse with BeautifulSoup
+            content = await self.browser.content()
             soup = BeautifulSoup(content, "html.parser")
             parsed_base = urlparse(url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

+            # Process all links on the page
             for a in soup.find_all("a", href=True):
                 href = a["href"]
                 full_url = urljoin(url, href)
                 link_text = a.get_text().lower()

+                # Apply the same filtering criteria
                 url_patterns = [
                     "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                     "/test/", "/download/", "/files/", "/assignments/",
+                    "paper_", "question_", "exam_", "test_", "past_",
+                    "assignment_", "sample_", "study_material", "notes_",
+                    "/resource/", "/subject/", "/course/", "/material/"
                 ]

                 text_patterns = [
                     "exam", "paper", "test", "question", "past", "download",
+                    "assignment", "sample", "study", "material", "notes",
+                    "subject", "course", "resource", "pdf", "document",
+                    "view", "open", "get", "solution", "answer"
                 ]

+                # Check URL and text patterns
                 if any(pattern in full_url.lower() for pattern in url_patterns) or \
                    any(pattern in link_text for pattern in text_patterns) or \
                    any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                     links.add(full_url)

+        # Filter to likely exam documents
         filtered_links = []
         for link in links:
             # Common file extensions

             # Common paths for exam documents
             if any(pattern in link.lower() for pattern in [
                 "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
+                "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
+                "/resource/", "/material/", "/notes/", "/subjectmaterial/"
             ]):
                 filtered_links.append(link)

             return []

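The URL/text pattern check above is written out twice, once for the requests pass and once for the browser pass. A small helper along these lines would keep the two passes in sync; this is a sketch with a hypothetical name and shortened pattern lists, not code from this commit:

    EXAM_URL_PATTERNS = ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", "paper_", "question_"]
    EXAM_TEXT_PATTERNS = ["exam", "paper", "test", "question", "past", "download"]
    DOC_EXTENSIONS = ('.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip')

    def looks_like_exam_resource(full_url, link_text):
        """Return True when a link's URL or anchor text suggests an exam document."""
        url_lower = full_url.lower()
        return (
            any(p in url_lower for p in EXAM_URL_PATTERNS)
            or any(p in link_text.lower() for p in EXAM_TEXT_PATTERNS)
            or url_lower.endswith(DOC_EXTENSIONS)
        )

    # Example: looks_like_exam_resource("https://site.edu/pastexam/math_2020.pdf", "Past paper") -> True
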
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             # Special handling for educational exam sites

                 # Get metadata for PDFs
                 meta = {}
+                if real_url.lower().endswith('.pdf'):
                     try:
                         meta = await self.get_pdf_metadata(real_url)
                     except Exception:

                     'filename': filename,
                     'size': size_str,
                     'metadata': meta,
+                    'source_url': url  # Add source URL for better tracking
                 })

             # If we found exam files with the specialized method, return them
             if found_files:
                 return found_files

+            # Standard extraction method for all pages
+            await self.browser.goto(url, timeout=30000)

             # Get page content
+            content = await self.browser.content()
             soup = BeautifulSoup(content, 'html.parser')

             # Define file extensions to look for

                 # Handle PHP and download links separately
                 if '.php' in href.lower() or 'download' in href.lower():
+                    full_url = href if href.startswith('http') else urljoin(base_url, href)
                     real_url = await self.extract_real_download_url(full_url)
                     if real_url and real_url != full_url:
                         filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'

                 # Check for direct file extensions
                 if any(href.lower().endswith(ext) for ext in all_exts):
+                    file_url = href if href.startswith('http') else urljoin(base_url, href)
                     size_str = await self.get_file_size(file_url)
                     meta = {}
+                    if file_url.lower().endswith('.pdf'):
                         meta = await self.get_pdf_metadata(file_url)
                     found_files.append({
                         'url': file_url,

                             break

                     if file_id:
+                        # Determine if it's a view-only file
                         is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))

                         filename = f"gdrive_{file_id}"

             for elem in soup.find_all(elem_tag):
                 src = elem.get('src') or elem.get('data')
                 if src and any(src.lower().endswith(ext) for ext in all_exts):
+                    file_url = src if src.startswith('http') else urljoin(base_url, src)
                     found_files.append({
                         'url': file_url,
                         'filename': os.path.basename(file_url.split('?')[0]),

             return []

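The Google Drive handling above depends on a file ID pulled out of the link; the extraction loop itself is collapsed in this view, but the same three regular expressions appear verbatim in download_file_task further down. A standalone version of that extraction looks roughly like this (extract_drive_file_id is an illustrative name):

    import re

    DRIVE_ID_PATTERNS = [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']

    def extract_drive_file_id(url):
        """Return the Drive file ID for the common Google Drive URL shapes, or None."""
        for pattern in DRIVE_ID_PATTERNS:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    # extract_drive_file_id("https://drive.google.com/file/d/abc123/view")  -> "abc123"
    # extract_drive_file_id("https://drive.google.com/open?id=abc123")      -> "abc123"
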
     async def download_file(self, file_info, save_dir, referer=None):
+        """Download a file and provide a direct download link"""
         file_url = file_info['url']
         fname = file_info['filename']
         referer = referer or file_info.get('source_url', 'https://www.google.com')

+        # Create unique filename to avoid overwriting
         path = os.path.join(save_dir, fname)
         base, ext = os.path.splitext(fname)
         counter = 1

         try:
             # Special handling for Google Drive files
             if "drive.google.com" in file_url or "docs.google.com" in file_url:
+                # For view-only Google Drive files, use specialized method
                 is_view_only = file_info.get('metadata', {}).get('view_only', False)
                 if is_view_only:
                     result_path = await self.download_viewonly_google_drive(file_info, path)

             return None

     async def download_viewonly_google_drive(self, file_info, save_path):
+        """Download view-only Google Drive documents"""
         try:
             # Extract file ID
             file_id = file_info.get('metadata', {}).get('file_id')


             logger.info(f"Downloading view-only Google Drive file: {file_id}")

+            # Create a dedicated browser session
+            if self.browser_engine == "playwright":
+                from playwright.async_api import async_playwright

+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=True,
+                        args=[
+                            '--no-sandbox',
+                            '--disable-setuid-sandbox',
+                            '--disable-dev-shm-usage',
+                            '--disable-web-security',
+                            '--disable-features=IsolateOrigins,site-per-process',
+                            '--disable-site-isolation-trials',
+                            '--disable-blink-features=AutomationControlled'
+                        ]
+                    )
+
+                    # Create context with options for better handling
+                    context = await browser.new_context(
+                        viewport={'width': 1600, 'height': 1200},
+                        user_agent=get_random_user_agent(),
+                        accept_downloads=True,
+                        ignore_https_errors=True
+                    )
+
+                    # Add stealth script
+                    await context.add_init_script("""
+                        Object.defineProperty(navigator, 'webdriver', { get: () => false });
+                        Object.defineProperty(navigator, 'plugins', {
+                            get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
+                        });
+                        Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
+                        window.chrome = { runtime: {} };
+                    """)

+                    page = await context.new_page()

+                    try:
+                        # Visit the file
+                        await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
+                        await page.wait_for_load_state('networkidle')

+                        # Wait for content to load
+                        await page.wait_for_timeout(5000)

+                        # Create temporary directory for processing
+                        temp_dir = tempfile.mkdtemp()

+                        # For PDF handling
+                        if file_type == 'pdf':
+                            # Create directory for screenshots
+                            screenshots_dir = os.path.join(temp_dir, "screenshots")
+                            os.makedirs(screenshots_dir, exist_ok=True)
+
+                            # Get page count
+                            total_pages = await page.evaluate("""
+                                () => {
+                                    // Look for page counters in the interface
+                                    const pageCounters = document.querySelectorAll('*');
+                                    for (const el of pageCounters) {
+                                        const text = el.textContent || '';
+                                        const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
+                                        if (match && match[2]) {
+                                            return parseInt(match[2]);
+                                        }
+                                    }
+
+                                    // Look for paginated pages
+                                    const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+                                    if (pages.length > 0) return pages.length;
+
+                                    // Default if we can't determine
+                                    return 20;
+                                }
+                            """)
+
+                            logger.info(f"PDF has approximately {total_pages} pages")
+
+                            # Take screenshots of each page
+                            screenshots = []
+
+                            # First try with the page element method
+                            for i in range(min(total_pages, 100)):  # Limit to 100 pages for safety
+                                try:
+                                    # Navigate to specific page
+                                    if i > 0:
+                                        await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()")
+                                        await page.wait_for_timeout(500)
+
+                                    # Wait for the page to render
+                                    await page.wait_for_timeout(500)
+
+                                    # Take screenshot
+                                    screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
+
+                                    # Try to find the page element
+                                    page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})')
+                                    if page_element:
+                                        await page_element.screenshot(path=screenshot_path)
                                     else:
+                                        # Fallback to viewport screenshot
+                                        await page.screenshot(path=screenshot_path)
+
+                                    screenshots.append(screenshot_path)
+
+                                    # Check if we should continue to next page
+                                    if i < total_pages - 1:
+                                        next_button = await page.query_selector('button[aria-label="Next page"]')
+                                        if next_button:
+                                            # Check if button is disabled
+                                            is_disabled = await next_button.get_attribute('disabled')
+                                            if is_disabled:
+                                                logger.info(f"Reached last page at page {i+1}")
+                                                break
+
+                                            # Click next page
+                                            await next_button.click()
+                                            await page.wait_for_timeout(1000)
+                                        else:
+                                            logger.info("Next page button not found")
+                                            break
+                                except Exception as e:
+                                    logger.error(f"Error capturing page {i+1}: {e}")
+                                    continue
+
+                            # Create PDF from screenshots
+                            if screenshots:
+                                # Get dimensions from first screenshot
+                                first_img = Image.open(screenshots[0])
+                                width, height = first_img.size

+                                # Create PDF
+                                c = canvas.Canvas(save_path, pagesize=(width, height))
+                                for screenshot in screenshots:
+                                    c.drawImage(screenshot, 0, 0, width, height)
+                                    c.showPage()
+                                c.save()

+                                # Clean up screenshots
+                                for screenshot in screenshots:
+                                    os.remove(screenshot)

+                                # Clean up temp directory
+                                shutil.rmtree(temp_dir, ignore_errors=True)
+
+                                return save_path
+                            else:
+                                logger.error("No screenshots captured")
+                        else:
+                            # For non-PDF files, just take a screenshot
+                            screenshot_path = os.path.join(temp_dir, "file.png")
+                            await page.screenshot(path=screenshot_path)

+                            # Copy to destination
+                            shutil.copy(screenshot_path, save_path)

+                            # Clean up
+                            os.remove(screenshot_path)
                             shutil.rmtree(temp_dir, ignore_errors=True)

                             return save_path
+                    finally:
+                        await browser.close()
+            elif self.browser_engine == "pyppeteer":
+                # Similar implementation for Pyppeteer
+                pass

             return None
         except Exception as e:

             return None

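The screenshots-to-PDF step above uses PIL only to read the first image's pixel size and reportlab to draw one image per page. Pulled out on its own it can be exercised without a browser; a short sketch with a hypothetical images_to_pdf name:

    from PIL import Image
    from reportlab.pdfgen import canvas

    def images_to_pdf(image_paths, pdf_path):
        """Stack full-page screenshots into a single PDF, one image per page."""
        if not image_paths:
            return None
        # Use the first screenshot's pixel dimensions as the page size
        width, height = Image.open(image_paths[0]).size
        c = canvas.Canvas(pdf_path, pagesize=(width, height))
        for path in image_paths:
            c.drawImage(path, 0, 0, width, height)
            c.showPage()
        c.save()
        return pdf_path
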
     async def get_sublinks(self, url, limit=10000):
+        """Extract all sublinks from a website"""
         links = set()
         try:
             logger.info(f"Extracting sublinks from {url}")

                 logger.info(f"Found {len(links)} sublinks with specialized method")
                 return list(links)[:limit]

+            # Standard link extraction for all sites
+            await self.browser.goto(url, timeout=30000)

             # Get page content
+            content = await self.browser.content()
             soup = BeautifulSoup(content, 'html.parser')

+            # Get base URL for resolving relative links
             parsed_base = urlparse(url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

+            # Extract all links from the page
             for a in soup.find_all('a', href=True):
                 href = a['href']
                 if href and not href.startswith('javascript:') and not href.startswith('#'):

             logger.error(f"Error extracting sublinks: {e}")
             return list(links)[:limit]

+    @celery_app.task
+    def download_file_task(file_info, save_dir, referer=None):
+        """Celery task for downloading files asynchronously"""
+        # This function runs in a separate worker process
+        file_url = file_info['url']
+        fname = file_info['filename']
+        referer = referer or file_info.get('source_url', 'https://www.google.com')
+
+        # Create unique filename
+        path = os.path.join(save_dir, fname)
+        base, ext = os.path.splitext(fname)
+        counter = 1
+        while os.path.exists(path):
+            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+            counter += 1
+
+        os.makedirs(save_dir, exist_ok=True)
+
+        try:
+            # Handle Google Drive files
+            if "drive.google.com" in file_url or "docs.google.com" in file_url:
+                # Extract file ID
+                file_id = None
+                for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                    match = re.search(pattern, file_url)
+                    if match:
+                        file_id = match.group(1)
+                        break
+
+                if file_id:
+                    # Try direct download
+                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
+                    headers = {
+                        'User-Agent': get_random_user_agent(),
+                        'Referer': referer
+                    }
+
+                    with requests.get(download_url, headers=headers, stream=True) as r:
+                        if r.status_code == 200:
+                            with open(path, 'wb') as f:
+                                for chunk in r.iter_content(chunk_size=8192):
+                                    f.write(chunk)
+
+                            # Check if this is HTML (common for Google Drive restrictions)
+                            with open(path, 'rb') as f:
+                                content_start = f.read(100).decode('utf-8', errors='ignore')
+                                if '<html' in content_start.lower():
+                                    os.remove(path)
+                                    return {'status': 'error', 'message': 'Received HTML instead of file'}
+
+                            return {'status': 'success', 'path': path}
+
+            # Standard download for regular files
+            headers = {
+                'User-Agent': get_random_user_agent(),
+                'Referer': referer,
+                'Accept': '*/*',
+                'Accept-Encoding': 'gzip, deflate, br'
+            }
+
+            with requests.get(file_url, headers=headers, stream=True) as r:
+                if r.status_code == 200:
+                    with open(path, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            f.write(chunk)
+
+                    return {'status': 'success', 'path': path}
+                else:
+                    return {'status': 'error', 'message': f"HTTP error: {r.status_code}"}
+
+        except Exception as e:
+            return {'status': 'error', 'message': str(e)}
+
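download_file_task is registered against celery_app, whose definition sits outside this hunk. One plausible setup, assuming a Redis broker; the real configuration in app.py may differ:

    from celery import Celery

    celery_app = Celery(
        "downloader",
        broker="redis://localhost:6379/0",   # assumed broker URL
        backend="redis://localhost:6379/0",  # assumed result backend
    )
    celery_app.conf.update(task_serializer="json", result_serializer="json", accept_content=["json"])

    # A worker process would then be started separately, e.g.:
    #   celery -A app.celery_app worker --loglevel=info

Note that the task above is indented as if it lives inside the DownloadManager class body; Celery tasks are more commonly declared at module level, which is worth double-checking when reusing this pattern.
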
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
+        """Perform deep search for files on a website and its subpages"""
         if not custom_ext_list:
             custom_ext_list = []

+        # Create progress indicators
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()

         total_links = len(sublinks)
         progress_text.text(f"Found {total_links} sublinks to process")

+        # Initialize all_files with main_files to ensure they're included
         all_files = main_files.copy()

+        # Process each sublink
+        for i, sublink in enumerate(sublinks, 1):
+            progress = i / max(total_links, 1)  # Avoid division by zero
+            progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+            progress_bar.progress(progress)
+
+            try:
+                # Extract files from sublink
+                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                all_files.extend(sub_files)
+                file_count_text.text(f"Found {len(all_files)} total files")
+            except Exception as e:
+                logger.warning(f"Error processing sublink {sublink}: {e}")

         # Deduplicate files
         seen_urls = set()

         progress_text.empty()
         progress_bar.empty()

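The deduplication body after seen_urls = set() is collapsed in this view; a URL-keyed pass over the collected file-info dicts is the usual shape of that step (illustrative, not necessarily the commit's exact code):

    def deduplicate_files(files):
        """Keep the first occurrence of each URL in a list of file-info dicts."""
        seen_urls = set()
        unique_files = []
        for info in files:
            if info['url'] not in seen_urls:
                seen_urls.add(info['url'])
                unique_files.append(info)
        return unique_files
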
+# -------------------- Main App --------------------
 def main():
     st.title("Advanced File Downloader")

         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
+        st.session_state.google_creds = None
         st.session_state.selected_files = []
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
         st.session_state.download_urls = {}  # For direct download links

+    # Install dependencies if needed
+    if "dependencies_installed" not in st.session_state:
+        with st.spinner("Setting up dependencies. This may take a minute..."):
+            st.session_state.dependencies_installed = setup_dependencies()
+            check_services()

+    # Sidebar options
     with st.sidebar:
+        mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Single File"], key="mode_select")

+        with st.expander("Search Options", expanded=True):
+            search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine")
+            browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine")
             custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
                                               help="Enter extensions like .csv, .txt")
             max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
             sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
+
+        with st.expander("Advanced Options", expanded=False):
             use_proxy = st.checkbox("Use Proxy", key="use_proxy")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
             use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
                                       help="Makes browser harder to detect as automated")
+            enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"],
+                                                   key="enable_intercept",
+                                                   help="Intercept network traffic to find additional files")
+            if enable_network_intercept:
+                NETWORK_INTERCEPTOR_CONFIG["enabled"] = True
+                intercept_types = st.multiselect("Intercept Types",
+                                                 ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"],
+                                                 default=["xhr", "fetch", "document", "media"],
+                                                 key="intercept_types")
+                NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types
+            else:
+                NETWORK_INTERCEPTOR_CONFIG["enabled"] = False

+        with st.expander("Google Drive Integration", expanded=False):
+            if st.button("Start Google Sign-In", key="google_signin_btn"):
+                auth_url = get_google_auth_url()
+                st.markdown(f"[Click here to authorize]({auth_url})")
+            auth_code = st.text_input("Enter authorization code", key="auth_code_input")
+            if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
+                creds, msg = exchange_code_for_credentials(auth_code)
+                st.session_state.google_creds = creds
+                st.write(msg)

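The sidebar only toggles NETWORK_INTERCEPTOR_CONFIG; the code that actually listens to traffic lives elsewhere in the file and is not shown in this hunk. With Playwright, a response listener keyed off the same config keys is one way such interception is typically wired up; a sketch under that assumption, with attach_interceptor as a hypothetical name:

    import os
    from urllib.parse import urlparse

    async def attach_interceptor(page, config, save_dir):
        """Save response bodies whose resource type matches config["intercept_types"]."""
        os.makedirs(save_dir, exist_ok=True)

        async def on_response(response):
            if not config.get("enabled"):
                return
            if response.request.resource_type not in config.get("intercept_types", []):
                return
            try:
                body = await response.body()
            except Exception:
                return  # some responses (e.g. redirects) expose no body
            name = os.path.basename(urlparse(response.url).path) or "response.bin"
            with open(os.path.join(save_dir, name), "wb") as f:
                f.write(body)

        page.on("response", on_response)
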
     # Main content area
     if mode == "Manual URL":
         st.header("Manual URL Mode")
         url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")

+        col1, col2 = st.columns([3, 1])
+        with col1:
+            if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
+                if url:
+                    # Process custom extensions
+                    custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]

+                    with st.spinner("Searching for files..."):
+                        async def run_deep_search():
+                            async with DownloadManager(
+                                browser_engine=browser_engine,
+                                use_proxy=use_proxy,
+                                proxy=proxy,
+                                use_stealth=use_stealth
+                            ) as dm:
+                                files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
+                                return files
+
+                        # Run the search
+                        files = asyncio.run(run_deep_search())
+
+                        if files:
+                            st.session_state.discovered_files = files
+                            st.session_state.current_url = url
+                            st.success(f"Found {len(files)} files!")
+                        else:
+                            st.warning("No files found.")

         # Display and process discovered files
         if st.session_state.discovered_files:

                 file_info = f"{filename} ({size})"

                 file_options.append((i, file_info))
+
+                # Generate direct download URL for this file
+                if i not in st.session_state.download_urls:
+                    # Generate a unique key for this file
+                    file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode()
+                    st.session_state.download_urls[i] = file_key

             # File selection multiselect
             selected_indices = st.multiselect(

             st.session_state.selected_files = selected_indices

+            # Display individual files with direct download links
             if files:
                 st.subheader("Available Files")
                 for i, file in enumerate(files):

                         st.write(f"Source: {file.get('source_url', 'Unknown')}")
                         st.write(f"URL: {file['url']}")

+                        # Download button for this specific file
+                        if st.button("Download this file", key=f"download_single_{i}"):
                             with st.spinner(f"Downloading {file['filename']}..."):
                                 # Create downloads directory
                                 download_dir = "./downloads"

                                 # Download the file
                                 async def download_single():
                                     async with DownloadManager(
+                                        browser_engine=browser_engine,
                                         use_proxy=use_proxy,
                                         proxy=proxy,
                                         use_stealth=use_stealth

             if selected_indices:
                 st.subheader("Batch Download Options")

+                col1, col2, col3, col4 = st.columns(4)
                 with col1:
                     download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
                 with col2:
                     create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
                 with col3:
                     delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
+                with col4:
+                    upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")

                 if st.button("Download Selected Files", key="batch_download_btn"):
                     with st.spinner(f"Downloading {len(selected_indices)} files..."):

                         async def download_batch():
                             async with DownloadManager(
+                                browser_engine=browser_engine,
                                 use_proxy=use_proxy,
                                 proxy=proxy,
                                 use_stealth=use_stealth

                                 key="download_zip_btn"
                             )

+                            # Upload to Google Drive if requested
+                            if upload_to_drive and st.session_state.google_creds:
+                                with st.spinner("Uploading to Google Drive..."):
+                                    drive_service = googleapiclient.discovery.build(
+                                        "drive", "v3", credentials=st.session_state.google_creds
+                                    )
+                                    folder_id = create_drive_folder(
+                                        drive_service, f"Downloads_{get_domain(url)}"
+                                    )
+                                    drive_id = google_drive_upload(
+                                        zip_path, st.session_state.google_creds, folder_id
+                                    )
+
+                                    if isinstance(drive_id, str) and not drive_id.startswith("Error"):
+                                        st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
+                                    else:
+                                        st.error(drive_id)
+
                         # Delete original files if requested
                         if delete_after:
                             for path in downloaded_paths:

         if st.button("Search", key="web_search_btn"):
             if query:
+                with st.spinner("Searching the web..."):
                     async def run_search():
                         async with DownloadManager(
+                            browser_engine=browser_engine,
                             use_proxy=use_proxy,
                             proxy=proxy,
                             query=query,
                             num_results=num_results,
                             use_stealth=use_stealth
                         ) as dm:
+                            urls = await dm.search_web(search_engine)
                             return urls

                     urls = asyncio.run(run_search())

                     with st.spinner("Searching for files..."):
                         async def deep_search_result():
                             async with DownloadManager(
+                                browser_engine=browser_engine,
                                 use_proxy=use_proxy,
                                 proxy=proxy,
                                 use_stealth=use_stealth

             else:
                 st.warning("No files found on this page.")

+    elif mode == "Single File":
+        st.header("Single File Download")

         # View-only Google Drive download
+        with st.expander("Download View-Only Google Drive Document", expanded=True):
+            st.write("Download protected/view-only Google Drive documents")
+
+            file_id = st.text_input(
+                "Google Drive File ID",
+                placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
+                key="drive_file_id"
+            )
+
+            if st.button("Download Document", key="drive_download_btn") and file_id:
+                with st.spinner("Downloading view-only document... (this may take a minute)"):
+                    # Create download directory
+                    download_dir = "./downloads"
+                    os.makedirs(download_dir, exist_ok=True)
+
+                    # Set output path
+                    output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
+
+                    # Download the file
+                    async def download_drive_file():
+                        async with DownloadManager(
+                            browser_engine=browser_engine,
+                            use_proxy=use_proxy,
+                            proxy=proxy,
+                            use_stealth=use_stealth
+                        ) as dm:
+                            file_info = {
+                                'url': f"https://drive.google.com/file/d/{file_id}/view",
+                                'filename': f"gdrive_{file_id}.pdf",
+                                'metadata': {'file_id': file_id, 'view_only': True}
+                            }
+                            return await dm.download_viewonly_google_drive(file_info, output_path)
+
+                    result_path = asyncio.run(download_drive_file())
+
+                    if result_path:
+                        st.success("Document downloaded successfully!")
+
+                        # Provide download link
+                        with open(result_path, "rb") as f:
+                            file_bytes = f.read()
+
+                        st.download_button(
+                            label="Download PDF",
+                            data=file_bytes,
+                            file_name=os.path.basename(result_path),
+                            mime="application/pdf",
+                            key="drive_pdf_download"
+                        )
+                    else:
+                        st.error("Failed to download the document. Please check the file ID and try again.")

+        # Direct URL download
+        with st.expander("Download from Direct URL", expanded=True):
+            st.write("Download a file from a direct URL")
+
+            file_url = st.text_input(
+                "File URL",
+                placeholder="https://example.com/file.pdf",
+                key="direct_url"
+            )
+
+            file_name = st.text_input(
+                "Save as (optional)",
+                placeholder="Leave blank to use original filename",
+                key="save_filename"
+            )
+
+            if st.button("Download File", key="direct_download_btn") and file_url:
+                with st.spinner("Downloading file..."):
+                    # Create download directory
+                    download_dir = "./downloads"
+                    os.makedirs(download_dir, exist_ok=True)
+
+                    # Determine filename
+                    if not file_name:
+                        file_name = os.path.basename(urlparse(file_url).path)
+                        if not file_name or file_name == '/':
+                            file_name = f"downloaded_file_{int(time.time())}{get_file_extension(file_url)}"
+
+                    # Create file info
+                    file_info = {
+                        'url': file_url,
+                        'filename': file_name,
+                        'metadata': {}
+                    }
+
+                    # Download the file
+                    async def download_direct_file():
+                        async with DownloadManager(
+                            browser_engine=browser_engine,
+                            use_proxy=use_proxy,
+                            proxy=proxy,
+                            use_stealth=use_stealth
+                        ) as dm:
+                            return await dm.download_file(file_info, download_dir)
+
+                    file_path = asyncio.run(download_direct_file())
+
+                    if file_path:
+                        st.success(f"File downloaded successfully to {file_path}")
+
+                        # Provide download link
+                        with open(file_path, "rb") as f:
+                            file_bytes = f.read()
+
+                        mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
+
+                        st.download_button(
+                            label=f"Download {os.path.basename(file_path)}",
+                            data=file_bytes,
+                            file_name=os.path.basename(file_path),
+                            mime=mime_type,
+                            key="direct_file_download"
+                        )
+                    else:
+                        st.error("Failed to download the file. Please check the URL and try again.")

     # Footer
     st.markdown("---")
+    st.markdown("Created by [Euler314](https://github.com/euler314) | Enhanced with advanced scraping technologies")

 # Run the app
 if __name__ == "__main__":