euler314 committed on
Commit 52f13e6 · verified · 1 Parent(s): baece32

Update app.py

Files changed (1)
  1. app.py +1212 -129
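For orientation: this commit extends DownloadManager with stealth browsing, captcha detection, and optional proxy rotation. A minimal usage sketch under stated assumptions — app.py importable as a module, and the query string, extension list, and downloads/ directory are illustrative placeholders; only the constructor flags and method names are taken from the diff below:

import asyncio
import os
from app import DownloadManager  # assumption: app.py can be imported without running the Streamlit UI

async def demo():
    os.makedirs("downloads", exist_ok=True)
    # use_stealth and proxy_rotation are the flags introduced by this commit.
    async with DownloadManager(use_proxy=False, proxy=None,
                               query="past exam papers", num_results=5,
                               use_stealth=True, proxy_rotation=False) as dm:
        urls = await dm.search_bing()                 # Bing search with captcha checks
        if urls:
            files = await dm.extract_downloadable_files(urls[0], ['.pdf'])
            if files:
                await dm.download_file(files[0], "downloads", referer=urls[0])

if __name__ == "__main__":
    asyncio.run(demo())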
app.py CHANGED
@@ -22,6 +22,8 @@ import datetime
22
  import traceback
23
  import base64
24
  import shutil
 
 
25
  from PIL import Image
26
  from reportlab.lib.pagesizes import letter
27
  from reportlab.pdfgen import canvas
@@ -50,14 +52,44 @@ GOOGLE_OAUTH_CONFIG = {
50
  }
51
  }
52
 
53
  # -------------------- Utility Functions --------------------
54
  def get_random_user_agent():
55
- USER_AGENTS = [
56
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
57
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
58
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
59
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
60
- ]
61
  return random.choice(USER_AGENTS)
62
 
63
  def sizeof_fmt(num, suffix='B'):
@@ -75,6 +107,42 @@ def create_zip_file(file_paths, output_dir):
75
  zipf.write(file_path, os.path.basename(file_path))
76
  return zip_path
77
 
78
  # -------------------- Google Drive Functions --------------------
79
  def get_google_auth_url():
80
  client_config = GOOGLE_OAUTH_CONFIG["web"]
@@ -153,7 +221,7 @@ def install_playwright_dependencies():
153
 
154
  # -------------------- Download Manager Class --------------------
155
  class DownloadManager:
156
- def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
157
  self.use_proxy = use_proxy
158
  self.proxy = proxy
159
  self.query = query
@@ -162,30 +230,140 @@ class DownloadManager:
162
  self.browser = None
163
  self.context = None
164
  self.page = None
165
 
166
  async def __aenter__(self):
167
  self.playwright = await async_playwright().start()
168
  opts = {
169
  "headless": True,
170
- "args": [
171
- '--no-sandbox',
172
- '--disable-setuid-sandbox',
173
- '--disable-dev-shm-usage',
174
- '--disable-gpu',
175
- '--no-zygote',
176
- '--single-process'
177
- ]
178
  }
 
 
179
  if self.use_proxy and self.proxy:
180
  opts["proxy"] = {"server": self.proxy}
 
 
181
  self.browser = await self.playwright.chromium.launch(**opts)
182
- self.context = await self.browser.new_context(user_agent=get_random_user_agent())
183
  self.page = await self.context.new_page()
184
  await self.page.set_extra_http_headers({
185
- 'Accept-Language': 'en-US,en;q=0.9',
186
  'Accept-Encoding': 'gzip, deflate, br',
187
- 'Referer': 'https://www.bing.com/'
188
  })
189
  return self
190
 
191
  async def __aexit__(self, exc_type, exc_val, exc_tb):
@@ -194,17 +372,140 @@ class DownloadManager:
194
  if self.playwright:
195
  await self.playwright.stop()
196
 
197
  async def search_bing(self):
198
  urls = []
199
  try:
 
 
 
200
  search_url = f"https://www.bing.com/search?q={self.query}"
201
  await self.page.goto(search_url, timeout=30000)
202
  await self.page.wait_for_load_state('networkidle')
203
  links = await self.page.query_selector_all("li.b_algo h2 a")
204
  for link in links[:self.num_results]:
205
  href = await link.get_attribute('href')
206
  if href:
207
  urls.append(href)
208
  return urls
209
  except Exception as e:
210
  logger.error(f"Error searching Bing: {e}")
@@ -212,6 +513,8 @@ class DownloadManager:
212
 
213
  async def get_file_size(self, url):
214
  try:
 
 
215
  async with self.context.new_page() as page:
216
  response = await page.request.head(url, timeout=15000)
217
  length = response.headers.get('Content-Length', None)
@@ -219,11 +522,14 @@ class DownloadManager:
219
  return sizeof_fmt(int(length))
220
  else:
221
  return "Unknown Size"
222
- except Exception:
 
223
  return "Unknown Size"
224
 
225
  async def get_pdf_metadata(self, url):
226
  try:
 
 
227
  async with self.context.new_page() as page:
228
  resp = await page.request.get(url, timeout=15000)
229
  if resp.ok:
@@ -237,11 +543,14 @@ class DownloadManager:
237
  }
238
  else:
239
  return {}
240
- except Exception:
 
241
  return {}
242
 
243
  async def extract_real_download_url(self, url):
244
  try:
 
 
245
  async with self.context.new_page() as page:
246
  response = await page.goto(url, wait_until='networkidle', timeout=30000)
247
  if response and response.headers.get('location'):
@@ -258,8 +567,15 @@ class DownloadManager:
258
  logger.info(f"Fetching exam links from {url}")
259
  links = set()
260
 
261
- # Use requests for a faster initial scan
262
- headers = {"User-Agent": get_random_user_agent()}
263
  try:
264
  response = requests.get(url, headers=headers, timeout=30)
265
 
@@ -274,77 +590,195 @@ class DownloadManager:
274
  href = a["href"]
275
  full_url = urljoin(url, href)
276
 
277
- # Special patterns for exam sites
278
- for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
279
- "/test/", "/download/", "/files/", "/assignments/",
280
- "paper_", "question_", "exam_", "test_", "past_"]:
281
- if pattern in full_url.lower():
282
- links.add(full_url)
283
- break
284
  except Exception as e:
285
  logger.warning(f"Request-based extraction failed: {e}")
286
 
287
- # If we didn't find many links with direct approach, use Playwright for more thorough extraction
288
- if len(links) < 5:
289
- logger.info("Using browser for enhanced link extraction")
290
- try:
291
- await self.page.goto(url, timeout=30000, wait_until='networkidle')
 
 
 
292
 
293
- # Extract all links with Playwright
294
- page_links = await self.page.evaluate("""
295
  () => {
296
- const links = [];
 
 
297
  const anchors = document.querySelectorAll('a[href]');
298
  for (const a of anchors) {
299
  if (a.href) {
300
- links.push({
301
  href: a.href,
302
- text: a.innerText || a.textContent || ''
 
303
  });
304
  }
305
  }
306
- return links;
307
  }
308
  """)
309
 
310
- # Process extracted links
311
- for link_info in page_links:
312
  href = link_info.get('href', '')
313
  text = link_info.get('text', '').lower()
314
 
315
- if href:
316
- # Check for exam-related patterns in URL or link text
317
- url_patterns = ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
318
- "/test/", "/download/", "/files/", "/assignments/",
319
- "paper_", "question_", "exam_", "test_", "past_"]
 
 
 
320
 
321
- text_patterns = ["exam", "paper", "test", "question", "past", "download"]
322
 
323
  if any(pattern in href.lower() for pattern in url_patterns) or \
324
- any(pattern in text for pattern in text_patterns):
 
 
325
  links.add(href)
326
 
327
  # Check for ASP.NET specific elements that might contain exam links
328
- grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
329
  for grid in grid_elements:
330
  grid_links = await grid.query_selector_all('a[href]')
331
  for a in grid_links:
332
  href = await a.get_attribute('href')
 
 
333
  if href:
334
  full_url = href if href.startswith('http') else urljoin(url, href)
335
  links.add(full_url)
336
 
337
- # Try clicking any controls that might reveal more exam links
338
- buttons = await self.page.query_selector_all('input[type="button"], button')
339
- for button in buttons:
340
- button_text = await button.text_content() or ""
341
- button_value = await button.get_attribute("value") or ""
342
- if any(keyword in (button_text + button_value).lower() for keyword in
343
- ["show", "view", "display", "list", "exam", "paper", "test"]):
344
  try:
 
345
  await button.click()
346
- await self.page.wait_for_timeout(1000)
347
- await self.page.wait_for_load_state('networkidle', timeout=5000)
348
 
349
  # Get any new links that appeared
350
  new_links = await self.page.query_selector_all('a[href]')
@@ -352,24 +786,67 @@ class DownloadManager:
352
  href = await a.get_attribute('href')
353
  if href:
354
  full_url = href if href.startswith('http') else urljoin(url, href)
355
- links.add(full_url)
356
  except Exception as e:
357
  logger.warning(f"Error clicking button: {e}")
358
  except Exception as e:
359
- logger.error(f"Browser-based extraction failed: {e}")
 
 
 
360
 
361
  # Filter links to likely contain exam documents
362
  filtered_links = []
363
  for link in links:
364
  # Common file extensions for exam documents
365
- if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']):
366
  filtered_links.append(link)
367
  continue
368
 
369
  # Common paths for exam documents
370
  if any(pattern in link.lower() for pattern in [
371
  "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
372
- "/pastpapers/", "/questionpapers/", "/tests/"
 
373
  ]):
374
  filtered_links.append(link)
375
 
@@ -383,6 +860,9 @@ class DownloadManager:
383
  async def extract_downloadable_files(self, url, custom_ext_list):
384
  found_files = []
385
  try:
 
 
 
386
  # Special handling for educational exam sites
387
  if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
388
  ["exam", "test", "pastpaper", "eduexp"]):
@@ -403,6 +883,12 @@ class DownloadManager:
403
  except Exception:
404
  pass
405
 
406
  # Get file size
407
  size_str = await self.get_file_size(real_url)
408
 
@@ -429,14 +915,55 @@ class DownloadManager:
429
  response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
430
  if not response:
431
  return []
432
 
433
  final_url = self.page.url
434
  if '.php' in final_url or 'download' in final_url:
435
  real_url = await self.extract_real_download_url(final_url)
436
  if real_url != final_url:
437
  found_files.append({
438
  'url': real_url,
439
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
440
  'size': await self.get_file_size(real_url),
441
  'metadata': {}
442
  })
@@ -549,15 +1076,118 @@ class DownloadManager:
549
  'metadata': meta
550
  })
551
552
  seen_urls = set()
553
  unique_files = []
554
  for f in found_files:
555
  if f['url'] not in seen_urls:
556
  seen_urls.add(f['url'])
557
  unique_files.append(f)
 
558
  return unique_files
559
  except Exception as e:
560
  logger.error(f"Error extracting files from {url}: {e}")
 
561
  return []
562
 
563
  async def download_file(self, file_info, save_dir, referer):
@@ -596,23 +1226,85 @@ class DownloadManager:
596
  logger.warning("All standard methods failed, attempting force download")
597
  result_path = await self.force_download_viewonly(file_info, path)
598
  return result_path if result_path else None
599
 
600
- # Original code for non-Google Drive downloads
601
  async with self.context.new_page() as page:
602
  headers = {
603
  'Accept': '*/*',
604
  'Accept-Encoding': 'gzip, deflate, br',
605
  'Referer': referer
606
  }
607
- response = await page.request.get(file_url, headers=headers, timeout=30000)
608
- if response.status == 200:
609
- content = await response.body()
610
- with open(path, 'wb') as f:
611
- f.write(content)
612
- return path
613
- else:
614
- logger.error(f"Download failed with status {response.status}: {file_url}")
615
- return None
616
  except Exception as e:
617
  logger.error(f"Error downloading {file_url}: {e}")
618
  return None
@@ -642,17 +1334,20 @@ class DownloadManager:
642
 
643
  logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
644
 
645
- # Create a dedicated browser instance with better resolution
646
  browser = await self.playwright.chromium.launch(
647
  headless=True,
648
- args=[
649
- '--no-sandbox',
650
- '--disable-setuid-sandbox',
651
- '--disable-dev-shm-usage',
652
- '--disable-web-security',
653
- '--disable-features=IsolateOrigins,site-per-process',
654
- '--disable-site-isolation-trials'
655
- ]
656
  )
657
 
658
  # Use higher resolution for better quality
@@ -663,6 +1358,34 @@ class DownloadManager:
663
  accept_downloads=True # Critical for the download workflow
664
  )
665
 
666
  page = await context.new_page()
667
 
668
  try:
@@ -670,7 +1393,14 @@ class DownloadManager:
670
  logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
671
  await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
672
  await page.wait_for_load_state('networkidle')
673
- await page.wait_for_timeout(5000) # Wait longer for everything to load
674
 
675
  # Create temp directory
676
  temp_dir = tempfile.mkdtemp()
@@ -679,7 +1409,11 @@ class DownloadManager:
679
  if file_type.lower() == 'pdf':
680
  # Use the improved scrolling and detection approach
681
 
682
- # Check if there's a pagination control to estimate pages
683
  estimated_pages = await page.evaluate("""
684
  () => {
685
  // Method 1: Check page counter text
@@ -709,14 +1443,13 @@ class DownloadManager:
709
 
710
  logger.info(f"Estimated {estimated_pages} pages in PDF")
711
 
712
- # Scroll to ensure all pages are loaded
713
- logger.info("Scrolling to load all PDF pages...")
714
-
715
- # Initial scroll to bottom to trigger lazy loading
716
  await page.keyboard.press("End")
717
  await page.wait_for_timeout(3000)
718
 
719
  # Scroll page by page to ensure all pages are loaded
 
720
  max_attempts = min(estimated_pages * 3, 300)
721
  attempt = 0
722
  prev_blob_count = 0
@@ -734,8 +1467,19 @@ class DownloadManager:
734
  logger.info("All pages appear to be loaded.")
735
  break
736
 
737
- await page.keyboard.press("PageDown")
738
- await page.wait_for_timeout(2000)
739
  prev_blob_count = blob_count
740
  attempt += 1
741
 
@@ -801,6 +1545,72 @@ class DownloadManager:
801
 
802
  if not result.get('success', False):
803
  logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
804
  return None
805
 
806
  logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
@@ -902,6 +1712,37 @@ class DownloadManager:
902
 
903
  # Try standard approaches for non-view-only files
904
  try:
905
  # Try with requests and session cookies
906
  session = requests.Session()
907
  session.headers.update({'User-Agent': get_random_user_agent()})
@@ -944,37 +1785,111 @@ class DownloadManager:
944
  except Exception as e:
945
  logger.warning(f"Requests session download failed: {e}")
946
 
947
- logger.warning("Standard download methods failed")
948
  return False
949
 
950
  async def download_viewonly_pdf_with_js(self, file_id, save_path):
951
  """Download view-only PDF using the enhanced blob image caching technique"""
952
  try:
953
- # Create a dedicated browser instance
954
  browser = await self.playwright.chromium.launch(
955
  headless=True,
956
- args=[
957
- '--no-sandbox',
958
- '--disable-setuid-sandbox',
959
- '--disable-dev-shm-usage',
960
- '--disable-web-security'
961
- ]
962
  )
963
 
 
964
  context = await browser.new_context(
965
  viewport={'width': 1600, 'height': 1200},
966
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
967
- accept_downloads=True # Critical for handling the download event
 
968
  )
969
 
970
  page = await context.new_page()
971
 
972
  try:
973
- # Step 1: Navigate to the file
974
  logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
975
  await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
976
  await page.wait_for_load_state('networkidle')
977
- await page.wait_for_timeout(5000) # Initial wait for content to load
 
 
 
978
 
979
  # Step 2: Estimate the number of pages
980
  estimated_pages = await page.evaluate("""
@@ -1007,11 +1922,12 @@ class DownloadManager:
1007
  await page.keyboard.press("End")
1008
  await page.wait_for_timeout(3000)
1009
 
1010
- # Step 4: Wait for all pages to load by pressing PageDown and checking blob images
1011
- logger.info("Waiting for all pages to load...")
1012
- max_attempts = min(estimated_pages * 3, 300) # Adjust based on document size
1013
  attempt = 0
1014
  prev_blob_count = 0
 
1015
 
1016
  while attempt < max_attempts:
1017
  # Count blob images (which are the PDF pages)
@@ -1023,14 +1939,40 @@ class DownloadManager:
1023
 
1024
  logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
1025
 
1026
- # If we've loaded enough pages or reached estimated count
1027
- if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
1028
- logger.info("All pages appear to be loaded.")
1029
  break
1030
 
1031
- # Press PageDown to scroll further and trigger more loading
1032
- await page.keyboard.press("PageDown")
1033
- await page.wait_for_timeout(2000) # Wait for content to load
1034
  prev_blob_count = blob_count
1035
  attempt += 1
1036
 
@@ -1050,10 +1992,9 @@ class DownloadManager:
1050
  try {
1051
  let pdf = new jsPDF();
1052
  let imgs = document.getElementsByTagName("img");
1053
- let added = 0;
1054
-
1055
- // First collect and sort all valid blob images
1056
  let validImages = [];
 
 
1057
  for (let i = 0; i < imgs.length; i++) {
1058
  let img = imgs[i];
1059
  if (!/^blob:/.test(img.src)) continue;
@@ -1061,7 +2002,7 @@ class DownloadManager:
1061
  validImages.push(img);
1062
  }
1063
 
1064
- // Sort by vertical position
1065
  validImages.sort((a, b) => {
1066
  const rectA = a.getBoundingClientRect();
1067
  const rectB = b.getBoundingClientRect();
@@ -1070,6 +2011,7 @@ class DownloadManager:
1070
 
1071
  console.log(`Found ${validImages.length} valid page images to add to PDF`);
1072
 
 
1073
  // Process each image as a page
1074
  for (let i = 0; i < validImages.length; i++) {
1075
  let img = validImages[i];
@@ -1384,6 +2326,9 @@ class DownloadManager:
1384
  logger.info(f"Found {len(links)} sublinks with specialized method")
1385
  return list(links)[:limit]
1386
 
 
 
 
1387
  # Standard sublink extraction for all sites
1388
  await self.page.goto(url, timeout=30000, wait_until='networkidle')
1389
 
@@ -1392,6 +2337,23 @@ class DownloadManager:
1392
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1393
  path_base = os.path.dirname(parsed_base.path)
1394
 
1395
  # Check if page has ASP.NET elements which might need special handling
1396
  is_aspnet = await self.page.evaluate('''
1397
  () => {
@@ -1514,6 +2476,60 @@ class DownloadManager:
1514
  except Exception as e:
1515
  logger.warning(f"Error with postback: {e}")
1516
 
1517
  logger.info(f"Found {len(links)} sublinks")
1518
  return list(links)[:limit]
1519
 
@@ -1578,6 +2594,19 @@ class DownloadManager:
1578
  links_set.add(full_url)
1579
  except Exception:
1580
  pass
1581
 
1582
  def resolve_relative_url(self, relative_url, base_url, path_base):
1583
  """Properly resolve relative URLs considering multiple formats"""
@@ -1628,12 +2657,14 @@ class DownloadManager:
1628
  total_links = len(sublinks)
1629
  progress_text.text(f"Found {total_links} sublinks to process")
1630
 
 
 
 
1631
  if not sublinks:
1632
  progress_bar.progress(1.0)
1633
- return main_files
1634
 
1635
  # Process each sublink
1636
- all_files = main_files
1637
  for i, sublink in enumerate(sublinks, 1):
1638
  progress = i / total_links
1639
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
@@ -1703,6 +2734,7 @@ def main():
1703
  sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
1704
  use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
1705
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
 
1706
 
1707
  with st.expander("Google Drive Integration", expanded=False):
1708
  if st.button("Start Google Sign-In", key="google_signin_btn"):
@@ -1713,6 +2745,37 @@ def main():
1713
  creds, msg = exchange_code_for_credentials(auth_code)
1714
  st.session_state.google_creds = creds
1715
  st.write(msg)
1716
 
1717
  if mode == "Manual URL":
1718
  st.header("Manual URL Mode")
@@ -1727,16 +2790,20 @@ def main():
1727
  st.warning("Invalid extensions ignored. Use format like '.csv'.")
1728
 
1729
  @st.cache_resource
1730
- def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
1731
  async def _run():
1732
- async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
1733
  files = await dm.deep_search(url, ext_list, max_links, timeout_val)
1734
  return files
1735
  return asyncio.run(_run())
1736
 
1737
  with st.spinner("Searching for files..."):
1738
  files = run_deep_search(url, valid_ext_list, max_sublinks,
1739
- sublink_timeout, use_proxy, proxy)
1740
 
1741
  if files:
1742
  st.session_state.discovered_files = files
@@ -1799,7 +2866,11 @@ def main():
1799
  progress_bar = st.progress(0)
1800
  status_text = st.empty()
1801
 
1802
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
1803
  for i, idx in enumerate(selected_indices):
1804
  progress = (i + 1) / len(selected_indices)
1805
  file_info = files[idx]
@@ -1880,7 +2951,13 @@ def main():
1880
  if st.button("Search", key="search_btn"):
1881
  if query:
1882
  async def run_search():
1883
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, query=query, num_results=num_results) as dm:
1884
  with st.spinner("Searching..."):
1885
  urls = await dm.search_bing()
1886
  if urls:
@@ -1911,16 +2988,20 @@ def main():
1911
  valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
1912
 
1913
  @st.cache_resource
1914
- def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
1915
  async def _run():
1916
- async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
1917
  files = await dm.deep_search(url, ext_list, max_links, timeout_val)
1918
  return files
1919
  return asyncio.run(_run())
1920
 
1921
  with st.spinner("Searching for files..."):
1922
  files = run_deep_search(url, valid_ext_list, max_sublinks,
1923
- sublink_timeout, use_proxy, proxy)
1924
 
1925
  if files:
1926
  st.session_state.discovered_files = files
@@ -1944,7 +3025,7 @@ def main():
1944
 
1945
  with st.spinner("Downloading view-only document... (this may take a minute)"):
1946
  async def download_viewonly():
1947
- async with DownloadManager() as dm:
1948
  file_info = {
1949
  'url': f"https://drive.google.com/file/d/{file_id}/view",
1950
  'filename': f"gdrive_{file_id}.pdf",
@@ -1957,13 +3038,15 @@ def main():
1957
 
1958
  if result:
1959
  st.success("Document downloaded successfully!")
 
 
1960
  with open(result, "rb") as f:
1961
  file_bytes = f.read()
1962
-
1963
  st.download_button(
1964
  label="Download PDF",
1965
  data=file_bytes,
1966
- file_name=os.path.basename(result),
1967
  mime="application/pdf"
1968
  )
1969
  else:
@@ -1971,7 +3054,7 @@ def main():
1971
 
1972
  # Add footer with attribution
1973
  st.markdown('---')
1974
- st.markdown('Created by [Euler314](https://github.com/yu314-coder)')
1975
 
1976
  if __name__ == "__main__":
1977
  main()
 
22
  import traceback
23
  import base64
24
  import shutil
25
+ import json
26
+ import time
27
  from PIL import Image
28
  from reportlab.lib.pagesizes import letter
29
  from reportlab.pdfgen import canvas
 
52
  }
53
  }
54
 
55
+ # -------------------- Stealth and UA Settings --------------------
56
+ # Extended user agent list for better variety
57
+ USER_AGENTS = [
58
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
59
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
60
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
61
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
62
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
63
+ 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
64
+ 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
65
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
66
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0'
67
+ ]
68
+
69
+ # Stealth browser settings
70
+ STEALTH_SETTINGS = {
71
+ # Hardware features to modify/disable
72
+ "hardware_concurrency": 4,
73
+ "device_memory": 8,
74
+ # Browser features to enable/disable
75
+ "webgl_vendor": "Google Inc. (Intel)",
76
+ "webgl_renderer": "Intel Iris OpenGL Engine",
77
+ "languages": ["en-US", "en"],
78
+ "disable_webrtc": True,
79
+ # Additional timing randomization
80
+ "navigator_platform": "Win32",
81
+ "touch_support": False
82
+ }
83
+
84
+ # Proxy rotation configuration (if using multiple proxies)
85
+ PROXY_ROTATION_CONFIG = {
86
+ "enabled": False, # Set to True to enable rotation
87
+ "rotation_interval": 10, # Rotate every 10 requests
88
+ "proxies": [] # Will be populated from the UI if needed
89
+ }
90
+
91
  # -------------------- Utility Functions --------------------
92
  def get_random_user_agent():
93
  return random.choice(USER_AGENTS)
94
 
95
  def sizeof_fmt(num, suffix='B'):
 
107
  zipf.write(file_path, os.path.basename(file_path))
108
  return zip_path
109
 
110
+ def get_file_extension(url, default='.pdf'):
111
+ """Extract file extension from URL or filename"""
112
+ path = urlparse(url).path
113
+ ext = os.path.splitext(path)[1].lower()
114
+ if not ext:
115
+ return default
116
+ return ext
117
+
118
+ def humanize_file_size(size_bytes):
119
+ """Format file size in human-readable format"""
120
+ if size_bytes < 1024:
121
+ return f"{size_bytes} bytes"
122
+ for unit in ['KB', 'MB', 'GB', 'TB']:
123
+ size_bytes /= 1024.0
124
+ if size_bytes < 1024.0:
125
+ return f"{size_bytes:.1f} {unit}"
126
+ return f"{size_bytes:.1f} PB"
127
+
128
+ def get_domain(url):
129
+ """Extract domain from URL"""
130
+ parsed = urlparse(url)
131
+ return parsed.netloc
132
+
133
+ def is_valid_file_url(url, extensions):
134
+ """Check if URL is a valid file URL based on extension"""
135
+ return any(url.lower().endswith(ext) for ext in extensions)
136
+
137
+ def detect_captcha(html_content):
138
+ """Detect common captcha patterns in HTML content"""
139
+ captcha_patterns = [
140
+ 'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
141
+ 'challenge', 'solve the following', 'verify you are human'
142
+ ]
143
+ html_lower = html_content.lower()
144
+ return any(pattern in html_lower for pattern in captcha_patterns)
145
+
146
  # -------------------- Google Drive Functions --------------------
147
  def get_google_auth_url():
148
  client_config = GOOGLE_OAUTH_CONFIG["web"]
 
221
 
222
  # -------------------- Download Manager Class --------------------
223
  class DownloadManager:
224
+ def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False):
225
  self.use_proxy = use_proxy
226
  self.proxy = proxy
227
  self.query = query
 
230
  self.browser = None
231
  self.context = None
232
  self.page = None
233
+ self.use_stealth = use_stealth
234
+ self.proxy_rotation = proxy_rotation
235
+ self.request_count = 0
236
+ self.captcha_detected = False
237
+ self.download_timeout = 300 # 5 minutes timeout for downloads
238
 
239
  async def __aenter__(self):
240
  self.playwright = await async_playwright().start()
241
+
242
+ # Prepare browser args with stealth settings
243
+ browser_args = [
244
+ '--no-sandbox',
245
+ '--disable-setuid-sandbox',
246
+ '--disable-dev-shm-usage',
247
+ '--disable-gpu',
248
+ '--no-zygote',
249
+ '--single-process',
250
+ '--disable-web-security',
251
+ '--disable-features=IsolateOrigins',
252
+ '--disable-site-isolation-trials'
253
+ ]
254
+
255
+ # Add stealth-specific args
256
+ if self.use_stealth:
257
+ browser_args.extend([
258
+ '--disable-blink-features=AutomationControlled',
259
+ '--disable-features=IsolateOrigins,site-per-process',
260
+ '--disable-webgl',
261
+ '--disable-webrtc'
262
+ ])
263
+
264
+ # Setup browser options
265
  opts = {
266
  "headless": True,
267
+ "args": browser_args
268
  }
269
+
270
+ # Configure proxy if specified
271
  if self.use_proxy and self.proxy:
272
  opts["proxy"] = {"server": self.proxy}
273
+
274
+ # Launch browser with options
275
  self.browser = await self.playwright.chromium.launch(**opts)
276
+
277
+ # Setup browser context with enhanced settings
278
+ context_opts = {
279
+ "user_agent": get_random_user_agent(),
280
+ "viewport": {"width": 1920, "height": 1080},
281
+ "device_scale_factor": 1,
282
+ "has_touch": False,
283
+ "is_mobile": False,
284
+ "ignore_https_errors": True,
285
+ "accept_downloads": True
286
+ }
287
+
288
+ # Apply stealth-specific settings to the context
289
+ if self.use_stealth:
290
+ # Apply JS-injection for enhanced stealth
291
+ context_opts["bypass_csp"] = True
292
+ self.context = await self.browser.new_context(**context_opts)
293
+
294
+ # Execute stealth JS to avoid detection
295
+ await self.context.add_init_script("""
296
+ () => {
297
+ Object.defineProperty(navigator, 'webdriver', {
298
+ get: () => false,
299
+ });
300
+
301
+ // Change navigator properties
302
+ const newProto = navigator.__proto__;
303
+ delete newProto.webdriver;
304
+
305
+ // Overwrite the plugins
306
+ Object.defineProperty(navigator, 'plugins', {
307
+ get: () => [1, 2, 3, 4, 5].map(() => ({
308
+ lengthComputable: true,
309
+ loaded: 100,
310
+ total: 100
311
+ }))
312
+ });
313
+
314
+ // Handle languages more naturally
315
+ Object.defineProperty(navigator, 'languages', {
316
+ get: () => ['en-US', 'en', 'es']
317
+ });
318
+
319
+ // Modify hardware concurrency
320
+ Object.defineProperty(navigator, 'hardwareConcurrency', {
321
+ get: () => 4
322
+ });
323
+
324
+ // Modify deviceMemory
325
+ Object.defineProperty(navigator, 'deviceMemory', {
326
+ get: () => 8
327
+ });
328
+
329
+ // WebGL modifications
330
+ const getParameter = WebGLRenderingContext.prototype.getParameter;
331
+ WebGLRenderingContext.prototype.getParameter = function(parameter) {
332
+ if (parameter === 37445) {
333
+ return 'Intel Inc.';
334
+ }
335
+ if (parameter === 37446) {
336
+ return 'Intel Iris OpenGL Engine';
337
+ }
338
+ return getParameter.apply(this, arguments);
339
+ };
340
+ }
341
+ """)
342
+ else:
343
+ # Regular context without stealth
344
+ self.context = await self.browser.new_context(**context_opts)
345
+
346
+ # Create page with enhanced headers
347
  self.page = await self.context.new_page()
348
  await self.page.set_extra_http_headers({
349
+ 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
350
  'Accept-Encoding': 'gzip, deflate, br',
351
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
352
+ 'Cache-Control': 'max-age=0',
353
+ 'DNT': '1', # Do Not Track
354
+ 'Referer': 'https://www.google.com/',
355
+ 'Sec-Fetch-Dest': 'document',
356
+ 'Sec-Fetch-Mode': 'navigate',
357
+ 'Sec-Fetch-Site': 'cross-site',
358
+ 'Sec-Fetch-User': '?1',
359
+ 'Upgrade-Insecure-Requests': '1'
360
  })
361
+
362
+ # Add delay for mouse movements to simulate human behavior
363
+ if self.use_stealth:
364
+ await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500))
365
+ await self.page.wait_for_timeout(random.randint(200, 500))
366
+
367
  return self
368
 
369
  async def __aexit__(self, exc_type, exc_val, exc_tb):
 
372
  if self.playwright:
373
  await self.playwright.stop()
374
 
375
+ async def rotate_proxy_if_needed(self):
376
+ """Rotate proxy if proxy rotation is enabled and threshold is reached"""
377
+ if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]:
378
+ self.request_count += 1
379
+ if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]:
380
+ # Get next proxy from the pool
381
+ next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0)
382
+ PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list
383
+
384
+ # Close existing context and create new one with the new proxy
385
+ if self.context:
386
+ await self.context.close()
387
+
388
+ # Create new context with the new proxy
389
+ context_opts = {
390
+ "user_agent": get_random_user_agent(),
391
+ "proxy": {"server": next_proxy},
392
+ "accept_downloads": True
393
+ }
394
+ self.context = await self.browser.new_context(**context_opts)
395
+ self.page = await self.context.new_page()
396
+
397
+ # Reset counter
398
+ self.request_count = 0
399
+ logger.info(f"Rotated to new proxy: {next_proxy}")
400
+
401
+ async def handle_captcha(self, page):
402
+ """Detect and handle captchas if possible"""
403
+ # Check for common captcha patterns
404
+ content = await page.content()
405
+ if detect_captcha(content):
406
+ self.captcha_detected = True
407
+ logger.warning("Captcha detected on page")
408
+
409
+ # Strategies for handling captchas:
410
+ # 1. For simple captchas, try to extract the image and solve it
411
+ captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]')
412
+ if captcha_img:
413
+ logger.info("Found captcha image, attempting to capture")
414
+
415
+ # Take screenshot of the captcha
416
+ captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png")
417
+ await captcha_img.screenshot(path=captcha_path)
418
+
419
+ # In a real implementation, you would send this to a captcha solving service
420
+ # For now, just log the detection
421
+ logger.info(f"Captcha image saved to {captcha_path}")
422
+
423
+ # For demonstration, we'll notify the user but not actually solve it
424
+ return False
425
+
426
+ # 2. For reCAPTCHA, special handling would be required
427
+ recaptcha = await page.query_selector('iframe[src*="recaptcha"]')
428
+ if recaptcha:
429
+ logger.warning("reCAPTCHA detected, would require external solving service")
430
+ return False
431
+
432
+ # 3. Try to perform human-like actions that might bypass simple bot checks
433
+ await self.perform_human_actions(page)
434
+
435
+ # Check if captcha is still present
436
+ content = await page.content()
437
+ if detect_captcha(content):
438
+ logger.warning("Captcha still present after human-like actions")
439
+ return False
440
+ else:
441
+ logger.info("Captcha appears to be resolved")
442
+ return True
443
+
444
+ return True # No captcha detected
445
+
446
+ async def perform_human_actions(self, page):
447
+ """Perform human-like actions on the page to possibly bypass simple bot checks"""
448
+ try:
449
+ # 1. Slowly scroll down the page
450
+ for i in range(3):
451
+ await page.evaluate(f"window.scrollTo(0, {i * 300})")
452
+ await page.wait_for_timeout(random.randint(300, 700))
453
+
454
+ # 2. Random mouse movements
455
+ for _ in range(3):
456
+ x = random.randint(100, 800)
457
+ y = random.randint(100, 600)
458
+ await page.mouse.move(x=x, y=y)
459
+ await page.wait_for_timeout(random.randint(200, 500))
460
+
461
+ # 3. Click on a non-essential part of the page
462
+ try:
463
+ await page.click("body", position={"x": 50, "y": 50})
464
+ except:
465
+ pass
466
+
467
+ # 4. Wait a bit before continuing
468
+ await page.wait_for_timeout(1000)
469
+
470
+ except Exception as e:
471
+ logger.warning(f"Error during human-like actions: {e}")
472
+
473
  async def search_bing(self):
474
  urls = []
475
  try:
476
+ # Rotate proxy if needed
477
+ await self.rotate_proxy_if_needed()
478
+
479
  search_url = f"https://www.bing.com/search?q={self.query}"
480
  await self.page.goto(search_url, timeout=30000)
481
  await self.page.wait_for_load_state('networkidle')
482
+
483
+ # Check for captchas
484
+ if not await self.handle_captcha(self.page):
485
+ logger.warning("Captcha detected during search, results may be limited")
486
+
487
+ # More natural scrolling behavior
488
+ for i in range(3):
489
+ await self.page.evaluate(f"window.scrollTo(0, {i * 400})")
490
+ await self.page.wait_for_timeout(random.randint(300, 800))
491
+
492
+ # Extract search results
493
  links = await self.page.query_selector_all("li.b_algo h2 a")
494
  for link in links[:self.num_results]:
495
  href = await link.get_attribute('href')
496
  if href:
497
  urls.append(href)
498
+
499
+ # If we didn't find enough results, try an alternative selector
500
+ if len(urls) < self.num_results:
501
+ alt_links = await self.page.query_selector_all(".b_caption a")
502
+ for link in alt_links:
503
+ href = await link.get_attribute('href')
504
+ if href and href not in urls:
505
+ urls.append(href)
506
+ if len(urls) >= self.num_results:
507
+ break
508
+
509
  return urls
510
  except Exception as e:
511
  logger.error(f"Error searching Bing: {e}")
 
513
 
514
  async def get_file_size(self, url):
515
  try:
516
+ await self.rotate_proxy_if_needed()
517
+
518
  async with self.context.new_page() as page:
519
  response = await page.request.head(url, timeout=15000)
520
  length = response.headers.get('Content-Length', None)
 
522
  return sizeof_fmt(int(length))
523
  else:
524
  return "Unknown Size"
525
+ except Exception as e:
526
+ logger.warning(f"Error getting file size: {e}")
527
  return "Unknown Size"
528
 
529
  async def get_pdf_metadata(self, url):
530
  try:
531
+ await self.rotate_proxy_if_needed()
532
+
533
  async with self.context.new_page() as page:
534
  resp = await page.request.get(url, timeout=15000)
535
  if resp.ok:
 
543
  }
544
  else:
545
  return {}
546
+ except Exception as e:
547
+ logger.warning(f"Error reading PDF metadata: {e}")
548
  return {}
549
 
550
  async def extract_real_download_url(self, url):
551
  try:
552
+ await self.rotate_proxy_if_needed()
553
+
554
  async with self.context.new_page() as page:
555
  response = await page.goto(url, wait_until='networkidle', timeout=30000)
556
  if response and response.headers.get('location'):
 
567
  logger.info(f"Fetching exam links from {url}")
568
  links = set()
569
 
570
+ # First try with direct requests for speed (but with proper headers)
571
+ headers = {
572
+ "User-Agent": get_random_user_agent(),
573
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
574
+ "Accept-Language": "en-US,en;q=0.9",
575
+ "Referer": "https://www.google.com/",
576
+ "DNT": "1"
577
+ }
578
+
579
  try:
580
  response = requests.get(url, headers=headers, timeout=30)
581
 
 
590
  href = a["href"]
591
  full_url = urljoin(url, href)
592
 
593
+ # Look for text clues
594
+ link_text = a.get_text().lower()
595
+
596
+ # Special patterns for exam sites (expanded list)
597
+ url_patterns = [
598
+ "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
599
+ "/test/", "/download/", "/files/", "/assignments/",
600
+ "paper_", "question_", "exam_", "test_", "past_",
601
+ "assignment_", "sample_", "study_material", "notes_",
602
+ "/resource/", "/subject/", "/course/", "/material/"
603
+ ]
604
+
605
+ text_patterns = [
606
+ "exam", "paper", "test", "question", "past", "download",
607
+ "assignment", "sample", "study", "material", "notes",
608
+ "subject", "course", "resource", "pdf", "document",
609
+ "view", "open", "get", "solution", "answer"
610
+ ]
611
+
612
+ # Check URL for patterns
613
+ if any(pattern in full_url.lower() for pattern in url_patterns):
614
+ links.add(full_url)
615
+ continue
616
+
617
+ # Check link text for patterns
618
+ if any(pattern in link_text for pattern in text_patterns):
619
+ links.add(full_url)
620
+ continue
621
+
622
+ # Check for common file extensions
623
+ if any(full_url.lower().endswith(ext) for ext in
624
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
625
+ links.add(full_url)
626
  except Exception as e:
627
  logger.warning(f"Request-based extraction failed: {e}")
628
 
629
+ # Browser-based approach for more thorough extraction or if initial approach was inadequate
630
+ try:
631
+ # Check if we need to proceed with browser-based extraction
632
+ if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
633
+ logger.info("Using browser for enhanced link extraction")
634
+
635
+ # Rotate proxy if needed
636
+ await self.rotate_proxy_if_needed()
637
 
638
+ # Navigate to the page with more natural timing
639
+ await self.page.goto(url, timeout=45000, wait_until='networkidle')
640
+ await self.page.wait_for_timeout(random.randint(1000, 2000))
641
+
642
+ # Handle captchas if present
643
+ if not await self.handle_captcha(self.page):
644
+ logger.warning("Captcha detected, extraction may be limited")
645
+
646
+ # Get base URL for resolving relative links
647
+ parsed_base = urlparse(url)
648
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
649
+
650
+ # Perform natural scrolling to trigger lazy-loaded content
651
+ page_height = await self.page.evaluate("document.body.scrollHeight")
652
+ viewport_height = await self.page.evaluate("window.innerHeight")
653
+
654
+ for scroll_pos in range(0, page_height, viewport_height // 2):
655
+ await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})")
656
+ await self.page.wait_for_timeout(random.randint(300, 800))
657
+
658
+ # Scroll back to top
659
+ await self.page.evaluate("window.scrollTo(0, 0)")
660
+ await self.page.wait_for_timeout(500)
661
+
662
+ # Extract all links with Playwright (better than just anchor tags)
663
+ all_links = await self.page.evaluate("""
664
  () => {
665
+ const results = [];
666
+
667
+ // Get all anchor tags
668
  const anchors = document.querySelectorAll('a[href]');
669
  for (const a of anchors) {
670
  if (a.href) {
671
+ results.push({
672
  href: a.href,
673
+ text: a.innerText || a.textContent || '',
674
+ isButton: a.classList.contains('btn') || a.role === 'button'
675
  });
676
  }
677
  }
678
+
679
+ // Get buttons that might contain links
680
+ const buttons = document.querySelectorAll('button');
681
+ for (const btn of buttons) {
682
+ const onclick = btn.getAttribute('onclick') || '';
683
+ if (onclick.includes('window.location') || onclick.includes('download')) {
684
+ results.push({
685
+ href: '#button',
686
+ text: btn.innerText || btn.textContent || '',
687
+ isButton: true,
688
+ onclick: onclick
689
+ });
690
+ }
691
+ }
692
+
693
+ return results;
694
  }
695
  """)
696
 
697
+ # Process the extracted links
698
+ for link_info in all_links:
699
  href = link_info.get('href', '')
700
  text = link_info.get('text', '').lower()
701
 
702
+ if href and href != '#button':
703
+ # Check URL patterns
704
+ url_patterns = [
705
+ "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
706
+ "/test/", "/download/", "/files/", "/assignments/",
707
+ "paper_", "question_", "exam_", "test_", "past_",
708
+ "assignment_", "sample_", "study_material", "notes_"
709
+ ]
710
 
711
+ # Check text patterns
712
+ text_patterns = [
713
+ "exam", "paper", "test", "question", "past", "download",
714
+ "assignment", "sample", "study", "material", "notes",
715
+ "pdf", "document", "view", "open", "solution"
716
+ ]
717
 
718
  if any(pattern in href.lower() for pattern in url_patterns) or \
719
+ any(pattern in text for pattern in text_patterns) or \
720
+ any(href.lower().endswith(ext) for ext in
721
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
722
  links.add(href)
723
 
724
  # Check for ASP.NET specific elements that might contain exam links
725
+ grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
726
  for grid in grid_elements:
727
  grid_links = await grid.query_selector_all('a[href]')
728
  for a in grid_links:
729
  href = await a.get_attribute('href')
730
+ text = await a.text_content()
731
+
732
  if href:
733
  full_url = href if href.startswith('http') else urljoin(url, href)
734
  links.add(full_url)
735
 
736
+ # Try clicking pagination controls to reveal more content
737
+ pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a')
738
+ for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons
739
+ try:
740
+ # Check if this is a numeric pagination button (more likely to be useful)
741
+ button_text = await button.text_content()
742
+ if button_text and button_text.strip().isdigit():
743
+ logger.info(f"Clicking pagination button: {button_text}")
744
+ await button.click()
745
+ await self.page.wait_for_timeout(2000)
746
+ await self.page.wait_for_load_state('networkidle', timeout=10000)
747
+
748
+ # Extract links from this page
749
+ new_page_links = await self.page.evaluate("""
750
+ () => {
751
+ return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
752
+ }
753
+ """)
754
+
755
+ for href in new_page_links:
756
+ if href and not href.startswith('javascript:'):
757
+ if any(pattern in href.lower() for pattern in url_patterns) or \
758
+ any(href.lower().endswith(ext) for ext in
759
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
760
+ links.add(href)
761
+ except Exception as e:
762
+ logger.warning(f"Error clicking pagination button: {e}")
763
+
764
+ # Try clicking any controls that might reveal more exam links (more focused approach)
765
+ show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn')
766
+ for button in show_buttons:
767
+ button_text = (await button.text_content() or "").lower()
768
+ button_value = (await button.get_attribute("value") or "").lower()
769
+ button_id = (await button.get_attribute("id") or "").lower()
770
+
771
+ # Look for buttons that seem likely to reveal file lists
772
+ promising_terms = ["show", "view", "display", "list", "exam", "paper", "test",
773
+ "download", "resource", "material", "browse", "file"]
774
+
775
+ if any(term in button_text or term in button_value or term in button_id
776
+ for term in promising_terms):
777
  try:
778
+ logger.info(f"Clicking button: {button_text or button_value}")
779
  await button.click()
780
+ await self.page.wait_for_timeout(2000)
781
+ await self.page.wait_for_load_state('networkidle', timeout=10000)
782
 
783
  # Get any new links that appeared
784
  new_links = await self.page.query_selector_all('a[href]')
 
786
  href = await a.get_attribute('href')
787
  if href:
788
  full_url = href if href.startswith('http') else urljoin(url, href)
789
+
790
+ # Focus on file extensions and patterns
791
+ if any(full_url.lower().endswith(ext) for ext in
792
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \
793
+ any(pattern in full_url.lower() for pattern in url_patterns):
794
+ links.add(full_url)
795
  except Exception as e:
796
  logger.warning(f"Error clicking button: {e}")
797
+
798
+ # Special handling for ASP.NET PostBack links
799
+ try:
800
+ # Find and interact with ASP.NET __doPostBack elements
801
+ postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]')
802
+ for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks
803
+ try:
804
+ onclick = await element.get_attribute('onclick')
805
+ if onclick and '__doPostBack' in onclick:
806
+ element_text = await element.text_content()
807
+
808
+ # Only interact with elements that seem likely to contain exam links
809
+ promising_terms = ["show", "view", "list", "exam", "paper", "test",
810
+ "download", "resource", "material"]
811
+
812
+ if any(term in element_text.lower() for term in promising_terms):
813
+ logger.info(f"Clicking ASP.NET postback element: {element_text}")
814
+
815
+ # Click the element
816
+ await element.click()
817
+ await self.page.wait_for_timeout(2000)
818
+ await self.page.wait_for_load_state('networkidle', timeout=10000)
819
+
820
+ # Extract any new links
821
+ new_links = await self.page.query_selector_all('a[href]')
822
+ for a in new_links:
823
+ href = await a.get_attribute('href')
824
+ if href:
825
+ full_url = href if href.startswith('http') else urljoin(url, href)
826
+ if any(full_url.lower().endswith(ext) for ext in
827
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
828
+ links.add(full_url)
829
+ except Exception as e:
830
+ logger.warning(f"Error interacting with postback element: {e}")
831
  except Exception as e:
832
+ logger.warning(f"Error during postback handling: {e}")
833
+
834
+ except Exception as e:
835
+ logger.error(f"Browser-based extraction failed: {e}")
836
 
837
  # Filter links to likely contain exam documents
838
  filtered_links = []
839
  for link in links:
840
  # Common file extensions for exam documents
841
+ if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
842
  filtered_links.append(link)
843
  continue
844
 
845
  # Common paths for exam documents
846
  if any(pattern in link.lower() for pattern in [
847
  "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
848
+ "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
849
+ "/resource/", "/material/", "/notes/", "/subjectmaterial/"
850
  ]):
851
  filtered_links.append(link)
852
 
 
860
  async def extract_downloadable_files(self, url, custom_ext_list):
861
  found_files = []
862
  try:
863
+ # Rotate proxy if needed
864
+ await self.rotate_proxy_if_needed()
865
+
866
  # Special handling for educational exam sites
867
  if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
868
  ["exam", "test", "pastpaper", "eduexp"]):
 
883
  except Exception:
884
  pass
885
 
886
+ # If filename is empty or invalid, create a sensible one
887
+ if not filename or filename == '/':
888
+ domain = get_domain(real_url)
889
+ ext = get_file_extension(real_url, '.pdf')
890
+ filename = f"file_from_{domain}{ext}"
891
+
892
  # Get file size
893
  size_str = await self.get_file_size(real_url)
894
 
 
915
  response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
916
  if not response:
917
  return []
918
+
919
+ # Check for captchas
920
+ if not await self.handle_captcha(self.page):
921
+ logger.warning("Captcha detected, file extraction may be limited")
922
+
923
+ # Scroll through the page naturally to trigger lazy loading
924
+ await self.page.evaluate("""
925
+ (async () => {
926
+ const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
927
+ const height = document.body.scrollHeight;
928
+ const scrollStep = Math.floor(window.innerHeight / 2);
929
+
930
+ for (let i = 0; i < height; i += scrollStep) {
931
+ window.scrollTo(0, i);
932
+ await delay(100);
933
+ }
934
+
935
+ window.scrollTo(0, 0);
936
+ })()
937
+ """)
938
+ await self.page.wait_for_timeout(1000)
939
 
940
  final_url = self.page.url
941
  if '.php' in final_url or 'download' in final_url:
942
  real_url = await self.extract_real_download_url(final_url)
943
  if real_url != final_url:
944
+ # Try to detect the filename from headers or URL
945
+ response = await self.page.request.head(real_url, timeout=15000)
946
+ filename = None
947
+
948
+ # Try to get from Content-Disposition header
949
+ content_disposition = response.headers.get('Content-Disposition', '')
950
+ if 'filename=' in content_disposition:
951
+ filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition)
952
+ if filename_match:
953
+ filename = filename_match.group(1)
954
+
955
+ # If not found in headers, use URL basename
956
+ if not filename:
957
+ filename = os.path.basename(urlparse(real_url).path)
958
+ if not filename or filename == '/':
959
+ # Generate a name based on domain
960
+ domain = get_domain(real_url)
961
+ ext = get_file_extension(real_url, '.pdf')
962
+ filename = f"file_from_{domain}{ext}"
963
+
964
  found_files.append({
965
  'url': real_url,
966
+ 'filename': filename,
967
  'size': await self.get_file_size(real_url),
968
  'metadata': {}
969
  })
 
 'metadata': meta
 })

+ # Also check for data-src and data-url attributes (common in lazy-loaded sites)
+ data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]')
+ for elem in data_elements:
+ for attr in ['data-src', 'data-url', 'data-href', 'data-download']:
+ try:
+ value = await elem.get_attribute(attr)
+ if value and any(value.lower().endswith(ext) for ext in all_exts):
+ file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
+ found_files.append({
+ 'url': file_url,
+ 'filename': os.path.basename(file_url.split('?')[0]),
+ 'size': await self.get_file_size(file_url),
+ 'metadata': {}
+ })
+ except:
+ pass
+
+ # Check script tags for JSON data that might contain file URLs
+ script_elements = soup.find_all('script', type='application/json')
+ for script in script_elements:
+ try:
+ json_data = json.loads(script.string)
+ # Look for URL patterns in the JSON data
+ def extract_urls_from_json(obj, urls_found=None):
+ if urls_found is None:
+ urls_found = []
+ if isinstance(obj, dict):
+ for k, v in obj.items():
+ # Check if any key contains url-like terms
+ url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download']
+ if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'):
+ urls_found.append(v)
+ else:
+ extract_urls_from_json(v, urls_found)
+ elif isinstance(obj, list):
+ for item in obj:
+ extract_urls_from_json(item, urls_found)
+ return urls_found
+
+ json_urls = extract_urls_from_json(json_data)
+ for json_url in json_urls:
+ if any(json_url.lower().endswith(ext) for ext in all_exts):
+ found_files.append({
+ 'url': json_url,
+ 'filename': os.path.basename(json_url.split('?')[0]),
+ 'size': await self.get_file_size(json_url),
+ 'metadata': {}
+ })
+ except:
+ pass
+
+ # Check for hidden download buttons or forms
+ hidden_elements = await self.page.evaluate("""
+ () => {
+ const results = [];
+
+ // Check for hidden forms with download actions
+ const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]');
+ for (const form of forms) {
+ const action = form.getAttribute('action') || '';
+ results.push({
+ type: 'form',
+ action: action,
+ inputs: Array.from(form.querySelectorAll('input[name]')).map(input => {
+ return {name: input.name, value: input.value};
+ })
+ });
+ }
+
+ // Check for hidden download links/buttons
+ const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => {
+ const style = window.getComputedStyle(a);
+ return (style.display === 'none' || style.visibility === 'hidden') &&
+ (a.href.includes('download') || a.href.includes('file'));
+ });
+
+ for (const link of hiddenLinks) {
+ results.push({
+ type: 'link',
+ href: link.href,
+ text: link.innerText || link.textContent
+ });
+ }
+
+ return results;
+ }
+ """)
+
+ # Process hidden elements
+ for elem in hidden_elements:
+ if elem['type'] == 'link' and 'href' in elem:
+ href = elem['href']
+ if any(href.lower().endswith(ext) for ext in all_exts):
+ found_files.append({
+ 'url': href,
+ 'filename': os.path.basename(href.split('?')[0]),
+ 'size': await self.get_file_size(href),
+ 'metadata': {}
+ })
+
+ # Deduplicate files by URL
 seen_urls = set()
 unique_files = []
 for f in found_files:
 if f['url'] not in seen_urls:
 seen_urls.add(f['url'])
 unique_files.append(f)
+
 return unique_files
 except Exception as e:
 logger.error(f"Error extracting files from {url}: {e}")
+ traceback.print_exc()
 return []

 async def download_file(self, file_info, save_dir, referer):

 logger.warning("All standard methods failed, attempting force download")
 result_path = await self.force_download_viewonly(file_info, path)
 return result_path if result_path else None
+
+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
+ # Try with direct requests first (faster)
+ try:
+ headers = {
+ 'User-Agent': get_random_user_agent(),
+ 'Accept': '*/*',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Referer': referer,
+ 'DNT': '1'
+ }
+
+ with requests.get(file_url, headers=headers, stream=True, timeout=30) as response:
+ if response.status_code == 200:
+ # Check content type to verify it's not HTML/error page
+ content_type = response.headers.get('Content-Type', '')
+ if 'text/html' in content_type and not file_url.endswith('.html'):
+ logger.warning(f"Received HTML instead of expected file: {file_url}")
+ else:
+ with open(path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ f.write(chunk)
+
+ # Verify file was downloaded correctly
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ return path
+ except Exception as e:
+ logger.warning(f"Direct download failed: {e}, trying browser approach")

+ # Original code for non-Google Drive downloads using Playwright
 async with self.context.new_page() as page:
 headers = {
 'Accept': '*/*',
 'Accept-Encoding': 'gzip, deflate, br',
 'Referer': referer
 }
+
+ # Try to download with timeout protection
+ try:
+ response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000)
+ if response.status == 200:
+ content = await response.body()
+ with open(path, 'wb') as f:
+ f.write(content)
+ return path
+ else:
+ logger.error(f"Download failed with status {response.status}: {file_url}")
+
+ # Try to extract error information
+ error_info = await response.text()
+ logger.debug(f"Error response: {error_info[:200]}...")
+
+ # Check if this might be a captcha or login issue
+ if detect_captcha(error_info):
+ logger.warning("Captcha detected during download")
+ # For HF Spaces, we can't implement browser-based captcha solving here
+ # Just log the issue for now
+ except PlaywrightTimeoutError:
+ logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}")
+
+ # Try an alternative approach - using the browser's download manager
+ try:
+ logger.info("Trying browser download manager approach")
+ download_promise = page.wait_for_event("download")
+ await page.goto(file_url, timeout=60000)
+
+ # Wait for download to start (with timeout)
+ download = await download_promise
+ await download.save_as(path)
+
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ return path
+ except Exception as e:
+ logger.error(f"Browser download manager approach failed: {e}")
+
+ return None
 except Exception as e:
 logger.error(f"Error downloading {file_url}: {e}")
  return None
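
One note on the `wait_for_event("download")` pattern used above: in async Playwright the listener is normally armed with the `expect_download()` context manager before the action that triggers the download, so the event cannot be missed. A minimal sketch of that pattern follows; it is illustrative only, and `save_clicked_download` with its selector argument is not part of this commit.

async def save_clicked_download(page, selector: str, save_path: str) -> bool:
    """Click an element that starts a download and persist the resulting file.

    Assumes the browser context was created with accept_downloads=True.
    """
    try:
        # Start listening before the click so the download event is captured.
        async with page.expect_download(timeout=60000) as download_info:
            await page.click(selector)
        download = await download_info.value
        await download.save_as(save_path)
        return True
    except Exception:
        return False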
 
 logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")

+ # Create a dedicated browser instance with better resolution and stealth
+ browser_args = [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins,site-per-process',
+ '--disable-site-isolation-trials',
+ '--disable-blink-features=AutomationControlled' # Anti-detection
+ ]
+
 browser = await self.playwright.chromium.launch(
 headless=True,
+ args=browser_args
 )

 # Use higher resolution for better quality

 accept_downloads=True # Critical for the download workflow
 )

+ # Add anti-detection script
+ await context.add_init_script("""
+ () => {
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => false,
+ });
+
+ // Change plugins
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5].map(() => ({
+ lengthComputable: true,
+ loaded: 100,
+ total: 100
+ }))
+ });
+
+ // Handle languages
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['en-US', 'en', 'es']
+ });
+
+ // Modify hardware concurrency
+ Object.defineProperty(navigator, 'hardwareConcurrency', {
+ get: () => 4
+ });
+ }
+ """)
+
 page = await context.new_page()

 try:

 logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
 await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
 await page.wait_for_load_state('networkidle')
+
+ # Check for any barriers or permissions issues
+ content = await page.content()
+ if "the owner has not granted you permission to" in content:
+ logger.warning("Permission denied error detected")
+
+ # Randomized wait to appear more human-like
+ await page.wait_for_timeout(random.randint(3000, 7000))

 # Create temp directory
 temp_dir = tempfile.mkdtemp()

 if file_type.lower() == 'pdf':
 # Use the improved scrolling and detection approach

+ # Perform some natural mouse movements and scrolling
+ await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400))
+ await page.wait_for_timeout(random.randint(500, 1000))
+
+ # Estimate number of pages
 estimated_pages = await page.evaluate("""
 () => {
 // Method 1: Check page counter text

 logger.info(f"Estimated {estimated_pages} pages in PDF")

+ # Initial scroll to trigger lazy loading
+ logger.info("Initial scroll to bottom to trigger lazy loading...")
 await page.keyboard.press("End")
 await page.wait_for_timeout(3000)

 # Scroll page by page to ensure all pages are loaded
+ logger.info("Scrolling page by page...")
 max_attempts = min(estimated_pages * 3, 300)
 attempt = 0
 prev_blob_count = 0

 logger.info("All pages appear to be loaded.")
 break

+ # Alternate between PageDown and End keys for more natural scrolling
+ if attempt % 3 == 0:
+ await page.keyboard.press("End")
+ else:
+ await page.keyboard.press("PageDown")
+
+ # Randomized wait times
+ await page.wait_for_timeout(random.randint(1500, 3000))
+
+ # Move mouse randomly to appear more human-like
+ if attempt % 4 == 0:
+ await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800))
+
 prev_blob_count = blob_count
 attempt += 1

 if not result.get('success', False):
 logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
+
+ # Try fallback approach - screenshot method
+ logger.info("Trying fallback screenshot method...")
+
+ # Navigate back to the first page
+ await page.evaluate("""
+ () => {
+ // Find and click the "first page" button if available
+ const buttons = Array.from(document.querySelectorAll('button'));
+ const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page'));
+ if (firstPageBtn) firstPageBtn.click();
+ }
+ """)
+ await page.wait_for_timeout(1000);
+
+ # Create a PDF by taking screenshots of each page
+ screenshots = []
+ current_page = 1
+ max_pages = estimated_pages
+
+ # Create a PDF using the reportlab package
+ while current_page <= max_pages:
+ screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png")
+
+ # Try to find the current page element
+ page_elem = await page.query_selector('.drive-viewer-paginated-page')
+ if page_elem:
+ await page_elem.screenshot(path=screenshot_path)
+ else:
+ # Fallback to full page screenshot
+ await page.screenshot(path=screenshot_path)
+
+ screenshots.append(screenshot_path)
+
+ # Try to navigate to next page
+ next_btn = await page.query_selector('button[aria-label="Next page"]')
+ if next_btn:
+ is_disabled = await next_btn.get_attribute('disabled')
+ if is_disabled:
+ logger.info(f"Reached end of document at page {current_page}")
+ break
+
+ await next_btn.click()
+ await page.wait_for_timeout(1000)
+ current_page += 1
+ else:
+ break
+
+ # Create PDF from screenshots
+ if screenshots:
+ first_img = Image.open(screenshots[0])
+ width, height = first_img.size
+
+ c = canvas.Canvas(save_path, pagesize=(width, height))
+ for screenshot in screenshots:
+ img = Image.open(screenshot)
+ c.drawImage(screenshot, 0, 0, width, height)
+ c.showPage()
+ c.save()
+
+ # Clean up screenshots
+ for screenshot in screenshots:
+ os.remove(screenshot)
+
+ return save_path
+
 return None

  logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
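
The screenshot fallback above sizes every PDF page from the first screenshot. A small variant, sketched here for illustration only (it is not the commit's code), sizes each page to its own image, which avoids distortion when the viewer renders pages at different heights.

from PIL import Image
from reportlab.pdfgen import canvas

def screenshots_to_pdf(image_paths, pdf_path):
    """Write one PDF page per screenshot, each page sized to its own image."""
    if not image_paths:
        return None
    c = canvas.Canvas(pdf_path)
    for image_path in image_paths:
        with Image.open(image_path) as img:
            width, height = img.size
        c.setPageSize((width, height))
        c.drawImage(image_path, 0, 0, width=width, height=height)
        c.showPage()
    c.save()
    return pdf_path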
 
 # Try standard approaches for non-view-only files
 try:
+ # Try direct download link first (fastest)
+ direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
+
+ # Add anti-bot headers
+ headers = {
+ 'User-Agent': get_random_user_agent(),
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+ 'Accept-Language': 'en-US,en;q=0.9',
+ 'Referer': 'https://drive.google.com/',
+ 'DNT': '1'
+ }
+
+ # Try with streaming to handle larger files
+ with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r:
+ if r.status_code == 200:
+ # Check if we got HTML instead of the file
+ content_type = r.headers.get('Content-Type', '')
+ if 'text/html' in content_type and not file_id.endswith('.html'):
+ logger.warning("Received HTML instead of file, trying with session cookies")
+ else:
+ # Looks like we got the actual file
+ with open(save_path, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=8192):
+ if chunk:
+ f.write(chunk)
+
+ # Verify file exists and has content
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+ logger.info("Direct download successful")
+ return True
+
 # Try with requests and session cookies
 session = requests.Session()
 session.headers.update({'User-Agent': get_random_user_agent()})

 except Exception as e:
 logger.warning(f"Requests session download failed: {e}")

+ # Try browser-based approach as last resort
+ try:
+ async with self.context.new_page() as page:
+ # Visit the file view page first to get cookies
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
+ await page.wait_for_timeout(3000)
+
+ # Set up download event listener
+ download_promise = page.wait_for_event("download")
+
+ # Try to trigger the download button click
+ download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]')
+ if download_button:
+ await download_button.click()
+
+ # Wait for download to start
+ try:
+ download = await download_promise
+ await download.save_as(save_path)
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+ except Exception as e:
+ logger.error(f"Error during browser download: {e}")
+ return False
+ else:
+ # Try the export download URL
+ await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000)
+
+ # Look for and click any download buttons or links
+ download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")')
+ for elem in download_elements:
+ try:
+ await elem.click()
+ # Wait a bit to see if download starts
+ try:
+ download = await download_promise
+ await download.save_as(save_path)
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+ except:
+ pass
+ except:
+ continue
+ except Exception as e:
+ logger.error(f"Browser-based download attempt failed: {e}")
+
+ logger.warning("All standard download methods failed")
  return False
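
The `uc?export=download&confirm=t` shortcut above works for small public files; for larger ones Google has historically answered with an HTML confirmation page and a `download_warning` cookie. A sketch of that cookie-based retry is shown below for context only: Google changes this endpoint periodically, and `download_public_drive_file` is not a helper from this commit.

import requests

def download_public_drive_file(file_id: str, save_path: str) -> bool:
    """Best-effort download of a publicly shared Drive file via the uc endpoint."""
    session = requests.Session()
    url = "https://drive.google.com/uc"
    params = {"id": file_id, "export": "download"}
    response = session.get(url, params=params, stream=True, timeout=60)
    # Large files: the first response is a confirmation page plus a token cookie.
    token = next((v for k, v in response.cookies.items() if k.startswith("download_warning")), None)
    if token:
        params["confirm"] = token
        response = session.get(url, params=params, stream=True, timeout=60)
    if response.status_code != 200 or "text/html" in response.headers.get("Content-Type", ""):
        return False
    with open(save_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return True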

 async def download_viewonly_pdf_with_js(self, file_id, save_path):
 """Download view-only PDF using the enhanced blob image caching technique"""
 try:
+ # Create a dedicated browser instance with stealth capabilities
+ browser_args = [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-web-security',
+ '--disable-blink-features=AutomationControlled' # Anti-detection
+ ]
+
 browser = await self.playwright.chromium.launch(
 headless=True,
+ args=browser_args
 )

+ # Setup stealth context
 context = await browser.new_context(
 viewport={'width': 1600, 'height': 1200},
 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ accept_downloads=True, # Critical for handling the download event
+ ignore_https_errors=True
 )

+ # Add stealth script
+ await context.add_init_script("""
+ () => {
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => false,
+ });
+
+ // Change plugins and languages to appear more human
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5].map(() => ({
+ lengthComputable: true,
+ loaded: 100,
+ total: 100
+ }))
+ });
+
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['en-US', 'en', 'es']
+ });
+ }
+ """)
+
 page = await context.new_page()

 try:
+ # Step 1: Navigate to the file with human-like behavior
 logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
 await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
 await page.wait_for_load_state('networkidle')
+
+ # Perform human-like interactions
+ await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300))
+ await page.wait_for_timeout(random.randint(2000, 5000))

 # Step 2: Estimate the number of pages
 estimated_pages = await page.evaluate("""

 await page.keyboard.press("End")
 await page.wait_for_timeout(3000)

+ # Step 4: Wait for all pages to load with better feedback and randomization
+ logger.info("Scrolling through document to load all pages...")
+ max_attempts = min(estimated_pages * 3, 300)
 attempt = 0
 prev_blob_count = 0
+ consecutive_same_count = 0

 while attempt < max_attempts:
 # Count blob images (which are the PDF pages)

 logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")

+ # Check if we've loaded all pages or if we're stuck
+ if blob_count >= estimated_pages:
+ logger.info(f"All {estimated_pages} pages appear to be loaded.")
 break

+ if blob_count == prev_blob_count:
+ consecutive_same_count += 1
+ if consecutive_same_count >= 5 and blob_count > 0:
+ logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.")
+ break
+ else:
+ consecutive_same_count = 0
+
+ # Mix up the scrolling approach for more human-like behavior
+ scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"])
+
+ if scroll_action == "PageDown":
+ await page.keyboard.press("PageDown")
+ elif scroll_action == "End":
+ await page.keyboard.press("End")
+ elif scroll_action == "ArrowDown":
+ # Press arrow down multiple times
+ for _ in range(random.randint(5, 15)):
+ await page.keyboard.press("ArrowDown")
+ await page.wait_for_timeout(random.randint(50, 150))
+ else: # mouse
+ # Scroll using mouse wheel
+ current_y = random.randint(300, 700)
+ await page.mouse.move(x=random.randint(300, 800), y=current_y)
+ await page.mouse.wheel(0, random.randint(300, 800))
+
+ # Random wait between scrolls
+ await page.wait_for_timeout(random.randint(1000, 3000))
+
 prev_blob_count = blob_count
 attempt += 1

 try {
 let pdf = new jsPDF();
 let imgs = document.getElementsByTagName("img");
 let validImages = [];
+
+ // First collect all valid blob images
 for (let i = 0; i < imgs.length; i++) {
 let img = imgs[i];
 if (!/^blob:/.test(img.src)) continue;

 validImages.push(img);
 }

+ // Sort by position in the document
 validImages.sort((a, b) => {
 const rectA = a.getBoundingClientRect();
 const rectB = b.getBoundingClientRect();

 console.log(`Found ${validImages.length} valid page images to add to PDF`);

+ let added = 0;
 // Process each image as a page
 for (let i = 0; i < validImages.length; i++) {
  let img = validImages[i];
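
The jsPDF snippet above assembles the PDF inside the browser and hands it back as a download. An equivalent idea, sketched below purely for illustration (it is not the method this commit uses), re-encodes each blob: image to a data URL in the page and builds the PDF on the Python side with Pillow; save_path is assumed to end in .pdf.

import base64
import io
from PIL import Image

async def capture_blob_pages(page, save_path):
    """Serialize each blob: <img> via a canvas and assemble a PDF with Pillow."""
    data_urls = await page.evaluate("""
        () => Array.from(document.images)
            .filter(img => img.src.startsWith('blob:') && img.naturalWidth > 0)
            .map(img => {
                const canvas = document.createElement('canvas');
                canvas.width = img.naturalWidth;
                canvas.height = img.naturalHeight;
                canvas.getContext('2d').drawImage(img, 0, 0);
                return canvas.toDataURL('image/png');
            })
    """)
    pages = [
        Image.open(io.BytesIO(base64.b64decode(url.split(',', 1)[1]))).convert('RGB')
        for url in data_urls
    ]
    if not pages:
        return None
    pages[0].save(save_path, save_all=True, append_images=pages[1:])
    return save_path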
 
 logger.info(f"Found {len(links)} sublinks with specialized method")
 return list(links)[:limit]

+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
 # Standard sublink extraction for all sites
 await self.page.goto(url, timeout=30000, wait_until='networkidle')

 base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 path_base = os.path.dirname(parsed_base.path)

+ # Perform initial scrolling to load lazy content
+ await self.page.evaluate("""
+ async () => {
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+ const height = document.body.scrollHeight;
+ const step = Math.floor(window.innerHeight / 2);
+
+ for (let i = 0; i < height; i += step) {
+ window.scrollTo(0, i);
+ await delay(150);
+ }
+
+ window.scrollTo(0, 0);
+ }
+ """)
+ await self.page.wait_for_timeout(1000)
+
 # Check if page has ASP.NET elements which might need special handling
 is_aspnet = await self.page.evaluate('''
 () => {

 except Exception as e:
 logger.warning(f"Error with postback: {e}")

+ # Look for pagination controls and try to navigate through them
+ pagination_elements = await self.page.query_selector_all(
+ 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]'
+ )
+
+ # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops)
+ for i in range(min(5, len(pagination_elements))):
+ try:
+ # Focus on elements that look like "next page" buttons
+ el = pagination_elements[i]
+ el_text = await el.text_content() or ""
+
+ # Only click if this looks like a pagination control
+ if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip():
+ logger.info(f"Clicking pagination control: {el_text}")
+ await el.click()
+ await self.page.wait_for_timeout(2000)
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+ # Get new links from this page
+ await self.extract_all_link_types(links, base_url, path_base)
+ except Exception as e:
+ logger.warning(f"Error clicking pagination: {e}")
+
+ # Check for hidden links that might be revealed by JavaScript
+ hidden_links = await self.page.evaluate("""
+ () => {
+ // Try to execute common JavaScript patterns that reveal hidden content
+ try {
+ // Common patterns used in websites to initially hide content
+ const hiddenContainers = document.querySelectorAll(
+ '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]'
+ );
+
+ // Attempt to make them visible
+ hiddenContainers.forEach(el => {
+ el.style.display = 'block';
+ el.style.visibility = 'visible';
+ el.classList.remove('hidden', 'hide');
+ });
+
+ // Return any newly visible links
+ return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
+ } catch (e) {
+ return [];
+ }
+ }
+ """)
+
+ # Add any newly discovered links
+ for href in hidden_links:
+ if href and not href.startswith('javascript:'):
+ links.add(href)
+
 logger.info(f"Found {len(links)} sublinks")
 return list(links)[:limit]

 links_set.add(full_url)
 except Exception:
 pass
+
+ # Extract links from JSON data embedded in the page
+ script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]')
+ for script in script_elements:
+ try:
+ script_content = await script.text_content()
+ if script_content:
+ # Look for URLs in the JSON content
+ urls = re.findall(r'(https?://[^\'"]+)', script_content)
+ for url in urls:
+ links_set.add(url)
+ except Exception:
+ pass
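
For reference, the standard library's urljoin already covers the common relative-URL shapes that `resolve_relative_url` (defined just below) has to handle; the calls here are illustrative only.

from urllib.parse import urljoin

urljoin("https://example.com/docs/page.html", "files/report.pdf")
# -> 'https://example.com/docs/files/report.pdf'
urljoin("https://example.com/docs/page.html", "/downloads/report.pdf")
# -> 'https://example.com/downloads/report.pdf'
urljoin("https://example.com/docs/page.html", "//cdn.example.com/report.pdf")
# -> 'https://cdn.example.com/report.pdf'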

 def resolve_relative_url(self, relative_url, base_url, path_base):
 """Properly resolve relative URLs considering multiple formats"""

 total_links = len(sublinks)
 progress_text.text(f"Found {total_links} sublinks to process")

+ # Always include files from the main page, regardless of sublinks
+ all_files = main_files
+
 if not sublinks:
 progress_bar.progress(1.0)
+ return all_files

 # Process each sublink

 for i, sublink in enumerate(sublinks, 1):
 progress = i / total_links
 progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")

 sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
 use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
 proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
+ use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox")

 with st.expander("Google Drive Integration", expanded=False):
 if st.button("Start Google Sign-In", key="google_signin_btn"):

 creds, msg = exchange_code_for_credentials(auth_code)
 st.session_state.google_creds = creds
 st.write(msg)
+
+ with st.expander("Advanced Browser Settings", expanded=False):
+ # Captcha handling options
+ st.write("**Captcha Handling**")
+ captcha_option = st.radio(
+ "Captcha Detection:",
+ ["Auto-detect only", "Manual solve (shows captcha)"],
+ index=0,
+ key="captcha_option"
+ )
+
+ # Proxy rotation settings
+ st.write("**Proxy Rotation**")
+ enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation")
+ if enable_rotation:
+ PROXY_ROTATION_CONFIG["enabled"] = True
+ proxy_list = st.text_area(
+ "Proxy List (one per line)",
+ placeholder="http://proxy1:port\nhttp://proxy2:port",
+ key="proxy_list"
+ )
+ if proxy_list:
+ PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()]
+ rotation_interval = st.slider(
+ "Rotation Interval (# of requests)",
+ min_value=1,
+ max_value=50,
+ value=10,
+ key="rotation_interval"
+ )
+ PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
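
The controls above only populate PROXY_ROTATION_CONFIG; this hunk does not show how the download manager consumes it. A simple round-robin selector would be enough, for example (a sketch; `next_proxy` and the `current_index` key are illustrative, not taken from the commit):

def next_proxy(config=PROXY_ROTATION_CONFIG):
    """Return the proxy to use next, or None when rotation is disabled."""
    if not config.get("enabled") or not config.get("proxies"):
        return None
    index = config.get("current_index", 0) % len(config["proxies"])
    config["current_index"] = index + 1
    return config["proxies"][index]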

 if mode == "Manual URL":
 st.header("Manual URL Mode")

 st.warning("Invalid extensions ignored. Use format like '.csv'.")

 @st.cache_resource
+ def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
 async def _run():
+ async with DownloadManager(
+ use_proxy=use_proxy_val,
+ proxy=proxy_val,
+ use_stealth=use_stealth_val
+ ) as dm:
 files = await dm.deep_search(url, ext_list, max_links, timeout_val)
 return files
 return asyncio.run(_run())

 with st.spinner("Searching for files..."):
 files = run_deep_search(url, valid_ext_list, max_sublinks,
+ sublink_timeout, use_proxy, proxy, use_stealth)

 if files:
 st.session_state.discovered_files = files

 progress_bar = st.progress(0)
 status_text = st.empty()

+ async with DownloadManager(
+ use_proxy=use_proxy,
+ proxy=proxy,
+ use_stealth=use_stealth
+ ) as dm:
 for i, idx in enumerate(selected_indices):
 progress = (i + 1) / len(selected_indices)
 file_info = files[idx]

 if st.button("Search", key="search_btn"):
 if query:
 async def run_search():
+ async with DownloadManager(
+ use_proxy=use_proxy,
+ proxy=proxy,
+ query=query,
+ num_results=num_results,
+ use_stealth=use_stealth
+ ) as dm:
 with st.spinner("Searching..."):
 urls = await dm.search_bing()
 if urls:

 valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]

 @st.cache_resource
+ def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
 async def _run():
+ async with DownloadManager(
+ use_proxy=use_proxy_val,
+ proxy=proxy_val,
+ use_stealth=use_stealth_val
+ ) as dm:
 files = await dm.deep_search(url, ext_list, max_links, timeout_val)
 return files
 return asyncio.run(_run())

 with st.spinner("Searching for files..."):
 files = run_deep_search(url, valid_ext_list, max_sublinks,
+ sublink_timeout, use_proxy, proxy, use_stealth)

 if files:
 st.session_state.discovered_files = files

 with st.spinner("Downloading view-only document... (this may take a minute)"):
 async def download_viewonly():
+ async with DownloadManager(use_stealth=use_stealth) as dm:
 file_info = {
 'url': f"https://drive.google.com/file/d/{file_id}/view",
 'filename': f"gdrive_{file_id}.pdf",

 if result:
 st.success("Document downloaded successfully!")
+
+ # Provide download button
 with open(result, "rb") as f:
 file_bytes = f.read()
+
 st.download_button(
 label="Download PDF",
 data=file_bytes,
+ file_name=f"gdrive_{file_id}.pdf",
 mime="application/pdf"
 )
 else:

 # Add footer with attribution
 st.markdown('---')
+ st.markdown('Created by [Euler314](https://github.com/euler314)')

 if __name__ == "__main__":
 main()