Update app.py
app.py (changed)
@@ -22,6 +22,8 @@ import datetime
 import traceback
 import base64
 import shutil
 from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
@@ -50,14 +52,44 @@ GOOGLE_OAUTH_CONFIG = {
     }
 }

 # -------------------- Utility Functions --------------------
 def get_random_user_agent():
-    USER_AGENTS = [
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
-        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
-    ]
     return random.choice(USER_AGENTS)

 def sizeof_fmt(num, suffix='B'):
@@ -75,6 +107,42 @@ def create_zip_file(file_paths, output_dir):
         zipf.write(file_path, os.path.basename(file_path))
     return zip_path

 # -------------------- Google Drive Functions --------------------
 def get_google_auth_url():
     client_config = GOOGLE_OAUTH_CONFIG["web"]
@@ -153,7 +221,7 @@ def install_playwright_dependencies():

 # -------------------- Download Manager Class --------------------
 class DownloadManager:
-    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
         self.proxy = proxy
         self.query = query
@@ -162,30 +230,140 @@ class DownloadManager:
         self.browser = None
         self.context = None
         self.page = None

     async def __aenter__(self):
         self.playwright = await async_playwright().start()
         opts = {
             "headless": True,
-            "args":
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-dev-shm-usage',
-                '--disable-gpu',
-                '--no-zygote',
-                '--single-process'
-            ]
         }
         if self.use_proxy and self.proxy:
             opts["proxy"] = {"server": self.proxy}
         self.browser = await self.playwright.chromium.launch(**opts)
         self.page = await self.context.new_page()
         await self.page.set_extra_http_headers({
-            'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',
-            '
         })
         return self

     async def __aexit__(self, exc_type, exc_val, exc_tb):
@@ -194,17 +372,140 @@ class DownloadManager:
         if self.playwright:
             await self.playwright.stop()

     async def search_bing(self):
         urls = []
         try:
             search_url = f"https://www.bing.com/search?q={self.query}"
             await self.page.goto(search_url, timeout=30000)
             await self.page.wait_for_load_state('networkidle')
             links = await self.page.query_selector_all("li.b_algo h2 a")
             for link in links[:self.num_results]:
                 href = await link.get_attribute('href')
                 if href:
                     urls.append(href)
             return urls
         except Exception as e:
             logger.error(f"Error searching Bing: {e}")
@@ -212,6 +513,8 @@ class DownloadManager:

     async def get_file_size(self, url):
         try:
             async with self.context.new_page() as page:
                 response = await page.request.head(url, timeout=15000)
                 length = response.headers.get('Content-Length', None)
@@ -219,11 +522,14 @@ class DownloadManager:
                     return sizeof_fmt(int(length))
                 else:
                     return "Unknown Size"
-        except Exception:
             return "Unknown Size"

     async def get_pdf_metadata(self, url):
         try:
             async with self.context.new_page() as page:
                 resp = await page.request.get(url, timeout=15000)
                 if resp.ok:
@@ -237,11 +543,14 @@ class DownloadManager:
                     }
                 else:
                     return {}
-        except Exception:
             return {}

     async def extract_real_download_url(self, url):
         try:
             async with self.context.new_page() as page:
                 response = await page.goto(url, wait_until='networkidle', timeout=30000)
                 if response and response.headers.get('location'):
@@ -258,8 +567,15 @@ class DownloadManager:
         logger.info(f"Fetching exam links from {url}")
         links = set()

-        headers = {
         try:
             response = requests.get(url, headers=headers, timeout=30)

@@ -274,77 +590,195 @@ class DownloadManager:
                     href = a["href"]
                     full_url = urljoin(url, href)

         except Exception as e:
             logger.warning(f"Request-based extraction failed: {e}")

                 () => {
-                    const
                     const anchors = document.querySelectorAll('a[href]');
                     for (const a of anchors) {
                         if (a.href) {
                                 href: a.href,
-                                text: a.innerText || a.textContent || ''
                             });
                         }
                     }
                 }
             """)

-            # Process extracted links
-            for link_info in
                 href = link_info.get('href', '')
                 text = link_info.get('text', '').lower()

-                if href:
-                    # Check
-                    url_patterns = [

                 if any(pattern in href.lower() for pattern in url_patterns) or \
-                   any(pattern in text for pattern in text_patterns)
                     links.add(href)

             # Check for ASP.NET specific elements that might contain exam links
-            grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
             for grid in grid_elements:
                 grid_links = await grid.query_selector_all('a[href]')
                 for a in grid_links:
                     href = await a.get_attribute('href')
                     if href:
                         full_url = href if href.startswith('http') else urljoin(url, href)
                         links.add(full_url)

-            # Try clicking
-            for button in buttons
                 try:
                     await button.click()
-                    await self.page.wait_for_timeout(
-                    await self.page.wait_for_load_state('networkidle', timeout=

                     # Get any new links that appeared
                     new_links = await self.page.query_selector_all('a[href]')
@@ -352,24 +786,67 @@ class DownloadManager:
                         href = await a.get_attribute('href')
                         if href:
                             full_url = href if href.startswith('http') else urljoin(url, href)
                 except Exception as e:
                     logger.warning(f"Error clicking button: {e}")
         except Exception as e:
-            logger.

         # Filter links to likely contain exam documents
         filtered_links = []
         for link in links:
             # Common file extensions for exam documents
-            if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']):
                 filtered_links.append(link)
                 continue

             # Common paths for exam documents
             if any(pattern in link.lower() for pattern in [
                 "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
-                "/pastpapers/", "/questionpapers/", "/tests/"
             ]):
                 filtered_links.append(link)

@@ -383,6 +860,9 @@ class DownloadManager:
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             # Special handling for educational exam sites
             if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                        ["exam", "test", "pastpaper", "eduexp"]):
@@ -403,6 +883,12 @@ class DownloadManager:
                     except Exception:
                         pass

                     # Get file size
                     size_str = await self.get_file_size(real_url)

@@ -429,14 +915,55 @@ class DownloadManager:
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []

             final_url = self.page.url
             if '.php' in final_url or 'download' in final_url:
                 real_url = await self.extract_real_download_url(final_url)
                 if real_url != final_url:
                     found_files.append({
                         'url': real_url,
-                        'filename':
                         'size': await self.get_file_size(real_url),
                         'metadata': {}
                     })
@@ -549,15 +1076,118 @@ class DownloadManager:
                         'metadata': meta
                     })

             seen_urls = set()
             unique_files = []
             for f in found_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
             return unique_files
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
             return []

     async def download_file(self, file_info, save_dir, referer):
@@ -596,23 +1226,85 @@ class DownloadManager:
             logger.warning("All standard methods failed, attempting force download")
             result_path = await self.force_download_viewonly(file_info, path)
             return result_path if result_path else None

-            # Original code for non-Google Drive downloads
         async with self.context.new_page() as page:
             headers = {
                 'Accept': '*/*',
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Referer': referer
             }
         except Exception as e:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
@@ -642,17 +1334,20 @@ class DownloadManager:

         logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")

-        # Create a dedicated browser instance with better resolution
         browser = await self.playwright.chromium.launch(
             headless=True,
-            args=
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-dev-shm-usage',
-                '--disable-web-security',
-                '--disable-features=IsolateOrigins,site-per-process',
-                '--disable-site-isolation-trials'
-            ]
         )

         # Use higher resolution for better quality
@@ -663,6 +1358,34 @@ class DownloadManager:
             accept_downloads=True  # Critical for the download workflow
         )

         page = await context.new_page()

         try:
@@ -670,7 +1393,14 @@ class DownloadManager:
             logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
             await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
             await page.wait_for_load_state('networkidle')

             # Create temp directory
             temp_dir = tempfile.mkdtemp()
@@ -679,7 +1409,11 @@ class DownloadManager:
             if file_type.lower() == 'pdf':
                 # Use the improved scrolling and detection approach

                 estimated_pages = await page.evaluate("""
                     () => {
                         // Method 1: Check page counter text
@@ -709,14 +1443,13 @@ class DownloadManager:

                 logger.info(f"Estimated {estimated_pages} pages in PDF")

-                logger.info("
-                # Initial scroll to bottom to trigger lazy loading
                 await page.keyboard.press("End")
                 await page.wait_for_timeout(3000)

                 # Scroll page by page to ensure all pages are loaded
                 max_attempts = min(estimated_pages * 3, 300)
                 attempt = 0
                 prev_blob_count = 0
@@ -734,8 +1467,19 @@ class DownloadManager:
                         logger.info("All pages appear to be loaded.")
                         break

                     prev_blob_count = blob_count
                     attempt += 1

@@ -801,6 +1545,72 @@ class DownloadManager:

                 if not result.get('success', False):
                     logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
                     return None

                 logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
@@ -902,6 +1712,37 @@ class DownloadManager:

         # Try standard approaches for non-view-only files
         try:
             # Try with requests and session cookies
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
@@ -944,37 +1785,111 @@ class DownloadManager:
         except Exception as e:
             logger.warning(f"Requests session download failed: {e}")

         return False

     async def download_viewonly_pdf_with_js(self, file_id, save_path):
         """Download view-only PDF using the enhanced blob image caching technique"""
         try:
-            # Create a dedicated browser instance
             browser = await self.playwright.chromium.launch(
                 headless=True,
-                args=
-                    '--no-sandbox',
-                    '--disable-setuid-sandbox',
-                    '--disable-dev-shm-usage',
-                    '--disable-web-security'
-                ]
             )

             context = await browser.new_context(
                 viewport={'width': 1600, 'height': 1200},
                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                accept_downloads=True  # Critical for handling the download event
             )

             page = await context.new_page()

             try:
-                # Step 1: Navigate to the file
                 logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
                 await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
                 await page.wait_for_load_state('networkidle')

                 # Step 2: Estimate the number of pages
                 estimated_pages = await page.evaluate("""
@@ -1007,11 +1922,12 @@ class DownloadManager:
                 await page.keyboard.press("End")
                 await page.wait_for_timeout(3000)

-                # Step 4: Wait for all pages to load
-                logger.info("
-                max_attempts = min(estimated_pages * 3, 300)
                 attempt = 0
                 prev_blob_count = 0

                 while attempt < max_attempts:
                     # Count blob images (which are the PDF pages)
@@ -1023,14 +1939,40 @@ class DownloadManager:

                     logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")

-                    if blob_count >= estimated_pages
-                        logger.info("All pages appear to be loaded.")
                         break

                     prev_blob_count = blob_count
                     attempt += 1

@@ -1050,10 +1992,9 @@ class DownloadManager:
                         try {
                             let pdf = new jsPDF();
                             let imgs = document.getElementsByTagName("img");
-                            let added = 0;
-                            // First collect and sort all valid blob images
                             let validImages = [];
                             for (let i = 0; i < imgs.length; i++) {
                                 let img = imgs[i];
                                 if (!/^blob:/.test(img.src)) continue;
@@ -1061,7 +2002,7 @@ class DownloadManager:
                                 validImages.push(img);
                             }

-                            // Sort by
                             validImages.sort((a, b) => {
                                 const rectA = a.getBoundingClientRect();
                                 const rectB = b.getBoundingClientRect();
@@ -1070,6 +2011,7 @@ class DownloadManager:

                             console.log(`Found ${validImages.length} valid page images to add to PDF`);

                             // Process each image as a page
                             for (let i = 0; i < validImages.length; i++) {
                                 let img = validImages[i];
@@ -1384,6 +2326,9 @@ class DownloadManager:
                 logger.info(f"Found {len(links)} sublinks with specialized method")
                 return list(links)[:limit]

             # Standard sublink extraction for all sites
             await self.page.goto(url, timeout=30000, wait_until='networkidle')

@@ -1392,6 +2337,23 @@ class DownloadManager:
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
             path_base = os.path.dirname(parsed_base.path)

             # Check if page has ASP.NET elements which might need special handling
             is_aspnet = await self.page.evaluate('''
                 () => {
@@ -1514,6 +2476,60 @@ class DownloadManager:
             except Exception as e:
                 logger.warning(f"Error with postback: {e}")

         logger.info(f"Found {len(links)} sublinks")
         return list(links)[:limit]

@@ -1578,6 +2594,19 @@ class DownloadManager:
                         links_set.add(full_url)
                 except Exception:
                     pass

     def resolve_relative_url(self, relative_url, base_url, path_base):
         """Properly resolve relative URLs considering multiple formats"""
@@ -1628,12 +2657,14 @@ class DownloadManager:
             total_links = len(sublinks)
             progress_text.text(f"Found {total_links} sublinks to process")

             if not sublinks:
                 progress_bar.progress(1.0)
-                return

             # Process each sublink
-            all_files = main_files
             for i, sublink in enumerate(sublinks, 1):
                 progress = i / total_links
                 progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
@@ -1703,6 +2734,7 @@ def main():
         sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
         use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")

     with st.expander("Google Drive Integration", expanded=False):
         if st.button("Start Google Sign-In", key="google_signin_btn"):
@@ -1713,6 +2745,37 @@ def main():
             creds, msg = exchange_code_for_credentials(auth_code)
             st.session_state.google_creds = creds
             st.write(msg)

     if mode == "Manual URL":
         st.header("Manual URL Mode")
@@ -1727,16 +2790,20 @@ def main():
             st.warning("Invalid extensions ignored. Use format like '.csv'.")

         @st.cache_resource
-        def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
            async def _run():
-                async with DownloadManager(
                     files = await dm.deep_search(url, ext_list, max_links, timeout_val)
                     return files
             return asyncio.run(_run())

         with st.spinner("Searching for files..."):
             files = run_deep_search(url, valid_ext_list, max_sublinks,

         if files:
             st.session_state.discovered_files = files
@@ -1799,7 +2866,11 @@ def main():
                     progress_bar = st.progress(0)
                     status_text = st.empty()

-                    async with DownloadManager(
                         for i, idx in enumerate(selected_indices):
                             progress = (i + 1) / len(selected_indices)
                             file_info = files[idx]
@@ -1880,7 +2951,13 @@ def main():
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
-                    async with DownloadManager(
                         with st.spinner("Searching..."):
                             urls = await dm.search_bing()
                             if urls:
@@ -1911,16 +2988,20 @@ def main():
                 valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]

                 @st.cache_resource
-                def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
                     async def _run():
-                        async with DownloadManager(
                             files = await dm.deep_search(url, ext_list, max_links, timeout_val)
                             return files
                     return asyncio.run(_run())

                 with st.spinner("Searching for files..."):
                     files = run_deep_search(url, valid_ext_list, max_sublinks,
-                                            sublink_timeout, use_proxy, proxy)

                 if files:
                     st.session_state.discovered_files = files
@@ -1944,7 +3025,7 @@ def main():

             with st.spinner("Downloading view-only document... (this may take a minute)"):
                 async def download_viewonly():
-                    async with DownloadManager() as dm:
                         file_info = {
                             'url': f"https://drive.google.com/file/d/{file_id}/view",
                             'filename': f"gdrive_{file_id}.pdf",
@@ -1957,13 +3038,15 @@ def main():

                 if result:
                     st.success("Document downloaded successfully!")
                     with open(result, "rb") as f:
                         file_bytes = f.read()
                     st.download_button(
                         label="Download PDF",
                         data=file_bytes,
-                        file_name=
                         mime="application/pdf"
                     )
                 else:
@@ -1971,7 +3054,7 @@ def main():

     # Add footer with attribution
     st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/

 if __name__ == "__main__":
     main()
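A minimal usage sketch of the reworked `DownloadManager` (the URL, query, and extension list below are illustrative placeholders; `use_stealth` and `proxy_rotation` are the keyword arguments this commit adds), following the same `async with ... as dm` pattern the Streamlit handlers in this file use:

```python
import asyncio

async def _demo():
    # The manager is an async context manager; options mirror the new __init__ signature.
    async with DownloadManager(use_proxy=False, proxy=None,
                               query="past exam papers", num_results=5,
                               use_stealth=True, proxy_rotation=False) as dm:
        urls = await dm.search_bing()
        files = await dm.deep_search("https://example.edu/exams", ['.pdf', '.docx'], 10, 30)
        return urls, files

asyncio.run(_demo())
```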
 import traceback
 import base64
 import shutil
+import json
+import time
 from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

     }
 }

+# -------------------- Stealth and UA Settings --------------------
+# Extended user agent list for better variety
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
+    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0'
+]
+
+# Stealth browser settings
+STEALTH_SETTINGS = {
+    # Hardware features to modify/disable
+    "hardware_concurrency": 4,
+    "device_memory": 8,
+    # Browser features to enable/disable
+    "webgl_vendor": "Google Inc. (Intel)",
+    "webgl_renderer": "Intel Iris OpenGL Engine",
+    "languages": ["en-US", "en"],
+    "disable_webrtc": True,
+    # Additional timing randomization
+    "navigator_platform": "Win32",
+    "touch_support": False
+}
+
+# Proxy rotation configuration (if using multiple proxies)
+PROXY_ROTATION_CONFIG = {
+    "enabled": False,  # Set to True to enable rotation
+    "rotation_interval": 10,  # Rotate every 10 requests
+    "proxies": []  # Will be populated from the UI if needed
+}
+
 # -------------------- Utility Functions --------------------
 def get_random_user_agent():
     return random.choice(USER_AGENTS)

 def sizeof_fmt(num, suffix='B'):

         zipf.write(file_path, os.path.basename(file_path))
     return zip_path

+def get_file_extension(url, default='.pdf'):
+    """Extract file extension from URL or filename"""
+    path = urlparse(url).path
+    ext = os.path.splitext(path)[1].lower()
+    if not ext:
+        return default
+    return ext
+
+def humanize_file_size(size_bytes):
+    """Format file size in human-readable format"""
+    if size_bytes < 1024:
+        return f"{size_bytes} bytes"
+    for unit in ['KB', 'MB', 'GB', 'TB']:
+        size_bytes /= 1024.0
+        if size_bytes < 1024.0:
+            return f"{size_bytes:.1f} {unit}"
+    return f"{size_bytes:.1f} PB"
+
+def get_domain(url):
+    """Extract domain from URL"""
+    parsed = urlparse(url)
+    return parsed.netloc
+
+def is_valid_file_url(url, extensions):
+    """Check if URL is a valid file URL based on extension"""
+    return any(url.lower().endswith(ext) for ext in extensions)
+
+def detect_captcha(html_content):
+    """Detect common captcha patterns in HTML content"""
+    captcha_patterns = [
+        'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
+        'challenge', 'solve the following', 'verify you are human'
+    ]
+    html_lower = html_content.lower()
+    return any(pattern in html_lower for pattern in captcha_patterns)
+
146 |
# -------------------- Google Drive Functions --------------------
|
147 |
def get_google_auth_url():
|
148 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
|
|
221 |
|
222 |
# -------------------- Download Manager Class --------------------
|
223 |
class DownloadManager:
|
224 |
+
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False):
|
225 |
self.use_proxy = use_proxy
|
226 |
self.proxy = proxy
|
227 |
self.query = query
|
|
|
230 |
self.browser = None
|
231 |
self.context = None
|
232 |
self.page = None
|
233 |
+
self.use_stealth = use_stealth
|
234 |
+
self.proxy_rotation = proxy_rotation
|
235 |
+
self.request_count = 0
|
236 |
+
self.captcha_detected = False
|
237 |
+
self.download_timeout = 300 # 5 minutes timeout for downloads
|
238 |
|
239 |
async def __aenter__(self):
|
240 |
self.playwright = await async_playwright().start()
|
241 |
+
|
242 |
+
# Prepare browser args with stealth settings
|
243 |
+
browser_args = [
|
244 |
+
'--no-sandbox',
|
245 |
+
'--disable-setuid-sandbox',
|
246 |
+
'--disable-dev-shm-usage',
|
247 |
+
'--disable-gpu',
|
248 |
+
'--no-zygote',
|
249 |
+
'--single-process',
|
250 |
+
'--disable-web-security',
|
251 |
+
'--disable-features=IsolateOrigins',
|
252 |
+
'--disable-site-isolation-trials'
|
253 |
+
]
|
254 |
+
|
255 |
+
# Add stealth-specific args
|
256 |
+
if self.use_stealth:
|
257 |
+
browser_args.extend([
|
258 |
+
'--disable-blink-features=AutomationControlled',
|
259 |
+
'--disable-features=IsolateOrigins,site-per-process',
|
260 |
+
'--disable-webgl',
|
261 |
+
'--disable-webrtc'
|
262 |
+
])
|
263 |
+
|
264 |
+
# Setup browser options
|
265 |
opts = {
|
266 |
"headless": True,
|
267 |
+
"args": browser_args
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
}
|
269 |
+
|
270 |
+
# Configure proxy if specified
|
271 |
if self.use_proxy and self.proxy:
|
272 |
opts["proxy"] = {"server": self.proxy}
|
273 |
+
|
274 |
+
# Launch browser with options
|
275 |
self.browser = await self.playwright.chromium.launch(**opts)
|
276 |
+
|
277 |
+
# Setup browser context with enhanced settings
|
278 |
+
context_opts = {
|
279 |
+
"user_agent": get_random_user_agent(),
|
280 |
+
"viewport": {"width": 1920, "height": 1080},
|
281 |
+
"device_scale_factor": 1,
|
282 |
+
"has_touch": False,
|
283 |
+
"is_mobile": False,
|
284 |
+
"ignore_https_errors": True,
|
285 |
+
"accept_downloads": True
|
286 |
+
}
|
287 |
+
|
288 |
+
# Apply stealth-specific settings to the context
|
289 |
+
if self.use_stealth:
|
290 |
+
# Apply JS-injection for enhanced stealth
|
291 |
+
context_opts["bypass_csp"] = True
|
292 |
+
self.context = await self.browser.new_context(**context_opts)
|
293 |
+
|
294 |
+
# Execute stealth JS to avoid detection
|
295 |
+
await self.context.add_init_script("""
|
296 |
+
() => {
|
297 |
+
Object.defineProperty(navigator, 'webdriver', {
|
298 |
+
get: () => false,
|
299 |
+
});
|
300 |
+
|
301 |
+
// Change navigator properties
|
302 |
+
const newProto = navigator.__proto__;
|
303 |
+
delete newProto.webdriver;
|
304 |
+
|
305 |
+
// Overwrite the plugins
|
306 |
+
Object.defineProperty(navigator, 'plugins', {
|
307 |
+
get: () => [1, 2, 3, 4, 5].map(() => ({
|
308 |
+
lengthComputable: true,
|
309 |
+
loaded: 100,
|
310 |
+
total: 100
|
311 |
+
}))
|
312 |
+
});
|
313 |
+
|
314 |
+
// Handle languages more naturally
|
315 |
+
Object.defineProperty(navigator, 'languages', {
|
316 |
+
get: () => ['en-US', 'en', 'es']
|
317 |
+
});
|
318 |
+
|
319 |
+
// Modify hardware concurrency
|
320 |
+
Object.defineProperty(navigator, 'hardwareConcurrency', {
|
321 |
+
get: () => 4
|
322 |
+
});
|
323 |
+
|
324 |
+
// Modify deviceMemory
|
325 |
+
Object.defineProperty(navigator, 'deviceMemory', {
|
326 |
+
get: () => 8
|
327 |
+
});
|
328 |
+
|
329 |
+
// WebGL modifications
|
330 |
+
const getParameter = WebGLRenderingContext.prototype.getParameter;
|
331 |
+
WebGLRenderingContext.prototype.getParameter = function(parameter) {
|
332 |
+
if (parameter === 37445) {
|
333 |
+
return 'Intel Inc.';
|
334 |
+
}
|
335 |
+
if (parameter === 37446) {
|
336 |
+
return 'Intel Iris OpenGL Engine';
|
337 |
+
}
|
338 |
+
return getParameter.apply(this, arguments);
|
339 |
+
};
|
340 |
+
}
|
341 |
+
""")
|
342 |
+
else:
|
343 |
+
# Regular context without stealth
|
344 |
+
self.context = await self.browser.new_context(**context_opts)
|
345 |
+
|
346 |
+
# Create page with enhanced headers
|
347 |
self.page = await self.context.new_page()
|
348 |
await self.page.set_extra_http_headers({
|
349 |
+
'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
|
350 |
'Accept-Encoding': 'gzip, deflate, br',
|
351 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
352 |
+
'Cache-Control': 'max-age=0',
|
353 |
+
'DNT': '1', # Do Not Track
|
354 |
+
'Referer': 'https://www.google.com/',
|
355 |
+
'Sec-Fetch-Dest': 'document',
|
356 |
+
'Sec-Fetch-Mode': 'navigate',
|
357 |
+
'Sec-Fetch-Site': 'cross-site',
|
358 |
+
'Sec-Fetch-User': '?1',
|
359 |
+
'Upgrade-Insecure-Requests': '1'
|
360 |
})
|
361 |
+
|
362 |
+
# Add delay for mouse movements to simulate human behavior
|
363 |
+
if self.use_stealth:
|
364 |
+
await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500))
|
365 |
+
await self.page.wait_for_timeout(random.randint(200, 500))
|
366 |
+
|
367 |
return self
|
368 |
|
369 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
|
372 |
if self.playwright:
|
373 |
await self.playwright.stop()
|
374 |
|
375 |
+
async def rotate_proxy_if_needed(self):
|
376 |
+
"""Rotate proxy if proxy rotation is enabled and threshold is reached"""
|
377 |
+
if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]:
|
378 |
+
self.request_count += 1
|
379 |
+
if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]:
|
380 |
+
# Get next proxy from the pool
|
381 |
+
next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0)
|
382 |
+
PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list
|
383 |
+
|
384 |
+
# Close existing context and create new one with the new proxy
|
385 |
+
if self.context:
|
386 |
+
await self.context.close()
|
387 |
+
|
388 |
+
# Create new context with the new proxy
|
389 |
+
context_opts = {
|
390 |
+
"user_agent": get_random_user_agent(),
|
391 |
+
"proxy": {"server": next_proxy},
|
392 |
+
"accept_downloads": True
|
393 |
+
}
|
394 |
+
self.context = await self.browser.new_context(**context_opts)
|
395 |
+
self.page = await self.context.new_page()
|
396 |
+
|
397 |
+
# Reset counter
|
398 |
+
self.request_count = 0
|
399 |
+
logger.info(f"Rotated to new proxy: {next_proxy}")
|
400 |
+
|
401 |
+
async def handle_captcha(self, page):
|
402 |
+
"""Detect and handle captchas if possible"""
|
403 |
+
# Check for common captcha patterns
|
404 |
+
content = await page.content()
|
405 |
+
if detect_captcha(content):
|
406 |
+
self.captcha_detected = True
|
407 |
+
logger.warning("Captcha detected on page")
|
408 |
+
|
409 |
+
# Strategies for handling captchas:
|
410 |
+
# 1. For simple captchas, try to extract the image and solve it
|
411 |
+
captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]')
|
412 |
+
if captcha_img:
|
413 |
+
logger.info("Found captcha image, attempting to capture")
|
414 |
+
|
415 |
+
# Take screenshot of the captcha
|
416 |
+
captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png")
|
417 |
+
await captcha_img.screenshot(path=captcha_path)
|
418 |
+
|
419 |
+
# In a real implementation, you would send this to a captcha solving service
|
420 |
+
# For now, just log the detection
|
421 |
+
logger.info(f"Captcha image saved to {captcha_path}")
|
422 |
+
|
423 |
+
# For demonstration, we'll notify the user but not actually solve it
|
424 |
+
return False
|
425 |
+
|
426 |
+
# 2. For reCAPTCHA, special handling would be required
|
427 |
+
recaptcha = await page.query_selector('iframe[src*="recaptcha"]')
|
428 |
+
if recaptcha:
|
429 |
+
logger.warning("reCAPTCHA detected, would require external solving service")
|
430 |
+
return False
|
431 |
+
|
432 |
+
# 3. Try to perform human-like actions that might bypass simple bot checks
|
433 |
+
await self.perform_human_actions(page)
|
434 |
+
|
435 |
+
# Check if captcha is still present
|
436 |
+
content = await page.content()
|
437 |
+
if detect_captcha(content):
|
438 |
+
logger.warning("Captcha still present after human-like actions")
|
439 |
+
return False
|
440 |
+
else:
|
441 |
+
logger.info("Captcha appears to be resolved")
|
442 |
+
return True
|
443 |
+
|
444 |
+
return True # No captcha detected
|
445 |
+
|
446 |
+
async def perform_human_actions(self, page):
|
447 |
+
"""Perform human-like actions on the page to possibly bypass simple bot checks"""
|
448 |
+
try:
|
449 |
+
# 1. Slowly scroll down the page
|
450 |
+
for i in range(3):
|
451 |
+
await page.evaluate(f"window.scrollTo(0, {i * 300})")
|
452 |
+
await page.wait_for_timeout(random.randint(300, 700))
|
453 |
+
|
454 |
+
# 2. Random mouse movements
|
455 |
+
for _ in range(3):
|
456 |
+
x = random.randint(100, 800)
|
457 |
+
y = random.randint(100, 600)
|
458 |
+
await page.mouse.move(x=x, y=y)
|
459 |
+
await page.wait_for_timeout(random.randint(200, 500))
|
460 |
+
|
461 |
+
# 3. Click on a non-essential part of the page
|
462 |
+
try:
|
463 |
+
await page.click("body", position={"x": 50, "y": 50})
|
464 |
+
except:
|
465 |
+
pass
|
466 |
+
|
467 |
+
# 4. Wait a bit before continuing
|
468 |
+
await page.wait_for_timeout(1000)
|
469 |
+
|
470 |
+
except Exception as e:
|
471 |
+
logger.warning(f"Error during human-like actions: {e}")
|
472 |
+
|
473 |
async def search_bing(self):
|
474 |
urls = []
|
475 |
try:
|
476 |
+
# Rotate proxy if needed
|
477 |
+
await self.rotate_proxy_if_needed()
|
478 |
+
|
479 |
search_url = f"https://www.bing.com/search?q={self.query}"
|
480 |
await self.page.goto(search_url, timeout=30000)
|
481 |
await self.page.wait_for_load_state('networkidle')
|
482 |
+
|
483 |
+
# Check for captchas
|
484 |
+
if not await self.handle_captcha(self.page):
|
485 |
+
logger.warning("Captcha detected during search, results may be limited")
|
486 |
+
|
487 |
+
# More natural scrolling behavior
|
488 |
+
for i in range(3):
|
489 |
+
await self.page.evaluate(f"window.scrollTo(0, {i * 400})")
|
490 |
+
await self.page.wait_for_timeout(random.randint(300, 800))
|
491 |
+
|
492 |
+
# Extract search results
|
493 |
links = await self.page.query_selector_all("li.b_algo h2 a")
|
494 |
for link in links[:self.num_results]:
|
495 |
href = await link.get_attribute('href')
|
496 |
if href:
|
497 |
urls.append(href)
|
498 |
+
|
499 |
+
# If we didn't find enough results, try an alternative selector
|
500 |
+
if len(urls) < self.num_results:
|
501 |
+
alt_links = await self.page.query_selector_all(".b_caption a")
|
502 |
+
for link in alt_links:
|
503 |
+
href = await link.get_attribute('href')
|
504 |
+
if href and href not in urls:
|
505 |
+
urls.append(href)
|
506 |
+
if len(urls) >= self.num_results:
|
507 |
+
break
|
508 |
+
|
509 |
return urls
|
510 |
except Exception as e:
|
511 |
logger.error(f"Error searching Bing: {e}")
|
|
|
513 |
|
514 |
async def get_file_size(self, url):
|
515 |
try:
|
516 |
+
await self.rotate_proxy_if_needed()
|
517 |
+
|
518 |
async with self.context.new_page() as page:
|
519 |
response = await page.request.head(url, timeout=15000)
|
520 |
length = response.headers.get('Content-Length', None)
|
|
|
522 |
return sizeof_fmt(int(length))
|
523 |
else:
|
524 |
return "Unknown Size"
|
525 |
+
except Exception as e:
|
526 |
+
logger.warning(f"Error getting file size: {e}")
|
527 |
return "Unknown Size"
|
528 |
|
529 |
async def get_pdf_metadata(self, url):
|
530 |
try:
|
531 |
+
await self.rotate_proxy_if_needed()
|
532 |
+
|
533 |
async with self.context.new_page() as page:
|
534 |
resp = await page.request.get(url, timeout=15000)
|
535 |
if resp.ok:
|
|
|
543 |
}
|
544 |
else:
|
545 |
return {}
|
546 |
+
except Exception as e:
|
547 |
+
logger.warning(f"Error reading PDF metadata: {e}")
|
548 |
return {}
|
549 |
|
550 |
async def extract_real_download_url(self, url):
|
551 |
try:
|
552 |
+
await self.rotate_proxy_if_needed()
|
553 |
+
|
554 |
async with self.context.new_page() as page:
|
555 |
response = await page.goto(url, wait_until='networkidle', timeout=30000)
|
556 |
if response and response.headers.get('location'):
|
|
|
567 |
logger.info(f"Fetching exam links from {url}")
|
568 |
links = set()
|
569 |
|
570 |
+
# First try with direct requests for speed (but with proper headers)
|
571 |
+
headers = {
|
572 |
+
"User-Agent": get_random_user_agent(),
|
573 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
574 |
+
"Accept-Language": "en-US,en;q=0.9",
|
575 |
+
"Referer": "https://www.google.com/",
|
576 |
+
"DNT": "1"
|
577 |
+
}
|
578 |
+
|
579 |
try:
|
580 |
response = requests.get(url, headers=headers, timeout=30)
|
581 |
|
|
|
590 |
href = a["href"]
|
591 |
full_url = urljoin(url, href)
|
592 |
|
593 |
+
# Look for text clues
|
594 |
+
link_text = a.get_text().lower()
|
595 |
+
|
596 |
+
# Special patterns for exam sites (expanded list)
|
597 |
+
url_patterns = [
|
598 |
+
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
599 |
+
"/test/", "/download/", "/files/", "/assignments/",
|
600 |
+
"paper_", "question_", "exam_", "test_", "past_",
|
601 |
+
"assignment_", "sample_", "study_material", "notes_",
|
602 |
+
"/resource/", "/subject/", "/course/", "/material/"
|
603 |
+
]
|
604 |
+
|
605 |
+
text_patterns = [
|
606 |
+
"exam", "paper", "test", "question", "past", "download",
|
607 |
+
"assignment", "sample", "study", "material", "notes",
|
608 |
+
"subject", "course", "resource", "pdf", "document",
|
609 |
+
"view", "open", "get", "solution", "answer"
|
610 |
+
]
|
611 |
+
|
612 |
+
# Check URL for patterns
|
613 |
+
if any(pattern in full_url.lower() for pattern in url_patterns):
|
614 |
+
links.add(full_url)
|
615 |
+
continue
|
616 |
+
|
617 |
+
# Check link text for patterns
|
618 |
+
if any(pattern in link_text for pattern in text_patterns):
|
619 |
+
links.add(full_url)
|
620 |
+
continue
|
621 |
+
|
622 |
+
# Check for common file extensions
|
623 |
+
if any(full_url.lower().endswith(ext) for ext in
|
624 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
625 |
+
links.add(full_url)
|
626 |
except Exception as e:
|
627 |
logger.warning(f"Request-based extraction failed: {e}")
|
628 |
|
629 |
+
# Browser-based approach for more thorough extraction or if initial approach was inadequate
|
630 |
+
try:
|
631 |
+
# Check if we need to proceed with browser-based extraction
|
632 |
+
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
|
633 |
+
logger.info("Using browser for enhanced link extraction")
|
634 |
+
|
635 |
+
# Rotate proxy if needed
|
636 |
+
await self.rotate_proxy_if_needed()
|
637 |
|
638 |
+
# Navigate to the page with more natural timing
|
639 |
+
await self.page.goto(url, timeout=45000, wait_until='networkidle')
|
640 |
+
await self.page.wait_for_timeout(random.randint(1000, 2000))
|
641 |
+
|
642 |
+
# Handle captchas if present
|
643 |
+
if not await self.handle_captcha(self.page):
|
644 |
+
logger.warning("Captcha detected, extraction may be limited")
|
645 |
+
|
646 |
+
# Get base URL for resolving relative links
|
647 |
+
parsed_base = urlparse(url)
|
648 |
+
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
649 |
+
|
650 |
+
# Perform natural scrolling to trigger lazy-loaded content
|
651 |
+
page_height = await self.page.evaluate("document.body.scrollHeight")
|
652 |
+
viewport_height = await self.page.evaluate("window.innerHeight")
|
653 |
+
|
654 |
+
for scroll_pos in range(0, page_height, viewport_height // 2):
|
655 |
+
await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})")
|
656 |
+
await self.page.wait_for_timeout(random.randint(300, 800))
|
657 |
+
|
658 |
+
# Scroll back to top
|
659 |
+
await self.page.evaluate("window.scrollTo(0, 0)")
|
660 |
+
await self.page.wait_for_timeout(500)
|
661 |
+
|
662 |
+
# Extract all links with Playwright (better than just anchor tags)
|
663 |
+
all_links = await self.page.evaluate("""
|
664 |
() => {
|
665 |
+
const results = [];
|
666 |
+
|
667 |
+
// Get all anchor tags
|
668 |
const anchors = document.querySelectorAll('a[href]');
|
669 |
for (const a of anchors) {
|
670 |
if (a.href) {
|
671 |
+
results.push({
|
672 |
href: a.href,
|
673 |
+
text: a.innerText || a.textContent || '',
|
674 |
+
isButton: a.classList.contains('btn') || a.role === 'button'
|
675 |
});
|
676 |
}
|
677 |
}
|
678 |
+
|
679 |
+
// Get buttons that might contain links
|
680 |
+
const buttons = document.querySelectorAll('button');
|
681 |
+
for (const btn of buttons) {
|
682 |
+
const onclick = btn.getAttribute('onclick') || '';
|
683 |
+
if (onclick.includes('window.location') || onclick.includes('download')) {
|
684 |
+
results.push({
|
685 |
+
href: '#button',
|
686 |
+
text: btn.innerText || btn.textContent || '',
|
687 |
+
isButton: true,
|
688 |
+
onclick: onclick
|
689 |
+
});
|
690 |
+
}
|
691 |
+
}
|
692 |
+
|
693 |
+
return results;
|
694 |
}
|
695 |
""")
|
696 |
|
697 |
+
# Process the extracted links
|
698 |
+
for link_info in all_links:
|
699 |
href = link_info.get('href', '')
|
700 |
text = link_info.get('text', '').lower()
|
701 |
|
702 |
+
if href and href != '#button':
|
703 |
+
# Check URL patterns
|
704 |
+
url_patterns = [
|
705 |
+
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
|
706 |
+
"/test/", "/download/", "/files/", "/assignments/",
|
707 |
+
"paper_", "question_", "exam_", "test_", "past_",
|
708 |
+
"assignment_", "sample_", "study_material", "notes_"
|
709 |
+
]
|
710 |
|
711 |
+
# Check text patterns
|
712 |
+
text_patterns = [
|
713 |
+
"exam", "paper", "test", "question", "past", "download",
|
714 |
+
"assignment", "sample", "study", "material", "notes",
|
715 |
+
"pdf", "document", "view", "open", "solution"
|
716 |
+
]
|
717 |
|
718 |
if any(pattern in href.lower() for pattern in url_patterns) or \
|
719 |
+
any(pattern in text for pattern in text_patterns) or \
|
720 |
+
any(href.lower().endswith(ext) for ext in
|
721 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
722 |
links.add(href)
|
723 |
|
724 |
# Check for ASP.NET specific elements that might contain exam links
|
725 |
+
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
|
726 |
for grid in grid_elements:
|
727 |
grid_links = await grid.query_selector_all('a[href]')
|
728 |
for a in grid_links:
|
729 |
href = await a.get_attribute('href')
|
730 |
+
text = await a.text_content()
|
731 |
+
|
732 |
if href:
|
733 |
full_url = href if href.startswith('http') else urljoin(url, href)
|
734 |
links.add(full_url)
|
735 |
|
736 |
+
# Try clicking pagination controls to reveal more content
|
737 |
+
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a')
|
738 |
+
for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons
|
739 |
+
try:
|
740 |
+
# Check if this is a numeric pagination button (more likely to be useful)
|
741 |
+
button_text = await button.text_content()
|
742 |
+
if button_text and button_text.strip().isdigit():
|
743 |
+
logger.info(f"Clicking pagination button: {button_text}")
|
744 |
+
await button.click()
|
745 |
+
await self.page.wait_for_timeout(2000)
|
746 |
+
await self.page.wait_for_load_state('networkidle', timeout=10000)
|
747 |
+
|
748 |
+
# Extract links from this page
|
749 |
+
new_page_links = await self.page.evaluate("""
|
750 |
+
() => {
|
751 |
+
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
|
752 |
+
}
|
753 |
+
""")
|
754 |
+
|
755 |
+
for href in new_page_links:
|
756 |
+
if href and not href.startswith('javascript:'):
|
757 |
+
if any(pattern in href.lower() for pattern in url_patterns) or \
|
758 |
+
any(href.lower().endswith(ext) for ext in
|
759 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
760 |
+
links.add(href)
|
761 |
+
except Exception as e:
|
762 |
+
logger.warning(f"Error clicking pagination button: {e}")
|
763 |
+
|
764 |
+
# Try clicking any controls that might reveal more exam links (more focused approach)
|
765 |
+
show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn')
|
766 |
+
for button in show_buttons:
|
767 |
+
button_text = (await button.text_content() or "").lower()
|
768 |
+
button_value = (await button.get_attribute("value") or "").lower()
|
769 |
+
button_id = (await button.get_attribute("id") or "").lower()
|
770 |
+
|
771 |
+
# Look for buttons that seem likely to reveal file lists
|
772 |
+
promising_terms = ["show", "view", "display", "list", "exam", "paper", "test",
|
773 |
+
"download", "resource", "material", "browse", "file"]
|
774 |
+
|
775 |
+
if any(term in button_text or term in button_value or term in button_id
|
776 |
+
for term in promising_terms):
|
777 |
try:
|
778 |
+
logger.info(f"Clicking button: {button_text or button_value}")
|
779 |
await button.click()
|
780 |
+
await self.page.wait_for_timeout(2000)
|
781 |
+
await self.page.wait_for_load_state('networkidle', timeout=10000)
|
782 |
|
783 |
# Get any new links that appeared
|
784 |
new_links = await self.page.query_selector_all('a[href]')
|
|
|
786 |
href = await a.get_attribute('href')
|
787 |
if href:
|
788 |
full_url = href if href.startswith('http') else urljoin(url, href)
|
789 |
+
|
790 |
+
# Focus on file extensions and patterns
|
791 |
+
if any(full_url.lower().endswith(ext) for ext in
|
792 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \
|
793 |
+
any(pattern in full_url.lower() for pattern in url_patterns):
|
794 |
+
links.add(full_url)
|
795 |
except Exception as e:
|
796 |
logger.warning(f"Error clicking button: {e}")
|
797 |
+
|
798 |
+
# Special handling for ASP.NET PostBack links
|
799 |
+
try:
|
800 |
+
# Find and interact with ASP.NET __doPostBack elements
|
801 |
+
postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]')
|
802 |
+
for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks
|
803 |
+
try:
|
804 |
+
onclick = await element.get_attribute('onclick')
|
805 |
+
if onclick and '__doPostBack' in onclick:
|
806 |
+
element_text = await element.text_content()
|
807 |
+
|
808 |
+
# Only interact with elements that seem likely to contain exam links
|
809 |
+
promising_terms = ["show", "view", "list", "exam", "paper", "test",
|
810 |
+
"download", "resource", "material"]
|
811 |
+
|
812 |
+
if any(term in element_text.lower() for term in promising_terms):
|
813 |
+
logger.info(f"Clicking ASP.NET postback element: {element_text}")
|
814 |
+
|
815 |
+
# Click the element
|
816 |
+
await element.click()
|
817 |
+
await self.page.wait_for_timeout(2000)
|
818 |
+
await self.page.wait_for_load_state('networkidle', timeout=10000)
|
819 |
+
|
820 |
+
# Extract any new links
|
821 |
+
new_links = await self.page.query_selector_all('a[href]')
|
822 |
+
for a in new_links:
|
823 |
+
href = await a.get_attribute('href')
|
824 |
+
if href:
|
825 |
+
full_url = href if href.startswith('http') else urljoin(url, href)
|
826 |
+
if any(full_url.lower().endswith(ext) for ext in
|
827 |
+
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
828 |
+
links.add(full_url)
|
829 |
+
except Exception as e:
|
830 |
+
logger.warning(f"Error interacting with postback element: {e}")
|
831 |
except Exception as e:
|
832 |
+
logger.warning(f"Error during postback handling: {e}")
|
833 |
+
|
834 |
+
except Exception as e:
|
835 |
+
logger.error(f"Browser-based extraction failed: {e}")
|
836 |
|
837 |
# Filter links to likely contain exam documents
|
838 |
filtered_links = []
|
839 |
for link in links:
|
840 |
# Common file extensions for exam documents
|
841 |
+
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
842 |
filtered_links.append(link)
|
843 |
continue
|
844 |
|
845 |
# Common paths for exam documents
|
846 |
if any(pattern in link.lower() for pattern in [
|
847 |
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
|
848 |
+
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
|
849 |
+
"/resource/", "/material/", "/notes/", "/subjectmaterial/"
|
850 |
]):
|
851 |
filtered_links.append(link)
|
852 |
|
|
|
860 |
async def extract_downloadable_files(self, url, custom_ext_list):
|
861 |
found_files = []
|
862 |
try:
|
863 |
+
# Rotate proxy if needed
|
864 |
+
await self.rotate_proxy_if_needed()
|
865 |
+
|
866 |
# Special handling for educational exam sites
|
867 |
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
|
868 |
["exam", "test", "pastpaper", "eduexp"]):
|
|
|
883 |
except Exception:
|
884 |
pass
|
885 |
|
886 |
+
# If filename is empty or invalid, create a sensible one
|
887 |
+
if not filename or filename == '/':
|
888 |
+
domain = get_domain(real_url)
|
889 |
+
ext = get_file_extension(real_url, '.pdf')
|
890 |
+
filename = f"file_from_{domain}{ext}"
|
891 |
+
|
892 |
# Get file size
|
893 |
size_str = await self.get_file_size(real_url)
|
894 |
|
|
|
915 |           response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
916 |           if not response:
917 |               return []
918 | +
919 | +         # Check for captchas
920 | +         if not await self.handle_captcha(self.page):
921 | +             logger.warning("Captcha detected, file extraction may be limited")
922 | +
923 | +         # Scroll through the page naturally to trigger lazy loading
924 | +         await self.page.evaluate("""
925 | +             (async () => {
926 | +                 const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
927 | +                 const height = document.body.scrollHeight;
928 | +                 const scrollStep = Math.floor(window.innerHeight / 2);
929 | +
930 | +                 for (let i = 0; i < height; i += scrollStep) {
931 | +                     window.scrollTo(0, i);
932 | +                     await delay(100);
933 | +                 }
934 | +
935 | +                 window.scrollTo(0, 0);
936 | +             })()
937 | +         """)
938 | +         await self.page.wait_for_timeout(1000)
939 |
940 |           final_url = self.page.url
941 |           if '.php' in final_url or 'download' in final_url:
942 |               real_url = await self.extract_real_download_url(final_url)
943 |               if real_url != final_url:
944 | +                 # Try to detect the filename from headers or URL
945 | +                 response = await self.page.request.head(real_url, timeout=15000)
946 | +                 filename = None
947 | +
948 | +                 # Try to get from Content-Disposition header
949 | +                 content_disposition = response.headers.get('Content-Disposition', '')
950 | +                 if 'filename=' in content_disposition:
951 | +                     filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition)
952 | +                     if filename_match:
953 | +                         filename = filename_match.group(1)
954 | +
955 | +                 # If not found in headers, use URL basename
956 | +                 if not filename:
957 | +                     filename = os.path.basename(urlparse(real_url).path)
958 | +                     if not filename or filename == '/':
959 | +                         # Generate a name based on domain
960 | +                         domain = get_domain(real_url)
961 | +                         ext = get_file_extension(real_url, '.pdf')
962 | +                         filename = f"file_from_{domain}{ext}"
963 | +
964 |                   found_files.append({
965 |                       'url': real_url,
966 | +                     'filename': filename,
967 |                       'size': await self.get_file_size(real_url),
968 |                       'metadata': {}
969 |                   })
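The regex above only matches quoted filename= values; a header like attachment; filename=exam.pdf slips through to the URL-basename fallback. If broader parsing is ever wanted, the standard library already handles both forms (a hedged alternative, not what the diff uses):

from email.message import Message

def filename_from_content_disposition(header_value):
    """Return the filename parameter from a Content-Disposition header, quoted or not."""
    msg = Message()
    msg['Content-Disposition'] = header_value
    return msg.get_filename()  # None when no filename parameter is present

print(filename_from_content_disposition('attachment; filename="past_exam.pdf"'))  # past_exam.pdf
print(filename_from_content_disposition('attachment; filename=past_exam.pdf'))    # past_exam.pdf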
1076 |                       'metadata': meta
1077 |                   })
1078 |
1079 | +             # Also check for data-src and data-url attributes (common in lazy-loaded sites)
1080 | +             data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]')
1081 | +             for elem in data_elements:
1082 | +                 for attr in ['data-src', 'data-url', 'data-href', 'data-download']:
1083 | +                     try:
1084 | +                         value = await elem.get_attribute(attr)
1085 | +                         if value and any(value.lower().endswith(ext) for ext in all_exts):
1086 | +                             file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
1087 | +                             found_files.append({
1088 | +                                 'url': file_url,
1089 | +                                 'filename': os.path.basename(file_url.split('?')[0]),
1090 | +                                 'size': await self.get_file_size(file_url),
1091 | +                                 'metadata': {}
1092 | +                             })
1093 | +                     except:
1094 | +                         pass
1095 | +
1096 | +             # Check script tags for JSON data that might contain file URLs
1097 | +             script_elements = soup.find_all('script', type='application/json')
1098 | +             for script in script_elements:
1099 | +                 try:
1100 | +                     json_data = json.loads(script.string)
1101 | +                     # Look for URL patterns in the JSON data
1102 | +                     def extract_urls_from_json(obj, urls_found=None):
1103 | +                         if urls_found is None:
1104 | +                             urls_found = []
1105 | +                         if isinstance(obj, dict):
1106 | +                             for k, v in obj.items():
1107 | +                                 # Check if any key contains url-like terms
1108 | +                                 url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download']
1109 | +                                 if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'):
1110 | +                                     urls_found.append(v)
1111 | +                                 else:
1112 | +                                     extract_urls_from_json(v, urls_found)
1113 | +                         elif isinstance(obj, list):
1114 | +                             for item in obj:
1115 | +                                 extract_urls_from_json(item, urls_found)
1116 | +                         return urls_found
1117 | +
1118 | +                     json_urls = extract_urls_from_json(json_data)
1119 | +                     for json_url in json_urls:
1120 | +                         if any(json_url.lower().endswith(ext) for ext in all_exts):
1121 | +                             found_files.append({
1122 | +                                 'url': json_url,
1123 | +                                 'filename': os.path.basename(json_url.split('?')[0]),
1124 | +                                 'size': await self.get_file_size(json_url),
1125 | +                                 'metadata': {}
1126 | +                             })
1127 | +                 except:
1128 | +                     pass
1129 | +
1130 | +             # Check for hidden download buttons or forms
1131 | +             hidden_elements = await self.page.evaluate("""
1132 | +                 () => {
1133 | +                     const results = [];
1134 | +
1135 | +                     // Check for hidden forms with download actions
1136 | +                     const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]');
1137 | +                     for (const form of forms) {
1138 | +                         const action = form.getAttribute('action') || '';
1139 | +                         results.push({
1140 | +                             type: 'form',
1141 | +                             action: action,
1142 | +                             inputs: Array.from(form.querySelectorAll('input[name]')).map(input => {
1143 | +                                 return {name: input.name, value: input.value};
1144 | +                             })
1145 | +                         });
1146 | +                     }
1147 | +
1148 | +                     // Check for hidden download links/buttons
1149 | +                     const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => {
1150 | +                         const style = window.getComputedStyle(a);
1151 | +                         return (style.display === 'none' || style.visibility === 'hidden') &&
1152 | +                                (a.href.includes('download') || a.href.includes('file'));
1153 | +                     });
1154 | +
1155 | +                     for (const link of hiddenLinks) {
1156 | +                         results.push({
1157 | +                             type: 'link',
1158 | +                             href: link.href,
1159 | +                             text: link.innerText || link.textContent
1160 | +                         });
1161 | +                     }
1162 | +
1163 | +                     return results;
1164 | +                 }
1165 | +             """)
1166 | +
1167 | +             # Process hidden elements
1168 | +             for elem in hidden_elements:
1169 | +                 if elem['type'] == 'link' and 'href' in elem:
1170 | +                     href = elem['href']
1171 | +                     if any(href.lower().endswith(ext) for ext in all_exts):
1172 | +                         found_files.append({
1173 | +                             'url': href,
1174 | +                             'filename': os.path.basename(href.split('?')[0]),
1175 | +                             'size': await self.get_file_size(href),
1176 | +                             'metadata': {}
1177 | +                         })
1178 | +
1179 | +             # Deduplicate files by URL
1180 |               seen_urls = set()
1181 |               unique_files = []
1182 |               for f in found_files:
1183 |                   if f['url'] not in seen_urls:
1184 |                       seen_urls.add(f['url'])
1185 |                       unique_files.append(f)
1186 | +
1187 |               return unique_files
1188 |           except Exception as e:
1189 |               logger.error(f"Error extracting files from {url}: {e}")
1190 | +             traceback.print_exc()
1191 |               return []
1192 |
1193 |       async def download_file(self, file_info, save_dir, referer):
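The recursive JSON walk above is easiest to see with a concrete payload; this self-contained rerun of the same idea (illustrative input, simplified function) shows what it collects:

def find_urls(obj, found=None):
    """Collect http(s) string values stored under url-like keys, recursively."""
    found = [] if found is None else found
    url_keys = ('url', 'href', 'src', 'link', 'file', 'path', 'download')
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(v, str) and v.startswith('http') and any(u in k.lower() for u in url_keys):
                found.append(v)
            else:
                find_urls(v, found)
    elif isinstance(obj, list):
        for item in obj:
            find_urls(item, found)
    return found

sample = {"page": {"downloadUrl": "https://example.com/past_exam.pdf",
                   "items": [{"fileHref": "https://example.com/answers.docx"}]}}
print(find_urls(sample))  # ['https://example.com/past_exam.pdf', 'https://example.com/answers.docx']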
1226 |                   logger.warning("All standard methods failed, attempting force download")
1227 |                   result_path = await self.force_download_viewonly(file_info, path)
1228 |                   return result_path if result_path else None
1229 | +
1230 | +             # Rotate proxy if needed
1231 | +             await self.rotate_proxy_if_needed()
1232 | +
1233 | +             # Try with direct requests first (faster)
1234 | +             try:
1235 | +                 headers = {
1236 | +                     'User-Agent': get_random_user_agent(),
1237 | +                     'Accept': '*/*',
1238 | +                     'Accept-Encoding': 'gzip, deflate, br',
1239 | +                     'Referer': referer,
1240 | +                     'DNT': '1'
1241 | +                 }
1242 | +
1243 | +                 with requests.get(file_url, headers=headers, stream=True, timeout=30) as response:
1244 | +                     if response.status_code == 200:
1245 | +                         # Check content type to verify it's not HTML/error page
1246 | +                         content_type = response.headers.get('Content-Type', '')
1247 | +                         if 'text/html' in content_type and not file_url.endswith('.html'):
1248 | +                             logger.warning(f"Received HTML instead of expected file: {file_url}")
1249 | +                         else:
1250 | +                             with open(path, 'wb') as f:
1251 | +                                 for chunk in response.iter_content(chunk_size=8192):
1252 | +                                     if chunk:
1253 | +                                         f.write(chunk)
1254 | +
1255 | +                             # Verify file was downloaded correctly
1256 | +                             if os.path.exists(path) and os.path.getsize(path) > 0:
1257 | +                                 return path
1258 | +             except Exception as e:
1259 | +                 logger.warning(f"Direct download failed: {e}, trying browser approach")
1260 |
1261 | +             # Original code for non-Google Drive downloads using Playwright
1262 |               async with self.context.new_page() as page:
1263 |                   headers = {
1264 |                       'Accept': '*/*',
1265 |                       'Accept-Encoding': 'gzip, deflate, br',
1266 |                       'Referer': referer
1267 |                   }
1268 | +
1269 | +                 # Try to download with timeout protection
1270 | +                 try:
1271 | +                     response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000)
1272 | +                     if response.status == 200:
1273 | +                         content = await response.body()
1274 | +                         with open(path, 'wb') as f:
1275 | +                             f.write(content)
1276 | +                         return path
1277 | +                     else:
1278 | +                         logger.error(f"Download failed with status {response.status}: {file_url}")
1279 | +
1280 | +                         # Try to extract error information
1281 | +                         error_info = await response.text()
1282 | +                         logger.debug(f"Error response: {error_info[:200]}...")
1283 | +
1284 | +                         # Check if this might be a captcha or login issue
1285 | +                         if detect_captcha(error_info):
1286 | +                             logger.warning("Captcha detected during download")
1287 | +                             # For HF Spaces, we can't implement browser-based captcha solving here
1288 | +                             # Just log the issue for now
1289 | +                 except PlaywrightTimeoutError:
1290 | +                     logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}")
1291 | +
1292 | +                 # Try an alternative approach - using the browser's download manager
1293 | +                 try:
1294 | +                     logger.info("Trying browser download manager approach")
1295 | +                     download_promise = page.wait_for_event("download")
1296 | +                     await page.goto(file_url, timeout=60000)
1297 | +
1298 | +                     # Wait for download to start (with timeout)
1299 | +                     download = await download_promise
1300 | +                     await download.save_as(path)
1301 | +
1302 | +                     if os.path.exists(path) and os.path.getsize(path) > 0:
1303 | +                         return path
1304 | +                 except Exception as e:
1305 | +                     logger.error(f"Browser download manager approach failed: {e}")
1306 | +
1307 | +                 return None
1308 |           except Exception as e:
1309 |               logger.error(f"Error downloading {file_url}: {e}")
1310 |               return None
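detect_captcha() is referenced in the error path above but defined elsewhere in the file; a plausible minimal version is just a marker scan over the returned HTML (a sketch under that assumption, not the app's own definition):

def detect_captcha(html_text):
    """Rough heuristic: does this error page look like a captcha / bot challenge?"""
    markers = ('captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-challenge', 'are you a robot')
    text = (html_text or '').lower()
    return any(marker in text for marker in markers)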
1334 |
1335 |           logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
1336 |
1337 | +         # Create a dedicated browser instance with better resolution and stealth
1338 | +         browser_args = [
1339 | +             '--no-sandbox',
1340 | +             '--disable-setuid-sandbox',
1341 | +             '--disable-dev-shm-usage',
1342 | +             '--disable-web-security',
1343 | +             '--disable-features=IsolateOrigins,site-per-process',
1344 | +             '--disable-site-isolation-trials',
1345 | +             '--disable-blink-features=AutomationControlled'  # Anti-detection
1346 | +         ]
1347 | +
1348 |           browser = await self.playwright.chromium.launch(
1349 |               headless=True,
1350 | +             args=browser_args
1351 |           )
1352 |
1353 |           # Use higher resolution for better quality
1358 |               accept_downloads=True  # Critical for the download workflow
1359 |           )
1360 |
1361 | +         # Add anti-detection script
1362 | +         await context.add_init_script("""
1363 | +             () => {
1364 | +                 Object.defineProperty(navigator, 'webdriver', {
1365 | +                     get: () => false,
1366 | +                 });
1367 | +
1368 | +                 // Change plugins
1369 | +                 Object.defineProperty(navigator, 'plugins', {
1370 | +                     get: () => [1, 2, 3, 4, 5].map(() => ({
1371 | +                         lengthComputable: true,
1372 | +                         loaded: 100,
1373 | +                         total: 100
1374 | +                     }))
1375 | +                 });
1376 | +
1377 | +                 // Handle languages
1378 | +                 Object.defineProperty(navigator, 'languages', {
1379 | +                     get: () => ['en-US', 'en', 'es']
1380 | +                 });
1381 | +
1382 | +                 // Modify hardware concurrency
1383 | +                 Object.defineProperty(navigator, 'hardwareConcurrency', {
1384 | +                     get: () => 4
1385 | +                 });
1386 | +             }
1387 | +         """)
1388 | +
1389 |           page = await context.new_page()
1390 |
1391 |           try:
1393 |               logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
1394 |               await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
1395 |               await page.wait_for_load_state('networkidle')
1396 | +
1397 | +             # Check for any barriers or permissions issues
1398 | +             content = await page.content()
1399 | +             if "the owner has not granted you permission to" in content:
1400 | +                 logger.warning("Permission denied error detected")
1401 | +
1402 | +             # Randomized wait to appear more human-like
1403 | +             await page.wait_for_timeout(random.randint(3000, 7000))
1404 |
1405 |               # Create temp directory
1406 |               temp_dir = tempfile.mkdtemp()
1409 |               if file_type.lower() == 'pdf':
1410 |                   # Use the improved scrolling and detection approach
1411 |
1412 | +                 # Perform some natural mouse movements and scrolling
1413 | +                 await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400))
1414 | +                 await page.wait_for_timeout(random.randint(500, 1000))
1415 | +
1416 | +                 # Estimate number of pages
1417 |                   estimated_pages = await page.evaluate("""
1418 |                       () => {
1419 |                           // Method 1: Check page counter text
1443 |
1444 |                   logger.info(f"Estimated {estimated_pages} pages in PDF")
1445 |
1446 | +                 # Initial scroll to trigger lazy loading
1447 | +                 logger.info("Initial scroll to bottom to trigger lazy loading...")
1448 |                   await page.keyboard.press("End")
1449 |                   await page.wait_for_timeout(3000)
1450 |
1451 |                   # Scroll page by page to ensure all pages are loaded
1452 | +                 logger.info("Scrolling page by page...")
1453 |                   max_attempts = min(estimated_pages * 3, 300)
1454 |                   attempt = 0
1455 |                   prev_blob_count = 0
1467 |                           logger.info("All pages appear to be loaded.")
1468 |                           break
1469 |
1470 | +                     # Alternate between PageDown and End keys for more natural scrolling
1471 | +                     if attempt % 3 == 0:
1472 | +                         await page.keyboard.press("End")
1473 | +                     else:
1474 | +                         await page.keyboard.press("PageDown")
1475 | +
1476 | +                     # Randomized wait times
1477 | +                     await page.wait_for_timeout(random.randint(1500, 3000))
1478 | +
1479 | +                     # Move mouse randomly to appear more human-like
1480 | +                     if attempt % 4 == 0:
1481 | +                         await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800))
1482 | +
1483 |                       prev_blob_count = blob_count
1484 |                       attempt += 1
1485 |
1545 |
1546 |                   if not result.get('success', False):
1547 |                       logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
1548 | +
1549 | +                     # Try fallback approach - screenshot method
1550 | +                     logger.info("Trying fallback screenshot method...")
1551 | +
1552 | +                     # Navigate back to the first page
1553 | +                     await page.evaluate("""
1554 | +                         () => {
1555 | +                             // Find and click the "first page" button if available
1556 | +                             const buttons = Array.from(document.querySelectorAll('button'));
1557 | +                             const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page'));
1558 | +                             if (firstPageBtn) firstPageBtn.click();
1559 | +                         }
1560 | +                     """)
1561 | +                     await page.wait_for_timeout(1000)
1562 | +
1563 | +                     # Create a PDF by taking screenshots of each page
1564 | +                     screenshots = []
1565 | +                     current_page = 1
1566 | +                     max_pages = estimated_pages
1567 | +
1568 | +                     # Create a PDF using the reportlab package
1569 | +                     while current_page <= max_pages:
1570 | +                         screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png")
1571 | +
1572 | +                         # Try to find the current page element
1573 | +                         page_elem = await page.query_selector('.drive-viewer-paginated-page')
1574 | +                         if page_elem:
1575 | +                             await page_elem.screenshot(path=screenshot_path)
1576 | +                         else:
1577 | +                             # Fallback to full page screenshot
1578 | +                             await page.screenshot(path=screenshot_path)
1579 | +
1580 | +                         screenshots.append(screenshot_path)
1581 | +
1582 | +                         # Try to navigate to next page
1583 | +                         next_btn = await page.query_selector('button[aria-label="Next page"]')
1584 | +                         if next_btn:
1585 | +                             is_disabled = await next_btn.get_attribute('disabled')
1586 | +                             if is_disabled:
1587 | +                                 logger.info(f"Reached end of document at page {current_page}")
1588 | +                                 break
1589 | +
1590 | +                             await next_btn.click()
1591 | +                             await page.wait_for_timeout(1000)
1592 | +                             current_page += 1
1593 | +                         else:
1594 | +                             break
1595 | +
1596 | +                     # Create PDF from screenshots
1597 | +                     if screenshots:
1598 | +                         first_img = Image.open(screenshots[0])
1599 | +                         width, height = first_img.size
1600 | +
1601 | +                         c = canvas.Canvas(save_path, pagesize=(width, height))
1602 | +                         for screenshot in screenshots:
1603 | +                             img = Image.open(screenshot)
1604 | +                             c.drawImage(screenshot, 0, 0, width, height)
1605 | +                             c.showPage()
1606 | +                         c.save()
1607 | +
1608 | +                         # Clean up screenshots
1609 | +                         for screenshot in screenshots:
1610 | +                             os.remove(screenshot)
1611 | +
1612 | +                         return save_path
1613 | +
1614 |                       return None
1615 |
1616 |                   logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
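In the screenshot fallback, reportlab sizes pages in points while the screenshots are measured in pixels, so the generated pages come out physically oversized (though visually complete). Pillow can assemble the same screenshots into a multi-page PDF with less ceremony; a sketch of that alternative, not what the diff does:

from PIL import Image

def screenshots_to_pdf(image_paths, pdf_path):
    """Write a list of page screenshots out as one multi-page PDF."""
    pages = [Image.open(p).convert("RGB") for p in image_paths]
    pages[0].save(pdf_path, save_all=True, append_images=pages[1:])
    for page in pages:
        page.close()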
1712 |
1713 |           # Try standard approaches for non-view-only files
1714 |           try:
1715 | +             # Try direct download link first (fastest)
1716 | +             direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
1717 | +
1718 | +             # Add anti-bot headers
1719 | +             headers = {
1720 | +                 'User-Agent': get_random_user_agent(),
1721 | +                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
1722 | +                 'Accept-Language': 'en-US,en;q=0.9',
1723 | +                 'Referer': 'https://drive.google.com/',
1724 | +                 'DNT': '1'
1725 | +             }
1726 | +
1727 | +             # Try with streaming to handle larger files
1728 | +             with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r:
1729 | +                 if r.status_code == 200:
1730 | +                     # Check if we got HTML instead of the file
1731 | +                     content_type = r.headers.get('Content-Type', '')
1732 | +                     if 'text/html' in content_type and not file_id.endswith('.html'):
1733 | +                         logger.warning("Received HTML instead of file, trying with session cookies")
1734 | +                     else:
1735 | +                         # Looks like we got the actual file
1736 | +                         with open(save_path, 'wb') as f:
1737 | +                             for chunk in r.iter_content(chunk_size=8192):
1738 | +                                 if chunk:
1739 | +                                     f.write(chunk)
1740 | +
1741 | +                         # Verify file exists and has content
1742 | +                         if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
1743 | +                             logger.info("Direct download successful")
1744 | +                             return True
1745 | +
1746 |               # Try with requests and session cookies
1747 |               session = requests.Session()
1748 |               session.headers.update({'User-Agent': get_random_user_agent()})
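The direct uc?export=download URL above needs a bare file id; when only a share link is available, the id can usually be pulled out of the common Drive URL shapes (a sketch; the app may already have its own parser for this):

import re

def extract_drive_file_id(url):
    """Return the file id from typical Google Drive URL formats, or None."""
    for pattern in (r'/file/d/([a-zA-Z0-9_-]+)', r'[?&]id=([a-zA-Z0-9_-]+)', r'/folders/([a-zA-Z0-9_-]+)'):
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

print(extract_drive_file_id("https://drive.google.com/file/d/1AbC_dEf-123/view"))  # 1AbC_dEf-123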
1785 |           except Exception as e:
1786 |               logger.warning(f"Requests session download failed: {e}")
1787 |
1788 | +         # Try browser-based approach as last resort
1789 | +         try:
1790 | +             async with self.context.new_page() as page:
1791 | +                 # Visit the file view page first to get cookies
1792 | +                 await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
1793 | +                 await page.wait_for_timeout(3000)
1794 | +
1795 | +                 # Set up download event listener
1796 | +                 download_promise = page.wait_for_event("download")
1797 | +
1798 | +                 # Try to trigger the download button click
1799 | +                 download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]')
1800 | +                 if download_button:
1801 | +                     await download_button.click()
1802 | +
1803 | +                     # Wait for download to start
1804 | +                     try:
1805 | +                         download = await download_promise
1806 | +                         await download.save_as(save_path)
1807 | +                         return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1808 | +                     except Exception as e:
1809 | +                         logger.error(f"Error during browser download: {e}")
1810 | +                         return False
1811 | +                 else:
1812 | +                     # Try the export download URL
1813 | +                     await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000)
1814 | +
1815 | +                     # Look for and click any download buttons or links
1816 | +                     download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")')
1817 | +                     for elem in download_elements:
1818 | +                         try:
1819 | +                             await elem.click()
1820 | +                             # Wait a bit to see if download starts
1821 | +                             try:
1822 | +                                 download = await download_promise
1823 | +                                 await download.save_as(save_path)
1824 | +                                 return os.path.exists(save_path) and os.path.getsize(save_path) > 0
1825 | +                             except:
1826 | +                                 pass
1827 | +                         except:
1828 | +                             continue
1829 | +         except Exception as e:
1830 | +             logger.error(f"Browser-based download attempt failed: {e}")
1831 | +
1832 | +         logger.warning("All standard download methods failed")
1833 |           return False
1834 |
1835 |       async def download_viewonly_pdf_with_js(self, file_id, save_path):
1836 |           """Download view-only PDF using the enhanced blob image caching technique"""
1837 |           try:
1838 | +             # Create a dedicated browser instance with stealth capabilities
1839 | +             browser_args = [
1840 | +                 '--no-sandbox',
1841 | +                 '--disable-setuid-sandbox',
1842 | +                 '--disable-dev-shm-usage',
1843 | +                 '--disable-web-security',
1844 | +                 '--disable-blink-features=AutomationControlled'  # Anti-detection
1845 | +             ]
1846 | +
1847 |               browser = await self.playwright.chromium.launch(
1848 |                   headless=True,
1849 | +                 args=browser_args
1850 |               )
1851 |
1852 | +             # Setup stealth context
1853 |               context = await browser.new_context(
1854 |                   viewport={'width': 1600, 'height': 1200},
1855 |                   user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1856 | +                 accept_downloads=True,  # Critical for handling the download event
1857 | +                 ignore_https_errors=True
1858 |               )
1859 |
1860 | +             # Add stealth script
1861 | +             await context.add_init_script("""
1862 | +                 () => {
1863 | +                     Object.defineProperty(navigator, 'webdriver', {
1864 | +                         get: () => false,
1865 | +                     });
1866 | +
1867 | +                     // Change plugins and languages to appear more human
1868 | +                     Object.defineProperty(navigator, 'plugins', {
1869 | +                         get: () => [1, 2, 3, 4, 5].map(() => ({
1870 | +                             lengthComputable: true,
1871 | +                             loaded: 100,
1872 | +                             total: 100
1873 | +                         }))
1874 | +                     });
1875 | +
1876 | +                     Object.defineProperty(navigator, 'languages', {
1877 | +                         get: () => ['en-US', 'en', 'es']
1878 | +                     });
1879 | +                 }
1880 | +             """)
1881 | +
1882 |               page = await context.new_page()
1883 |
1884 |               try:
1885 | +                 # Step 1: Navigate to the file with human-like behavior
1886 |                   logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
1887 |                   await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
1888 |                   await page.wait_for_load_state('networkidle')
1889 | +
1890 | +                 # Perform human-like interactions
1891 | +                 await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300))
1892 | +                 await page.wait_for_timeout(random.randint(2000, 5000))
1893 |
1894 |                   # Step 2: Estimate the number of pages
1895 |                   estimated_pages = await page.evaluate("""
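The navigator-patching init script above is now pasted in two places (here and in the force-download path earlier in the diff). Hoisting it into one module-level constant keeps the copies from drifting; a sketch of that refactor, not something the diff itself does:

STEALTH_INIT_SCRIPT = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en', 'es'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
}
"""

# Each call site then becomes:
# await context.add_init_script(STEALTH_INIT_SCRIPT)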
1922 |                   await page.keyboard.press("End")
1923 |                   await page.wait_for_timeout(3000)
1924 |
1925 | +                 # Step 4: Wait for all pages to load with better feedback and randomization
1926 | +                 logger.info("Scrolling through document to load all pages...")
1927 | +                 max_attempts = min(estimated_pages * 3, 300)
1928 |                   attempt = 0
1929 |                   prev_blob_count = 0
1930 | +                 consecutive_same_count = 0
1931 |
1932 |                   while attempt < max_attempts:
1933 |                       # Count blob images (which are the PDF pages)
1939 |
1940 |                       logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
1941 |
1942 | +                     # Check if we've loaded all pages or if we're stuck
1943 | +                     if blob_count >= estimated_pages:
1944 | +                         logger.info(f"All {estimated_pages} pages appear to be loaded.")
1945 |                           break
1946 |
1947 | +                     if blob_count == prev_blob_count:
1948 | +                         consecutive_same_count += 1
1949 | +                         if consecutive_same_count >= 5 and blob_count > 0:
1950 | +                             logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.")
1951 | +                             break
1952 | +                     else:
1953 | +                         consecutive_same_count = 0
1954 | +
1955 | +                     # Mix up the scrolling approach for more human-like behavior
1956 | +                     scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"])
1957 | +
1958 | +                     if scroll_action == "PageDown":
1959 | +                         await page.keyboard.press("PageDown")
1960 | +                     elif scroll_action == "End":
1961 | +                         await page.keyboard.press("End")
1962 | +                     elif scroll_action == "ArrowDown":
1963 | +                         # Press arrow down multiple times
1964 | +                         for _ in range(random.randint(5, 15)):
1965 | +                             await page.keyboard.press("ArrowDown")
1966 | +                             await page.wait_for_timeout(random.randint(50, 150))
1967 | +                     else:  # mouse
1968 | +                         # Scroll using mouse wheel
1969 | +                         current_y = random.randint(300, 700)
1970 | +                         await page.mouse.move(x=random.randint(300, 800), y=current_y)
1971 | +                         await page.mouse.wheel(0, random.randint(300, 800))
1972 | +
1973 | +                     # Random wait between scrolls
1974 | +                     await page.wait_for_timeout(random.randint(1000, 3000))
1975 | +
1976 |                       prev_blob_count = blob_count
1977 |                       attempt += 1
1978 |
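The elided "count blob images" step (diff lines 1934-1938) presumably boils down to counting img elements whose src is a blob: URL; an evaluate call along these lines would produce the blob_count this loop compares against (an assumption about code not shown in this hunk):

blob_count = await page.evaluate(
    "() => document.querySelectorAll('img[src^=\"blob:\"]').length"
)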
1992 |                       try {
1993 |                           let pdf = new jsPDF();
1994 |                           let imgs = document.getElementsByTagName("img");
1995 |                           let validImages = [];
1996 | +
1997 | +                         // First collect all valid blob images
1998 |                           for (let i = 0; i < imgs.length; i++) {
1999 |                               let img = imgs[i];
2000 |                               if (!/^blob:/.test(img.src)) continue;
2002 |                               validImages.push(img);
2003 |                           }
2004 |
2005 | +                         // Sort by position in the document
2006 |                           validImages.sort((a, b) => {
2007 |                               const rectA = a.getBoundingClientRect();
2008 |                               const rectB = b.getBoundingClientRect();
2011 |
2012 |                           console.log(`Found ${validImages.length} valid page images to add to PDF`);
2013 |
2014 | +                         let added = 0;
2015 |                           // Process each image as a page
2016 |                           for (let i = 0; i < validImages.length; i++) {
2017 |                               let img = validImages[i];
2326 |               logger.info(f"Found {len(links)} sublinks with specialized method")
2327 |               return list(links)[:limit]
2328 |
2329 | +         # Rotate proxy if needed
2330 | +         await self.rotate_proxy_if_needed()
2331 | +
2332 |           # Standard sublink extraction for all sites
2333 |           await self.page.goto(url, timeout=30000, wait_until='networkidle')
2334 |
2337 |           base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
2338 |           path_base = os.path.dirname(parsed_base.path)
2339 |
2340 | +         # Perform initial scrolling to load lazy content
2341 | +         await self.page.evaluate("""
2342 | +             async () => {
2343 | +                 const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
2344 | +                 const height = document.body.scrollHeight;
2345 | +                 const step = Math.floor(window.innerHeight / 2);
2346 | +
2347 | +                 for (let i = 0; i < height; i += step) {
2348 | +                     window.scrollTo(0, i);
2349 | +                     await delay(150);
2350 | +                 }
2351 | +
2352 | +                 window.scrollTo(0, 0);
2353 | +             }
2354 | +         """)
2355 | +         await self.page.wait_for_timeout(1000)
2356 | +
2357 |           # Check if page has ASP.NET elements which might need special handling
2358 |           is_aspnet = await self.page.evaluate('''
2359 |               () => {
2476 |           except Exception as e:
2477 |               logger.warning(f"Error with postback: {e}")
2478 |
2479 | +         # Look for pagination controls and try to navigate through them
2480 | +         pagination_elements = await self.page.query_selector_all(
2481 | +             'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]'
2482 | +         )
2483 | +
2484 | +         # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops)
2485 | +         for i in range(min(5, len(pagination_elements))):
2486 | +             try:
2487 | +                 # Focus on elements that look like "next page" buttons
2488 | +                 el = pagination_elements[i]
2489 | +                 el_text = await el.text_content() or ""
2490 | +
2491 | +                 # Only click if this looks like a pagination control
2492 | +                 if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip():
2493 | +                     logger.info(f"Clicking pagination control: {el_text}")
2494 | +                     await el.click()
2495 | +                     await self.page.wait_for_timeout(2000)
2496 | +                     await self.page.wait_for_load_state('networkidle', timeout=5000)
2497 | +
2498 | +                     # Get new links from this page
2499 | +                     await self.extract_all_link_types(links, base_url, path_base)
2500 | +             except Exception as e:
2501 | +                 logger.warning(f"Error clicking pagination: {e}")
2502 | +
2503 | +         # Check for hidden links that might be revealed by JavaScript
2504 | +         hidden_links = await self.page.evaluate("""
2505 | +             () => {
2506 | +                 // Try to execute common JavaScript patterns that reveal hidden content
2507 | +                 try {
2508 | +                     // Common patterns used in websites to initially hide content
2509 | +                     const hiddenContainers = document.querySelectorAll(
2510 | +                         '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]'
2511 | +                     );
2512 | +
2513 | +                     // Attempt to make them visible
2514 | +                     hiddenContainers.forEach(el => {
2515 | +                         el.style.display = 'block';
2516 | +                         el.style.visibility = 'visible';
2517 | +                         el.classList.remove('hidden', 'hide');
2518 | +                     });
2519 | +
2520 | +                     // Return any newly visible links
2521 | +                     return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
2522 | +                 } catch (e) {
2523 | +                     return [];
2524 | +                 }
2525 | +             }
2526 | +         """)
2527 | +
2528 | +         # Add any newly discovered links
2529 | +         for href in hidden_links:
2530 | +             if href and not href.startswith('javascript:'):
2531 | +                 links.add(href)
2532 | +
2533 |           logger.info(f"Found {len(links)} sublinks")
2534 |           return list(links)[:limit]
2535 |
2594 |                       links_set.add(full_url)
2595 |                   except Exception:
2596 |                       pass
2597 | +
2598 | +         # Extract links from JSON data embedded in the page
2599 | +         script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]')
2600 | +         for script in script_elements:
2601 | +             try:
2602 | +                 script_content = await script.text_content()
2603 | +                 if script_content:
2604 | +                     # Look for URLs in the JSON content
2605 | +                     urls = re.findall(r'(https?://[^\'"]+)', script_content)
2606 | +                     for url in urls:
2607 | +                         links_set.add(url)
2608 | +             except Exception:
2609 | +                 pass
2610 |
2611 |       def resolve_relative_url(self, relative_url, base_url, path_base):
2612 |           """Properly resolve relative URLs considering multiple formats"""
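resolve_relative_url() handles several URL shapes by hand; most of that work can also be delegated to urllib, which makes a useful cross-check for the custom logic (a sketch, not a drop-in replacement):

from urllib.parse import urljoin

def resolve_relative_url_simple(relative_url, base_url, path_base):
    """urljoin-based take on relative URL resolution."""
    if relative_url.startswith(('http://', 'https://')):
        return relative_url
    if relative_url.startswith('/'):
        return urljoin(base_url, relative_url)
    return urljoin(f"{base_url}{path_base}/", relative_url)

print(resolve_relative_url_simple("exam.pdf", "https://school.example", "/docs"))
# https://school.example/docs/exam.pdf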
2657 |           total_links = len(sublinks)
2658 |           progress_text.text(f"Found {total_links} sublinks to process")
2659 |
2660 | +         # Always include files from the main page, regardless of sublinks
2661 | +         all_files = main_files
2662 | +
2663 |           if not sublinks:
2664 |               progress_bar.progress(1.0)
2665 | +             return all_files
2666 |
2667 |           # Process each sublink
2668 |           for i, sublink in enumerate(sublinks, 1):
2669 |               progress = i / total_links
2670 |               progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
2734 |           sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
2735 |           use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
2736 |           proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
2737 | +         use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox")
2738 |
2739 |       with st.expander("Google Drive Integration", expanded=False):
2740 |           if st.button("Start Google Sign-In", key="google_signin_btn"):
2745 |               creds, msg = exchange_code_for_credentials(auth_code)
2746 |               st.session_state.google_creds = creds
2747 |               st.write(msg)
2748 | +
2749 | +     with st.expander("Advanced Browser Settings", expanded=False):
2750 | +         # Captcha handling options
2751 | +         st.write("**Captcha Handling**")
2752 | +         captcha_option = st.radio(
2753 | +             "Captcha Detection:",
2754 | +             ["Auto-detect only", "Manual solve (shows captcha)"],
2755 | +             index=0,
2756 | +             key="captcha_option"
2757 | +         )
2758 | +
2759 | +         # Proxy rotation settings
2760 | +         st.write("**Proxy Rotation**")
2761 | +         enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation")
2762 | +         if enable_rotation:
2763 | +             PROXY_ROTATION_CONFIG["enabled"] = True
2764 | +             proxy_list = st.text_area(
2765 | +                 "Proxy List (one per line)",
2766 | +                 placeholder="http://proxy1:port\nhttp://proxy2:port",
2767 | +                 key="proxy_list"
2768 | +             )
2769 | +             if proxy_list:
2770 | +                 PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()]
2771 | +             rotation_interval = st.slider(
2772 | +                 "Rotation Interval (# of requests)",
2773 | +                 min_value=1,
2774 | +                 max_value=50,
2775 | +                 value=10,
2776 | +                 key="rotation_interval"
2777 | +             )
2778 | +             PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
2779 |
2780 |       if mode == "Manual URL":
2781 |           st.header("Manual URL Mode")
2790 |               st.warning("Invalid extensions ignored. Use format like '.csv'.")
2791 |
2792 |           @st.cache_resource
2793 | +         def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
2794 |               async def _run():
2795 | +                 async with DownloadManager(
2796 | +                     use_proxy=use_proxy_val,
2797 | +                     proxy=proxy_val,
2798 | +                     use_stealth=use_stealth_val
2799 | +                 ) as dm:
2800 |                       files = await dm.deep_search(url, ext_list, max_links, timeout_val)
2801 |                       return files
2802 |               return asyncio.run(_run())
2803 |
2804 |           with st.spinner("Searching for files..."):
2805 |               files = run_deep_search(url, valid_ext_list, max_sublinks,
2806 | +                                     sublink_timeout, use_proxy, proxy, use_stealth)
2807 |
2808 |           if files:
2809 |               st.session_state.discovered_files = files
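Outside of Streamlit, the same manager can be driven directly; a minimal usage sketch mirroring how run_deep_search() constructs it (the URL and extension list here are placeholders, and the DownloadManager class defined above must be in scope):

import asyncio

async def _demo():
    async with DownloadManager(use_proxy=False, proxy=None, use_stealth=True) as dm:
        files = await dm.deep_search("https://example.com/exams", [".pdf"], 10, 30)
        print(f"Discovered {len(files)} candidate files")

# asyncio.run(_demo())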
2866 |               progress_bar = st.progress(0)
2867 |               status_text = st.empty()
2868 |
2869 | +             async with DownloadManager(
2870 | +                 use_proxy=use_proxy,
2871 | +                 proxy=proxy,
2872 | +                 use_stealth=use_stealth
2873 | +             ) as dm:
2874 |                   for i, idx in enumerate(selected_indices):
2875 |                       progress = (i + 1) / len(selected_indices)
2876 |                       file_info = files[idx]
2951 |       if st.button("Search", key="search_btn"):
2952 |           if query:
2953 |               async def run_search():
2954 | +                 async with DownloadManager(
2955 | +                     use_proxy=use_proxy,
2956 | +                     proxy=proxy,
2957 | +                     query=query,
2958 | +                     num_results=num_results,
2959 | +                     use_stealth=use_stealth
2960 | +                 ) as dm:
2961 |                       with st.spinner("Searching..."):
2962 |                           urls = await dm.search_bing()
2963 |                       if urls:
2988 |           valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
2989 |
2990 |           @st.cache_resource
2991 | +         def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
2992 |               async def _run():
2993 | +                 async with DownloadManager(
2994 | +                     use_proxy=use_proxy_val,
2995 | +                     proxy=proxy_val,
2996 | +                     use_stealth=use_stealth_val
2997 | +                 ) as dm:
2998 |                       files = await dm.deep_search(url, ext_list, max_links, timeout_val)
2999 |                       return files
3000 |               return asyncio.run(_run())
3001 |
3002 |           with st.spinner("Searching for files..."):
3003 |               files = run_deep_search(url, valid_ext_list, max_sublinks,
3004 | +                                     sublink_timeout, use_proxy, proxy, use_stealth)
3005 |
3006 |           if files:
3007 |               st.session_state.discovered_files = files
3025 |
3026 |           with st.spinner("Downloading view-only document... (this may take a minute)"):
3027 |               async def download_viewonly():
3028 | +                 async with DownloadManager(use_stealth=use_stealth) as dm:
3029 |                       file_info = {
3030 |                           'url': f"https://drive.google.com/file/d/{file_id}/view",
3031 |                           'filename': f"gdrive_{file_id}.pdf",
3038 |
3039 |               if result:
3040 |                   st.success("Document downloaded successfully!")
3041 | +
3042 | +                 # Provide download button
3043 |                   with open(result, "rb") as f:
3044 |                       file_bytes = f.read()
3045 | +
3046 |                   st.download_button(
3047 |                       label="Download PDF",
3048 |                       data=file_bytes,
3049 | +                     file_name=f"gdrive_{file_id}.pdf",
3050 |                       mime="application/pdf"
3051 |                   )
3052 |               else:
3054 |
3055 |       # Add footer with attribution
3056 |       st.markdown('---')
3057 | +     st.markdown('Created by [Euler314](https://github.com/euler314)')
3058 |
3059 |   if __name__ == "__main__":
3060 |       main()