Update app.py
app.py
CHANGED
@@ -217,7 +217,6 @@ class DownloadManager:
217          self.browser = None
218          self.context = None
219          self.page = None
220 -        self.base_domains = set()  # Store base domains and their variations
221
222      async def __aenter__(self):
223          self.playwright = await async_playwright().start()
@@ -251,63 +250,6 @@ class DownloadManager:
251          if self.playwright:
252              await self.playwright.stop()
253
254 -    def get_base_domain(self, url):
255 -        """Extract base domain and add variations to self.base_domains"""
256 -        parsed = urlparse(url)
257 -        domain = parsed.netloc.split(':')[0]  # Remove port if present
258 -
259 -        # Add the main domain and possible variations
260 -        base_parts = domain.split('.')
261 -        if len(base_parts) > 2:
262 -            main_domain = '.'.join(base_parts[-2:])
263 -            self.base_domains.add(main_domain)
264 -            # Add variations like files.domain.com for domain.com
265 -            self.base_domains.add(domain)
266 -
267 -            # Handle www and non-www versions
268 -            if base_parts[0] == 'www':
269 -                self.base_domains.add('.'.join(base_parts[1:]))
270 -            else:
271 -                self.base_domains.add(f"www.{domain}")
272 -        else:
273 -            self.base_domains.add(domain)
274 -
275 -        return domain
276 -
277 -    def is_related_domain(self, url):
278 -        """Check if URL belongs to any of the known domain variations"""
279 -        parsed = urlparse(url)
280 -        domain = parsed.netloc.split(':')[0]
281 -
282 -        # Check if this domain or any of its parts match our base domains
283 -        parts = domain.split('.')
284 -        for i in range(len(parts) - 1):
285 -            check_domain = '.'.join(parts[i:])
286 -            if check_domain in self.base_domains:
287 -                return True
288 -        return False
289 -
290 -    async def get_real_url(self, url):
291 -        """Follow redirects and get the final URL"""
292 -        try:
293 -            async with self.context.new_page() as page:
294 -                response = await page.goto(url, wait_until='networkidle', timeout=30000)
295 -                final_url = page.url
296 -
297 -                # Check for meta refresh redirects
298 -                content = await page.content()
299 -                soup = BeautifulSoup(content, 'html.parser')
300 -                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
301 -                if meta_refresh:
302 -                    content = meta_refresh.get('content', '')
303 -                    if 'url=' in content.lower():
304 -                        final_url = content.split('url=')[-1].strip("'").strip('"')
305 -
306 -                return final_url, response.headers if response else {}
307 -        except Exception as e:
308 -            logger.error(f"Error getting real URL for {url}: {e}")
309 -            return url, {}
310 -
311      async def get_file_size(self, url):
312          try:
313              async with self.context.new_page() as page:
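This hunk drops the domain-tracking helpers entirely; the new code no longer filters sublinks by domain. For reference, a minimal standalone sketch of what the removed matching logic did, outside the class and without Playwright — the module-level function names here are illustrative, not part of app.py:

    from urllib.parse import urlparse

    def base_domain_variations(url):
        # Mirror of the removed get_base_domain(): collect the host plus its www/apex twins.
        domain = urlparse(url).netloc.split(':')[0]
        parts = domain.split('.')
        variations = {domain}
        if len(parts) > 2:
            variations.add('.'.join(parts[-2:]))          # e.g. example.com
            if parts[0] == 'www':
                variations.add('.'.join(parts[1:]))       # drop the www prefix
            else:
                variations.add(f"www.{domain}")           # add a www twin
        return variations

    def is_related(url, known):
        # Mirror of the removed is_related_domain(): suffix-match the host against the known set.
        parts = urlparse(url).netloc.split(':')[0].split('.')
        return any('.'.join(parts[i:]) in known for i in range(len(parts) - 1))

    known = base_domain_variations("https://www.example.com/downloads/")
    print(sorted(known))                                          # ['example.com', 'www.example.com']
    print(is_related("https://files.example.com/a.pdf", known))   # True
    print(is_related("https://other.org/a.pdf", known))           # False

Subdomains such as files.example.com therefore counted as related, which is the filtering behaviour the sublink crawl loses with this change.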
@@ -338,32 +280,40 @@ class DownloadManager:
338              except Exception:
339                  return {}
340
341      async def extract_downloadable_files(self, url, custom_ext_list):
342          found_files = []
343          try:
344 -
345 -
346 -
347 -
348 -            self.
349 -
350 -
351 -
352 -
353 -
354 -
355 -
356 -
357 -
358 -
359 -
360 -
361 -            # Load the page
362 -            await self.page.goto(final_url, timeout=30000, wait_until='networkidle')
363              content = await self.page.content()
364              soup = BeautifulSoup(content, 'html.parser')
365
366 -            # Define extensions to look for
367              default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
368                              '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
369              all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
@@ -371,73 +321,70 @@ class DownloadManager:
371              parsed_base = urlparse(final_url)
372              base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
373
374 -            # Find all links including those in scripts and other elements
375 -            links = set()
376 -            # Regular links
377              for a in soup.find_all('a', href=True):
378 -
379 -            # Script-embedded links
380 -            scripts = soup.find_all('script')
381 -            for script in scripts:
382 -                if script.string:
383 -                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
384 -                    links.update(urls)
385 -
386 -            for href in links:
387 -                href = href.strip()
388 -
389 -                # Skip empty or javascript links
390 -                if not href or href.startswith(('javascript:', '#', 'mailto:')):
391 -                    continue
392
393 -                # Handle
394 -                if '.php' in href.lower() or 'download' in href.lower()
395 -
396 -
397 -
398 -
399 -
400 -
401 -
402 -
403 -
404 -
405 -
406 -                        # Check if it leads to a file
407 -                        content_type = real_headers.get('content-type', '').lower()
408 -                        if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
409 -                            found_files.append({
410 -                                'url': real_url,
411 -                                'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
412 -                                'size': await self.get_file_size(real_url),
413 -                                'metadata': {}
414 -                            })
415 -                    except Exception as e:
416 -                        logger.error(f"Error processing PHP/script link {href}: {e}")
417                          continue
418 -
419                  # Handle direct file links
420 -
421 -
422 -
423 -
424 -                        href = base_url + href
425 -                    else:
426 -                        href = base_url + '/' + href
427
428 -
429 -
430 -
431 -                    meta =
432 -
433 -
434 -
435 -
436 -
437 -
438 -
439 -
440 -
441
442              # Make results unique based on URLs
443              seen_urls = set()
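The removed branch above also harvested URLs from inline <script> blocks with a single regular expression; the replacement code only walks <a href> attributes. A short illustration of that regex on an invented script snippet:

    import re

    script_text = '''
      openViewer(href="/files/report.pdf");
      loadAsset(src="https://cdn.example.com/archive.zip");
      var cfg = { url: "/not-matched.pdf" };
    '''

    pattern = r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']'
    print(re.findall(pattern, script_text))
    # ['/files/report.pdf', 'https://cdn.example.com/archive.zip']

The pattern only catches key="value" shapes, so JSON-style url: "..." assignments were never picked up even by the old code.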
@@ -448,10 +395,10 @@ class DownloadManager:
448                  unique_files.append(f)
449
450              return unique_files
451 -
452          except Exception as e:
453              logger.error(f"Error extracting files from {url}: {e}")
454              return []
455      async def download_file(self, file_info, save_dir, referer):
456          file_url = file_info['url']
457          fname = file_info['filename']
@@ -465,14 +412,11 @@ class DownloadManager:
465          os.makedirs(save_dir, exist_ok=True)
466
467          try:
468 -
469 -            real_url, _ = await self.get_real_url(file_url)
470 -
471 -            if "drive.google.com" in real_url:
472                  import gdown
473                  try:
474                      st.write(f"Downloading from Google Drive: {fname}")
475 -                    output = gdown.download(
476                      if output:
477                          return path
478                      return None
@@ -489,7 +433,7 @@ class DownloadManager:
489                      'Referer': referer
490                  }
491
492 -                response = await page.request.get(
493
494                  if response.status == 200:
495                      content = await response.body()
@@ -497,224 +441,102 @@ class DownloadManager:
497                      f.write(content)
498                  return path
499              else:
500 -                logger.error(f"Download failed with status {response.status}: {
501                  return None
502
503          except Exception as e:
504              logger.error(f"Error downloading {file_url}: {e}")
505              return None
506
507 -    async def
508 -        try:
509 -            # Get the real URL first
510 -            real_url, _ = await self.get_real_url(url)
511 -            await self.page.goto(real_url, timeout=30000)
512 -
513 -            # Wait for dynamic content
514 -            await self.page.wait_for_load_state('networkidle')
515 -
516 -            content = await self.page.content()
517 -            soup = BeautifulSoup(content, 'html.parser')
518 -
519 -            parsed_base = urlparse(real_url)
520 -            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
521 -            current_path = os.path.dirname(parsed_base.path)
522 -
523 -            links = set()
524 -
525 -            # Find links from various sources
526 -            # 1. Regular links
527 -            for a in soup.find_all('a', href=True):
528 -                href = a['href'].strip()
529 -                if href and not href.startswith(('javascript:', '#', 'mailto:')):
530 -                    links.add(href)
531 -
532 -            # 2. Script-embedded links
533 -            scripts = soup.find_all('script')
534 -            for script in scripts:
535 -                if script.string:
536 -                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
537 -                    links.update(urls)
538 -
539 -            # 3. Form actions
540 -            forms = soup.find_all('form', action=True)
541 -            for form in forms:
542 -                links.add(form['action'])
543 -
544 -            # Process and clean links
545 -            clean_links = set()
546 -            for href in links:
547 -                try:
548 -                    # Skip empty links
549 -                    if not href.strip():
550 -                        continue
551 -
552 -                    # Convert to absolute URL
553 -                    if href.startswith('http'):
554 -                        full_url = href
555 -                    elif href.startswith('//'):
556 -                        full_url = parsed_base.scheme + ':' + href
557 -                    elif href.startswith('/'):
558 -                        full_url = base_url + href
559 -                    else:
560 -                        # Handle relative paths
561 -                        if current_path and current_path != '/':
562 -                            full_url = base_url + current_path + '/' + href
563 -                        else:
564 -                            full_url = base_url + '/' + href
565 -
566 -                    # Clean the URL
567 -                    full_url = full_url.split('#')[0]  # Remove fragments
568 -
569 -                    # Only add if it's a related domain
570 -                    if self.is_related_domain(full_url):
571 -                        clean_links.add(full_url)
572 -
573 -                except Exception as e:
574 -                    logger.error(f"Error processing link {href}: {e}")
575 -                    continue
576 -
577 -            # Sort links for consistency
578 -            sorted_links = sorted(list(clean_links))
579 -            return sorted_links[:limit]
580 -
581 -        except Exception as e:
582 -            logger.error(f"Error getting sublinks: {e}")
583 -            return []
584 -
585 -    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
586          if not custom_ext_list:
587              custom_ext_list = []
588 -
589          progress_text = st.empty()
590          progress_bar = st.progress(0)
591          file_count_text = st.empty()
592 -
593 -        try:
594 -            # Initialize base domains with the original URL
595 -            self.get_base_domain(url)
596
597 -
598 -
599 -
600              total_links = len(sublinks)
601 -
602              progress_text.text(f"Found {total_links} sublinks to process")
603 -
604              progress_bar.progress(1.0)
605 -
606 -
607 -
608 -
609 -
610 -
611 -
612 -
613 -
614 -
615 -
616 -            try:
617 -                progress = (index) / total_links
618 -                progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
619 -                progress_bar.progress(progress)
620 -
621 -                async with async_timeout.timeout(timeout):
622 -                    # Get the final URL and headers for this sublink
623 -                    real_url, headers = await self.get_real_url(sublink)
624 -                    content_type = headers.get('content-type', '').lower()
625 -
626 -                    # If the sublink itself is a downloadable file, return it
627 -                    if any(x in content_type for x in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
628 -                        return [{
629 -                            'url': real_url,
630 -                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
631 -                            'size': await self.get_file_size(real_url),
632 -                            'metadata': {}
633 -                        }]
634 -
635 -                    # Otherwise, treat it as a webpage and search for file links
636 -                    await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
637 -                    content = await self.page.content()
638 -                    soup = BeautifulSoup(content, 'html.parser')
639 -
640 -                    # Define default and custom file extensions
641 -                    default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
642 -                                    '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
643 -                    custom_exts = [ext.strip().lower() for ext in custom_ext_list if ext.strip()]
644 -                    file_exts = set(default_exts + custom_exts)
645 -
646 -                    sublink_files = []
647 -                    # Iterate over all anchor tags found on the page
648 -                    for a in soup.find_all('a', href=True):
649 -                        href = a['href'].strip()
650 -                        if not href:
651 -                            continue
652 -                        # Convert any relative URL to an absolute URL
653 -                        full_url = urljoin(real_url, href)
654 -                        if any(full_url.lower().endswith(ext) for ext in file_exts):
655 -                            final_url, _ = await self.get_real_url(full_url)
656 -                            file_info = {
657 -                                'url': final_url,
658 -                                'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
659 -                                'size': await self.get_file_size(final_url),
660 -                                'metadata': {}
661 -                            }
662 -                            if final_url.lower().endswith('.pdf'):
663 -                                file_info['metadata'] = await self.get_pdf_metadata(final_url)
664 -                            sublink_files.append(file_info)
665 -
666 -                    if sublink_files:
667 -                        logger.info(f"Found {len(sublink_files)} files at {real_url}")
668 -                        st.write(f"Found {len(sublink_files)} files at {real_url}")
669 -
670 -                    return sublink_files
671 -
672 -            except asyncio.TimeoutError:
673 -                logger.warning(f"Timeout processing sublink: {sublink}")
674 -                return []
675 -            except Exception as e:
676 -                logger.error(f"Error processing sublink {sublink}: {e}")
677 -                return []
678 -
679 -            # Process all sublinks concurrently
680 -            tasks = [process_sublink(sublink, i + 1) for i, sublink in enumerate(sublinks)]
681 -            sub_results = await asyncio.gather(*tasks)
682 -
683 -            # Combine all results
684 -            for sub_files in sub_results:
685                  all_files.extend(sub_files)
686              file_count_text.text(f"Found {len(all_files)} total files")
687 -
688 -            #
689              seen_urls = set()
690              unique_files = []
691              for f in all_files:
692                  if f['url'] not in seen_urls:
693                      seen_urls.add(f['url'])
694                      unique_files.append(f)
695 -
696              final_count = len(unique_files)
697 -            progress_text.text("Deep search complete!")
698              file_count_text.text(f"Found {final_count} unique files")
699              progress_bar.progress(1.0)
700
701 -            # Sort files by filename for consistency
702 -            unique_files.sort(key=lambda x: x['filename'].lower())
703 -
704              return unique_files
705 -
706          except Exception as e:
707              logger.error(f"Deep search error: {e}")
708              progress_text.text(f"Error during deep search: {str(e)}")
709              return []
710          finally:
711              await asyncio.sleep(2)
712 -
713              progress_text.empty()
714              progress_bar.empty()
715 -
716 -
717 -
718
719  def main():
720      if 'initialized' not in st.session_state:
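Worth noting: the deep_search removed above fanned the sublinks out concurrently — one process_sublink coroutine per link, each bounded by an async_timeout window and collected with asyncio.gather — while the replacement (shown further down) walks them one at a time so the Streamlit progress bar advances in order. A minimal, standard-library-only sketch of that fan-out pattern; process_one and the URLs are placeholders, not the app's real coroutine:

    import asyncio

    async def process_one(url: str, timeout: float = 5.0) -> list:
        # Stand-in for the removed process_sublink(): visit one URL, return any files found.
        try:
            async with asyncio.timeout(timeout):   # Python 3.11+; the old code used async_timeout.timeout()
                await asyncio.sleep(0.1)           # placeholder for page.goto() + parsing
                return [f"{url}/file.pdf"]
        except TimeoutError:
            return []

    async def fan_out(urls: list) -> list:
        tasks = [process_one(u) for u in urls]
        results = await asyncio.gather(*tasks)     # all sublinks processed concurrently
        return [item for sub in results for item in sub]

    print(asyncio.run(fan_out(["https://example.com/a", "https://example.com/b"])))

Re-introducing gather behind an asyncio.Semaphore would be the usual way to get the old throughput back without losing the new, simpler progress reporting.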
@@ -741,8 +563,8 @@ def main():
741          max_sublinks = st.number_input(
742              "Maximum Sublinks to Process",
743              min_value=1,
744 -            max_value=
745 -            value=
746              step=50,
747              help="Maximum number of sublinks to process from the main page",
748              key="max_sublinks_input"
The same region after the change (right-hand pane of the split diff; new line numbers, added lines marked +):

217          self.browser = None
218          self.context = None
219          self.page = None
220
221      async def __aenter__(self):
222          self.playwright = await async_playwright().start()

250          if self.playwright:
251              await self.playwright.stop()
252

253      async def get_file_size(self, url):
254          try:
255              async with self.context.new_page() as page:

280          except Exception:
281              return {}
282
283 +    async def extract_real_download_url(self, url):
284 +        try:
285 +            async with self.context.new_page() as page:
286 +                response = await page.goto(url, wait_until='networkidle', timeout=30000)
287 +                if response and response.headers.get('location'):
288 +                    return response.headers['location']
289 +                return page.url
290 +        except Exception as e:
291 +            logger.error(f"Error extracting real download URL: {e}")
292 +            return url
293 +
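Playwright's page.goto() already follows HTTP redirect chains, so page.url is normally the final address; the explicit 'location' check above only matters when a response still exposes that header. As a sanity check outside the browser, the same idea in plain standard-library Python (example.com is a placeholder):

    from urllib.request import urlopen

    def final_url(url: str) -> str:
        # urllib follows 3xx responses automatically; geturl() reports where it ended up.
        with urlopen(url, timeout=30) as resp:
            return resp.geturl()

    print(final_url("https://example.com/"))

The browser-based version in the diff remains the right tool when the redirect happens via JavaScript or a meta refresh rather than an HTTP status code.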
294      async def extract_downloadable_files(self, url, custom_ext_list):
295          found_files = []
296          try:
297 +            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
298 +            if not response:
299 +                return []
300 +
301 +            final_url = self.page.url
302 +            if '.php' in final_url or 'download' in final_url:
303 +                real_url = await self.extract_real_download_url(final_url)
304 +                if real_url != final_url:
305 +                    found_files.append({
306 +                        'url': real_url,
307 +                        'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
308 +                        'size': await self.get_file_size(real_url),
309 +                        'metadata': {}
310 +                    })
311 +                    return found_files
312 +
313 +            await self.page.wait_for_load_state('networkidle', timeout=30000)
314              content = await self.page.content()
315              soup = BeautifulSoup(content, 'html.parser')
316
317              default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
318                              '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
319              all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])

321              parsed_base = urlparse(final_url)
322              base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
323
324              for a in soup.find_all('a', href=True):
325 +                href = a['href'].strip()
326
327 +                # Handle PHP scripts and redirects
328 +                if '.php' in href.lower() or 'download' in href.lower():
329 +                    full_url = href if href.startswith('http') else (
330 +                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
331 +                    )
332 +                    real_url = await self.extract_real_download_url(full_url)
333 +                    if real_url and real_url != full_url:
334 +                        found_files.append({
335 +                            'url': real_url,
336 +                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
337 +                            'size': await self.get_file_size(real_url),
338 +                            'metadata': {}
339 +                        })
340                      continue
341 +
342                  # Handle direct file links
343 +                if any(href.lower().endswith(ext) for ext in all_exts):
344 +                    file_url = href if href.startswith('http') else (
345 +                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
346 +                    )
347
348 +                    size_str = await self.get_file_size(file_url)
349 +                    meta = {}
350 +                    if file_url.lower().endswith('.pdf'):
351 +                        meta = await self.get_pdf_metadata(file_url)
352 +
353 +                    found_files.append({
354 +                        'url': file_url,
355 +                        'filename': os.path.basename(file_url.split('?')[0]),
356 +                        'size': size_str,
357 +                        'metadata': meta
358 +                    })
359 +
360 +                # Handle Google Drive links
361 +                elif ("drive.google.com" in href) or ("docs.google.com" in href):
362 +                    file_id = None
363 +                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
364 +                        match = re.search(pattern, href)
365 +                        if match:
366 +                            file_id = match.group(1)
367 +                            break
368 +
369 +                    if file_id:
370 +                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
371 +                        filename = file_id
372 +                        try:
373 +                            response = await self.page.request.head(direct_url, timeout=15000)
374 +                            cd = response.headers.get("Content-Disposition", "")
375 +                            if cd:
376 +                                mt = re.search(r'filename\*?="?([^";]+)', cd)
377 +                                if mt:
378 +                                    filename = mt.group(1).strip('"').strip()
379 +
380 +                            found_files.append({
381 +                                'url': direct_url,
382 +                                'filename': filename,
383 +                                'size': await self.get_file_size(direct_url),
384 +                                'metadata': {}
385 +                            })
386 +                        except Exception as e:
387 +                            logger.error(f"Error processing Google Drive link: {e}")
388
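The Google Drive branch recognises three share-link shapes, extracts the file id, rewrites it to the uc?export=download form, and then tries to recover a filename from a Content-Disposition header. The string handling on its own, as a standalone sketch (the links and header value are made up):

    import re

    DRIVE_PATTERNS = [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']

    def drive_direct_url(href):
        # Same patterns as the diff: pull out the file id and build the direct-download URL.
        for pattern in DRIVE_PATTERNS:
            match = re.search(pattern, href)
            if match:
                return f"https://drive.google.com/uc?export=download&id={match.group(1)}"
        return None

    def filename_from_cd(cd, fallback):
        # Same Content-Disposition parse as the diff; fall back to the file id otherwise.
        mt = re.search(r'filename\*?="?([^";]+)', cd)
        return mt.group(1).strip('"').strip() if mt else fallback

    print(drive_direct_url("https://drive.google.com/file/d/FILE_ID/view?usp=sharing"))
    # https://drive.google.com/uc?export=download&id=FILE_ID
    print(drive_direct_url("https://docs.google.com/open?id=FILE_ID"))
    # https://drive.google.com/uc?export=download&id=FILE_ID
    print(filename_from_cd('attachment; filename="report.pdf"', "FILE_ID"))
    # report.pdf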
389              # Make results unique based on URLs
390              seen_urls = set()

395                  unique_files.append(f)
396
397              return unique_files

398          except Exception as e:
399              logger.error(f"Error extracting files from {url}: {e}")
400              return []
401 +
402      async def download_file(self, file_info, save_dir, referer):
403          file_url = file_info['url']
404          fname = file_info['filename']

412          os.makedirs(save_dir, exist_ok=True)
413
414          try:
415 +            if "drive.google.com" in file_url:
416                  import gdown
417                  try:
418                      st.write(f"Downloading from Google Drive: {fname}")
419 +                    output = gdown.download(file_url, path, quiet=False)
420                      if output:
421                          return path
422                      return None

433                      'Referer': referer
434                  }
435
436 +                response = await page.request.get(file_url, headers=headers, timeout=30000)
437
438                  if response.status == 200:
439                      content = await response.body()

441                      f.write(content)
442                      return path
443                  else:
444 +                    logger.error(f"Download failed with status {response.status}: {file_url}")
445                      return None
446
447          except Exception as e:
448              logger.error(f"Error downloading {file_url}: {e}")
449              return None
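download_file() now hands the Drive URL straight to gdown instead of resolving it first. A minimal usage sketch, assuming gdown is installed; FILE_ID and the output name are placeholders:

    import gdown

    url = "https://drive.google.com/uc?export=download&id=FILE_ID"
    output = gdown.download(url, "downloaded_file.bin", quiet=False)
    print("saved to:", output)   # gdown returns the output path, or None if the download failed

gdown also deals with Google Drive's download-confirmation page for large files, which is presumably why the diff prefers it over a plain HTTP GET for drive.google.com URLs.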
450
451 +    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000):
452          if not custom_ext_list:
453              custom_ext_list = []
454 +
455          progress_text = st.empty()
456          progress_bar = st.progress(0)
457          file_count_text = st.empty()
458
459 +        try:
460 +            # Search main page
461 +            progress_text.text("Analyzing main page...")
462 +            main_files = await self.extract_downloadable_files(url, custom_ext_list)
463 +            initial_count = len(main_files)
464 +            file_count_text.text(f"Found {initial_count} files on main page")
465 +
466 +            # Get and search sublinks
467 +            progress_text.text("Getting sublinks...")
468 +            sublinks = await self.get_sublinks(url, sublink_limit)
469              total_links = len(sublinks)
470 +
471              progress_text.text(f"Found {total_links} sublinks to process")
472 +
473 +            if not sublinks:
474                  progress_bar.progress(1.0)
475 +                return main_files
476 +
477 +            # Process sublinks
478 +            all_files = main_files
479 +
480 +            for i, sublink in enumerate(sublinks, 1):
481 +                progress = i/total_links
482 +                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
483 +                progress_bar.progress(progress)
484 +
485 +                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
486                  all_files.extend(sub_files)
487 +
488 +                # Update count in real-time
489                  file_count_text.text(f"Found {len(all_files)} total files")
490 +
491 +            # Make results unique
492              seen_urls = set()
493              unique_files = []
494 +
495              for f in all_files:
496                  if f['url'] not in seen_urls:
497                      seen_urls.add(f['url'])
498                      unique_files.append(f)
499 +
500              final_count = len(unique_files)
501 +            progress_text.text(f"Deep search complete!")
502              file_count_text.text(f"Found {final_count} unique files")
503              progress_bar.progress(1.0)
504
505              return unique_files
506 +
507          except Exception as e:
508              logger.error(f"Deep search error: {e}")
509              progress_text.text(f"Error during deep search: {str(e)}")
510              return []
511          finally:
512 +            # Clean up progress indicators after a delay
513              await asyncio.sleep(2)
514 +            if not st.session_state.get('keep_progress', False):
515                  progress_text.empty()
516                  progress_bar.empty()
517 +
518 +    async def get_sublinks(self, url, limit=10000):
519 +        try:
520 +            await self.page.goto(url, timeout=30000)
521 +            content = await self.page.content()
522 +            soup = BeautifulSoup(content, 'html.parser')
523 +
524 +            parsed_base = urlparse(url)
525 +            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
526 +
527 +            links = set()
528 +            for a in soup.find_all('a', href=True):
529 +                href = a['href'].strip()
530 +                if href.startswith('http'):
531 +                    links.add(href)
532 +                elif href.startswith('/'):
533 +                    links.add(f"{base_url}{href}")
534 +
535 +            return list(links)[:limit]
536 +
537 +        except Exception as e:
538 +            logger.error(f"Error getting sublinks: {e}")
539 +            return []
540
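Taken together, the reworked methods form a simple pipeline: deep_search() scans the start page, pulls candidate links with get_sublinks(), runs extract_downloadable_files() over each one, and download_file() fetches a chosen result. A hedged usage sketch, assuming DownloadManager can be constructed with default arguments and that an event loop is available (inside Streamlit the app's own wiring applies):

    import asyncio

    async def run_search(url):
        # DownloadManager is the class modified in this diff; default construction is assumed here.
        async with DownloadManager() as dm:
            files = await dm.deep_search(url, custom_ext_list=['.csv'], sublink_limit=100)
            for info in files[:3]:
                print(info['filename'], info['size'], info['url'])
            if files:
                saved = await dm.download_file(files[0], save_dir="./downloads", referer=url)
                print("saved:", saved)

    # asyncio.run(run_search("https://example.com/reports"))   # example.com is a placeholder

With sublink_limit now defaulting to 10000 (and the UI allowing up to 100000), a sequential crawl of a link-heavy page can take a while; passing a smaller limit, as above, keeps test runs quick.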
541  def main():
542      if 'initialized' not in st.session_state:

563          max_sublinks = st.number_input(
564              "Maximum Sublinks to Process",
565              min_value=1,
566 +            max_value=100000,
567 +            value=10000,
568              step=50,
569              help="Maximum number of sublinks to process from the main page",
570              key="max_sublinks_input"