|
import os |
|
import random |
|
import re |
|
import zipfile |
|
import datetime |
|
import logging |
|
import mimetypes |
|
from urllib.parse import urlparse, parse_qs, quote, unquote |
|
import requests |
|
|
|
|
|
# Root logging configuration, applied at import time: records at INFO level
# and above use a "timestamp - level - message" layout and are sent to both
# the file 'app.log' and a default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('app.log'), logging.StreamHandler()],
)

# Module-level logger used by the helpers in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
# Pool of browser User-Agent header values; get_random_user_agent() draws
# from this list.
USER_AGENTS = [
    # Chrome 115 on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    # Safari 16.4 on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    # Chrome 115 on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    # Firefox 115 on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
    # Edge 116 on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
    # Safari 16.6 on iPhone
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    # Safari 16.6 on iPad
    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    # Chrome 115 on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    # Opera 102 on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0',
]
|
|
|
|
|
# Browser-fingerprint style settings.
# NOTE(review): the consumer of this dict is not visible in this file; the key
# names suggest values reported by a spoofed browser profile -- confirm.
STEALTH_SETTINGS = {
    # Hardware / device characteristics
    "hardware_concurrency": 4,
    "device_memory": 8,
    # WebGL identification strings
    "webgl_vendor": "Google Inc. (Intel)",
    "webgl_renderer": "Intel Iris OpenGL Engine",
    # Locale, networking and platform flags
    "languages": ["en-US", "en"],
    "disable_webrtc": True,
    "navigator_platform": "Win32",
    "touch_support": False,
}
|
|
|
|
|
# Default proxy-rotation configuration: rotation is switched off and the
# proxy list starts out empty.
PROXY_ROTATION_CONFIG = {
    "enabled": False,
    # NOTE(review): the unit of the interval (requests? seconds?) is not
    # established anywhere in this file -- confirm with the consumer.
    "rotation_interval": 10,
    "proxies": [],
}
|
|
|
def get_random_user_agent():
    """Pick one entry from the module-level USER_AGENTS list at random.

    Returns:
        One of the User-Agent strings stored in USER_AGENTS.
    """
    candidates = USER_AGENTS
    return random.choice(candidates)
|
|
|
def sizeof_fmt(num, suffix='B'):
    """Render a byte count as a compact string using 1024-based prefixes.

    Args:
        num: Size to format; negative values keep their sign.
        suffix: Text appended after the unit prefix (default 'B').

    Returns:
        A string such as "1.5KB" or "0.0B"; values too large for the 'Z'
        prefix are reported with 'Y'.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    value = num
    for prefix in prefixes:
        if abs(value) >= 1024.0:
            # Still too large for this prefix: scale down and try the next.
            value /= 1024.0
            continue
        return f"{value:3.1f}{prefix}{suffix}"
    # Every listed prefix was exhausted, so the value is now in 'Y' units.
    return f"{value:.1f}Y{suffix}"
|
|
|
def create_zip_file(file_paths, output_dir):
    """Bundle the given files into a timestamped ZIP archive.

    The archive is written to ``output_dir`` as
    ``downloads_<YYYYmmdd_HHMMSS>.zip``. Each file is stored flat under its
    base name. When two inputs share a base name, later ones receive a
    numeric suffix (``report.txt``, ``report_1.txt``, ...) so that no member
    is shadowed when the archive is extracted.

    Args:
        file_paths: Iterable of paths of the files to add.
        output_dir: Existing directory in which the archive is created.

    Returns:
        The path of the created ZIP file.

    Raises:
        OSError: If an input file or the output directory cannot be accessed.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")

    used_names = set()
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file_path in file_paths:
            arcname = os.path.basename(file_path)
            if arcname in used_names:
                # Same base name already archived: pick the first free
                # "<stem>_<n><ext>" so both files survive extraction.
                stem, ext = os.path.splitext(arcname)
                counter = 1
                while f"{stem}_{counter}{ext}" in used_names:
                    counter += 1
                arcname = f"{stem}_{counter}{ext}"
            used_names.add(arcname)
            zipf.write(file_path, arcname)

    return zip_path
|
|
|
def get_file_extension(url, default='.pdf'):
    """Return the lowercased extension of the path component of ``url``.

    Only the URL path is inspected, so query strings and fragments do not
    influence the result.

    Args:
        url: URL (or plain file name) to inspect.
        default: Value returned when the path carries no extension.

    Returns:
        The extension including its leading dot (e.g. ".pdf"), or ``default``.
    """
    url_path = urlparse(url).path
    _, suffix = os.path.splitext(url_path)
    suffix = suffix.lower()
    return suffix or default
|
|
|
def humanize_file_size(size_bytes):
    """Format a byte count as a human-readable string.

    Sizes below 1024 are reported verbatim as "<n> bytes". Larger sizes are
    scaled by powers of 1024 and shown with one decimal and a KB/MB/GB/TB/PB
    unit, e.g. "1.5 KB".

    Args:
        size_bytes: Size in bytes.

    Returns:
        The formatted size string.
    """
    if size_bytes < 1024:
        return f"{size_bytes} bytes"

    for unit in ('KB', 'MB', 'GB', 'TB'):
        size_bytes /= 1024.0
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"

    # At this point the value is expressed in TB and is >= 1024 TB; it must be
    # scaled once more before being labelled PB (previously 1 PB was rendered
    # as "1024.0 PB").
    return f"{size_bytes / 1024.0:.1f} PB"
|
|
|
def get_domain(url):
    """Return the network-location part (``netloc``) of ``url``.

    The value is exactly what ``urlparse`` reports, so it may include a port
    or user info, and is empty when the URL has no ``//`` authority section.
    """
    return urlparse(url).netloc
|
|
|
def is_valid_file_url(url, extensions):
    """Check whether ``url`` ends with one of the given file extensions.

    The comparison is case-insensitive on both sides: the URL and every
    extension are lowercased before matching (previously only the URL was,
    so an extension such as ".PDF" could never match).

    Args:
        url: URL string to test; the whole string is matched, including any
            query string.
        extensions: Iterable of extension strings, e.g. [".pdf", ".zip"].

    Returns:
        True if the lowercased URL ends with any lowercased extension,
        False otherwise (including when ``extensions`` is empty).
    """
    # str.endswith accepts a tuple, so one call covers every extension.
    suffixes = tuple(ext.lower() for ext in extensions)
    return url.lower().endswith(suffixes)
|
|
|
def detect_captcha(html_content):
    """Report whether the HTML text contains a known captcha marker.

    The check is a case-insensitive substring search for a fixed set of
    markers (captcha widget names and typical challenge phrases).

    Args:
        html_content: HTML source as a string.

    Returns:
        True if at least one marker occurs in the content, else False.
    """
    lowered = html_content.lower()
    markers = (
        'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
        'challenge', 'solve the following', 'verify you are human',
    )
    for marker in markers:
        if marker in lowered:
            return True
    return False
|
|
|
def is_download_link(url):
    """Heuristically decide whether ``url`` looks like a download link.

    The checks run in order and the first hit returns True:

    1. a generic download-related keyword occurs anywhere in the lowercased
       URL;
    2. a known download script/handler pattern occurs in the lowercased URL;
    3. a common file extension occurs anywhere in the lowercased URL path;
    4. a query parameter name (case-insensitive) is a typical file indicator;
    5. the raw URL contains 'Action=downloadfile' or 'fname='.

    Args:
        url: URL string to classify.

    Returns:
        True if any heuristic matches, otherwise False.
    """
    lowered = url.lower()

    # 1. Generic keywords, matched as plain substrings of the whole URL.
    keyword_hints = (
        'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
        'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document',
    )
    for hint in keyword_hints:
        if hint in lowered:
            return True

    # 2. Script / handler names typically used for serving files.
    # NOTE(review): 'Action=downloadfile' contains an uppercase letter and is
    # compared against the lowercased URL, so that entry can never match here.
    endpoint_hints = (
        'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
        'download.aspx', 'getfile.aspx', 'file.aspx',
        'downloadhandler', 'filehandler', 'filedownload',
        'download.jsp', 'download.cgi', 'download.do',
        'download-file', 'get-file',
        'downloadfile', 'getfile', 'viewfile',
        'Action=downloadfile', 'action=download', 'action=view',
        'download?', 'file?', 'get?', 'view?',
    )
    for hint in endpoint_hints:
        if hint in lowered:
            return True

    # The URL is parsed only after the substring checks, as in the original
    # flow, so URLs already matched above never reach urlparse.
    parts = urlparse(url)

    # 3. Common file extensions, looked up anywhere inside the path.
    lowered_path = parts.path.lower()
    known_suffixes = (
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
        '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov',
    )
    for suffix in known_suffixes:
        if suffix in lowered_path:
            return True

    # 4. Query parameter names that usually identify a file.
    indicator_keys = ('file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid')
    for param_name in parse_qs(parts.query):
        if param_name.lower() in indicator_keys:
            return True

    # 5. Special-cased patterns, matched case-sensitively on the raw URL.
    return 'Action=downloadfile' in url or 'fname=' in url
|
|
|
def normalize_download_url(url):
    """Return a normalized form of a download URL.

    URLs that contain both 'Action=downloadfile' and 'file=', or that contain
    'fname=', are returned untouched. For every other URL the path is
    percent-encoded when it contains a space and no '%' character, and the
    URL is then rebuilt from its parsed components.

    Args:
        url: URL string to normalize.

    Returns:
        The normalized URL, or the original ``url`` if any error occurs
        (the error is logged).
    """
    try:
        parts = urlparse(url)

        # Special download endpoints are passed through exactly as given.
        passthrough = ('Action=downloadfile' in url and 'file=' in url) or 'fname=' in url
        if passthrough:
            return url

        raw_path = parts.path
        # A '%' is taken as a sign the path is already encoded, so quoting is
        # skipped in that case.
        needs_quoting = ' ' in raw_path and '%' not in raw_path
        safe_path = quote(raw_path) if needs_quoting else raw_path

        return parts._replace(path=safe_path).geturl()
    except Exception as e:
        # Best effort: report the problem and fall back to the input URL.
        logger.error(f"Error normalizing URL {url}: {e}")
        return url
|
|
|
def show_user_friendly_error(error_type, details, suggestion=None):
    """Render an error message, its details and an optional hint via Streamlit.

    Args:
        error_type: Short error title; shown in bold inside the error element.
        details: Explanation passed to ``st.write``.
        suggestion: Optional hint; when truthy it is shown in an info element
            prefixed with "**Suggestion**: ".
    """
    # Function-scope import: Streamlit is only loaded when this helper runs.
    import streamlit as st

    headline = f"**{error_type}**"
    # NOTE(review): the original indentation was lost; nesting the details and
    # the suggestion inside the `with` block is assumed -- confirm.
    with st.error(headline):
        st.write(details)
        if suggestion:
            hint = f"**Suggestion**: {suggestion}"
            st.info(hint)