Spaces:

euler314
/

craw_web

Running

App Files Files Community

euler314 committed on Apr 8

Commit

61d830e

verified ·

1 Parent(s): 56dd83d

Create app/utils.py

Browse files

Files changed (1) hide show

app/utils.py +191 -0

app/utils.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import os
+import random
+import re
+import zipfile
+import datetime
+import logging
+import mimetypes
+from urllib.parse import urlparse, parse_qs, quote, unquote
+import requests
# Logging is configured at import time: records at INFO level and above go
# to the file 'app.log' (relative to the working directory) and to a stream
# handler.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler('app.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Pool of browser User-Agent strings that get_random_user_agent() draws from.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0',
]

# Browser "stealth" values. Nothing in this module reads them.
# NOTE(review): presumably consumed by the browser-automation code to spoof
# fingerprint properties -- confirm against the caller.
STEALTH_SETTINGS = {
    "hardware_concurrency": 4,
    "device_memory": 8,
    "webgl_vendor": "Google Inc. (Intel)",
    "webgl_renderer": "Intel Iris OpenGL Engine",
    "languages": ["en-US", "en"],
    "disable_webrtc": True,
    "navigator_platform": "Win32",
    "touch_support": False,
}

# Proxy rotation defaults: disabled, with an empty proxy list.
# NOTE(review): the unit of "rotation_interval" is not visible here
# (requests? seconds?) -- confirm against the code that reads it.
PROXY_ROTATION_CONFIG = {
    "enabled": False,
    "rotation_interval": 10,
    "proxies": [],
}
def get_random_user_agent():
    """Pick one User-Agent string at random.

    Returns:
        A single entry of the module-level ``USER_AGENTS`` list, selected
        with ``random.choice``.
    """
    chosen = random.choice(USER_AGENTS)
    return chosen
def sizeof_fmt(num, suffix='B'):
    """Render a quantity with a 1024-based unit prefix (K, M, G, ... Y).

    Args:
        num: The quantity to format (may be negative).
        suffix: Text appended after the unit prefix; defaults to ``'B'``.

    Returns:
        A string such as ``'1.5KB'`` with one decimal place. Values too
        large for the ``Z`` prefix are reported with ``Y``.
    """
    scaled = num
    for prefix in ('', *'KMGTPEZ'):
        # Written as "not <" rather than ">=" so that a value which never
        # compares below 1024 (e.g. NaN) keeps scaling through every prefix.
        if not abs(scaled) < 1024.0:
            scaled = scaled / 1024.0
            continue
        return f"{scaled:3.1f}{prefix}{suffix}"
    return f"{scaled:.1f}Y{suffix}"
def create_zip_file(file_paths, output_dir):
    """Bundle the given files into a timestamped ZIP archive.

    The archive is named ``downloads_<YYYYmmdd_HHMMSS>.zip`` and is written
    into ``output_dir``, which is created if it does not exist yet. Each
    file is stored flat under its base name. When several inputs share a
    base name, later ones get a ``_1``, ``_2``, ... suffix before the
    extension so that no entry shadows another on extraction.

    Args:
        file_paths: Iterable of paths of the files to add.
        output_dir: Directory in which the archive is created.

    Returns:
        The path of the archive that was written.

    Raises:
        OSError: If an input file cannot be read or the archive cannot be
            written.
    """
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")

    taken_names = set()
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for file_path in file_paths:
            arcname = os.path.basename(file_path)
            stem, ext = os.path.splitext(arcname)
            # Two downloads from different folders can share a base name;
            # writing both under the same member name would lose one of them.
            counter = 1
            while arcname in taken_names:
                arcname = f"{stem}_{counter}{ext}"
                counter += 1
            taken_names.add(arcname)
            zipf.write(file_path, arcname)
    return zip_path
def get_file_extension(url, default='.pdf'):
    """Return the lower-cased file extension found in a URL's path.

    Only the path component is inspected, so query strings and fragments
    do not affect the result.

    Args:
        url: URL (or plain filename) to inspect.
        default: Value returned when the path has no extension.

    Returns:
        The extension including its leading dot (e.g. ``'.zip'``), or
        ``default`` when none is present.
    """
    url_path = urlparse(url).path
    _, suffix = os.path.splitext(url_path)
    return suffix.lower() or default
def humanize_file_size(size_bytes):
    """Format a byte count as a human-readable string.

    Sizes below 1024 are reported verbatim as ``"<n> bytes"``. Larger
    sizes are scaled by powers of 1024 and shown with one decimal place in
    KB, MB, GB or TB. Anything of 1024 TB or more is reported in PB.

    Args:
        size_bytes: Size in bytes.

    Returns:
        A string such as ``'512 bytes'``, ``'1.5 KB'`` or ``'2.0 PB'``.
    """
    if size_bytes < 1024:
        return f"{size_bytes} bytes"

    size = size_bytes
    for unit in ('KB', 'MB', 'GB', 'TB'):
        size /= 1024.0
        if size < 1024.0:
            return f"{size:.1f} {unit}"
    # At this point ``size`` is expressed in TB; scale once more so the
    # number actually matches the PB label.
    return f"{size / 1024.0:.1f} PB"
def get_domain(url):
    """Return the network-location part of a URL.

    Args:
        url: The URL to inspect.

    Returns:
        The ``netloc`` component as produced by ``urlparse`` (host plus
        any port or credentials), or an empty string when the URL has
        none.
    """
    return urlparse(url).netloc
def is_valid_file_url(url, extensions):
    """Check whether a URL points at a file with one of the given extensions.

    The comparison is case-insensitive on both sides. The URL matches when
    either the full URL or just its path component ends with one of the
    extensions, so a trailing query string or fragment
    (``report.pdf?download=1``) does not hide the extension.

    Args:
        url: The URL to test.
        extensions: Iterable of extensions including the leading dot,
            e.g. ``['.pdf', '.zip']``.

    Returns:
        ``True`` if the URL matches any extension, otherwise ``False``.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    if not suffixes:
        return False

    lowered = url.lower()
    if lowered.endswith(suffixes):
        return True

    try:
        path = urlparse(lowered).path
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): nothing more to check.
        return False
    return path.endswith(suffixes)
def detect_captcha(html_content):
    """Report whether a page's HTML contains a known captcha marker.

    The check is a case-insensitive substring search for a fixed list of
    markers, so it is a heuristic: any page that merely mentions one of
    the words (for example "challenge") is reported as a captcha page.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        ``True`` if any marker occurs in the content, otherwise ``False``.
    """
    markers = (
        'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
        'challenge', 'solve the following', 'verify you are human',
    )
    haystack = html_content.lower()
    for marker in markers:
        if marker in haystack:
            return True
    return False
def is_download_link(url):
    """Guess whether a URL is likely to be a download link.

    The URL is run through a series of permissive heuristics, and the
    first one that matches wins:

    1. a generic download-related keyword anywhere in the lower-cased URL;
    2. a known download-script / handler pattern in the lower-cased URL;
    3. a common file extension anywhere in the lower-cased path;
    4. a query-parameter name that usually carries a file reference;
    5. the literal markers ``Action=downloadfile`` or ``fname=``.

    All text checks are substring matches, so short keywords such as
    ``'dl'``, ``'get'`` or ``'doc'`` also match inside unrelated words.

    Args:
        url: The URL to classify.

    Returns:
        ``True`` if any heuristic matches, otherwise ``False``.
    """
    lowered = url.lower()

    # 1. Generic keywords.
    keyword_hints = (
        'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
        'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document',
    )
    for hint in keyword_hints:
        if hint in lowered:
            return True

    # 2. Script / handler names. 'Action=downloadfile' is mixed-case but is
    # compared against the lower-cased URL, so that entry cannot match as
    # written; it is kept to leave the pattern list unchanged.
    handler_hints = (
        'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
        'download.aspx', 'getfile.aspx', 'file.aspx',
        'downloadhandler', 'filehandler', 'filedownload',
        'download.jsp', 'download.cgi', 'download.do',
        'download-file', 'get-file',
        'downloadfile', 'getfile', 'viewfile',
        'Action=downloadfile', 'action=download', 'action=view',
        'download?', 'file?', 'get?', 'view?',
    )
    for hint in handler_hints:
        if hint in lowered:
            return True

    # The URL is only parsed once the cheap string checks have failed.
    parts = urlparse(url)

    # 3. File extensions, matched anywhere in the path (not only at its end).
    lowered_path = parts.path.lower()
    known_suffixes = (
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
        '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov',
    )
    for suffix in known_suffixes:
        if suffix in lowered_path:
            return True

    # 4. Query-parameter names that typically identify a file.
    file_params = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid']
    for key in parse_qs(parts.query):
        if key.lower() in file_params:
            return True

    # 5. Case-sensitive markers checked against the original URL.
    return 'Action=downloadfile' in url or 'fname=' in url
def normalize_download_url(url):
    """Normalize a download URL so that its path contains no raw spaces.

    URLs containing ``Action=downloadfile`` together with ``file=``, and
    URLs containing ``fname=``, are returned untouched. For every other
    URL, a path containing spaces is percent-encoded. Existing ``%``
    characters are left alone, so already-encoded sequences are not
    double-encoded and a partially encoded path (``/a%20b c.pdf``) still
    gets its remaining spaces escaped. The URL is then reassembled from
    its parsed components.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL, or the original ``url`` unchanged if anything
        goes wrong while processing it (the error is logged).
    """
    try:
        parsed = urlparse(url)

        # Special download endpoints are passed through verbatim.
        if 'Action=downloadfile' in url and 'file=' in url:
            return url
        if 'fname=' in url:
            return url

        path = parsed.path
        if ' ' in path:
            # '%' stays in the safe set so existing escapes survive intact.
            path = quote(path, safe='/%')

        return parsed._replace(path=path).geturl()
    except Exception as e:
        # Best effort: a URL that cannot be normalized is used as given.
        logger.error(f"Error normalizing URL {url}: {e}")
        return url
def show_user_friendly_error(error_type, details, suggestion=None):
    """Render an error message, with an optional suggestion, via Streamlit.

    Streamlit is imported inside the function, so importing this module
    does not require it.

    Args:
        error_type: Short label for the error; shown in bold as the
            heading of the error box.
        details: Explanation written beneath the heading.
        suggestion: Optional hint; when truthy it is shown in an info box
            prefixed with "Suggestion".
    """
    import streamlit as st

    headline = f"**{error_type}**"
    # NOTE(review): this uses the value returned by st.error() as a context
    # manager; whether nested elements render inside the error box depends
    # on the installed Streamlit version -- confirm.
    with st.error(headline):
        st.write(details)
        if suggestion:
            hint = f"**Suggestion**: {suggestion}"
            st.info(hint)