euler314 committed on
Commit
61d830e
·
verified ·
1 Parent(s): 56dd83d

Create app/utils.py

Browse files
Files changed (1) hide show
  1. app/utils.py +191 -0
app/utils.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import re
4
+ import zipfile
5
+ import datetime
6
+ import logging
7
+ import mimetypes
8
+ from urllib.parse import urlparse, parse_qs, quote, unquote
9
+ import requests
10
+
11
# Logging configuration.
# Every record at INFO level or above is formatted as
# "<timestamp> - <level> - <message>" and sent to two handlers:
# a file handler writing to 'app.log' and a stream handler.
# Note: logging.basicConfig() does nothing if the root logger already has
# handlers installed when this module is imported.
logging.basicConfig(
    handlers=[logging.FileHandler('app.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
21
+
22
# Pool of User-Agent header values; get_random_user_agent() picks one of
# these at random. Order is kept stable so seeded random picks stay stable.
USER_AGENTS = [
    # Chrome 115 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    # Safari 16.4 on macOS 12
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    # Chrome 115 on Linux
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    # Firefox 115 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
    # Edge 116 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54",
    # Mobile Safari 16.6 on iPhone
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",
    # Mobile Safari 16.6 on iPad
    "Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",
    # Chrome 115 on macOS 10.15
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    # Opera 102 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0",
]
34
+
35
# Browser-fingerprint values for the "stealth" browser setup.
# This module only defines the mapping; it does not apply the values itself.
STEALTH_SETTINGS = dict(
    hardware_concurrency=4,
    device_memory=8,
    webgl_vendor="Google Inc. (Intel)",
    webgl_renderer="Intel Iris OpenGL Engine",
    languages=["en-US", "en"],
    disable_webrtc=True,
    navigator_platform="Win32",
    touch_support=False,
)
46
+
47
# Default proxy-rotation settings: rotation is switched off, the interval
# is 10 and the proxy list starts out empty.
PROXY_ROTATION_CONFIG = dict(
    enabled=False,
    rotation_interval=10,
    proxies=[],
)
53
+
54
def get_random_user_agent():
    """Pick one User-Agent string at random.

    Returns:
        One element of the module-level ``USER_AGENTS`` list, selected
        with ``random.choice``.
    """
    chosen_agent = random.choice(USER_AGENTS)
    return chosen_agent
57
+
58
def sizeof_fmt(num, suffix='B'):
    """Render a size using 1024-based unit prefixes.

    Args:
        num: The size to format (int or float).
        suffix: Text appended after the unit prefix; defaults to ``'B'``.

    Returns:
        A string such as ``"1.5KB"``. The value is shown with one decimal
        and no space before the unit. Sizes too large for the ``Z`` prefix
        are reported with ``Y``.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    position = 0
    while position < len(prefixes):
        # Stop at the first prefix where the magnitude drops below 1024.
        if abs(num) < 1024.0:
            return f"{num:3.1f}{prefixes[position]}{suffix}"
        num = num / 1024.0
        position += 1
    # Every prefix up to Z was exhausted.
    return f"{num:.1f}Y{suffix}"
65
+
66
def create_zip_file(file_paths, output_dir):
    """Bundle the given files into a timestamped ZIP archive.

    Each file is stored flat (no directory structure) under its base name.
    When two inputs share a base name, later ones are stored as
    ``name_1.ext``, ``name_2.ext``, ... so no archive entry shadows another.

    Args:
        file_paths: Iterable of paths to the files to add.
        output_dir: Directory in which the archive is written. It is
            created if it does not exist yet.

    Returns:
        Path of the created archive, ``<output_dir>/downloads_<timestamp>.zip``.

    Raises:
        OSError: If an input file cannot be read or the archive cannot be
            written (e.g. ``FileNotFoundError`` for a missing input).
    """
    # An empty output_dir means "current directory"; makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")

    used_names = set()
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file_path in file_paths:
            arcname = os.path.basename(file_path)
            if arcname in used_names:
                # Same base name seen before: append the first free counter.
                stem, ext = os.path.splitext(arcname)
                counter = 1
                while f"{stem}_{counter}{ext}" in used_names:
                    counter += 1
                arcname = f"{stem}_{counter}{ext}"
            used_names.add(arcname)
            zipf.write(file_path, arcname)
    return zip_path
74
+
75
def get_file_extension(url, default='.pdf'):
    """Return the lower-cased file extension found in a URL's path.

    Only the path component is inspected, so query strings and fragments
    are ignored.

    Args:
        url: URL (or plain filename) to inspect.
        default: Value returned when the path has no extension.

    Returns:
        The extension including its leading dot (e.g. ``".pdf"``), or
        ``default`` when none is present.
    """
    url_path = urlparse(url).path
    _, suffix = os.path.splitext(url_path)
    return suffix.lower() or default
82
+
83
def humanize_file_size(size_bytes):
    """Format a byte count as a human-readable string.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        ``"<n> bytes"`` for values below 1024, otherwise the value scaled
        by powers of 1024 with one decimal and a unit from KB to PB,
        e.g. ``"1.5 KB"``. Anything of 1024 TB or more is reported in PB.
    """
    if size_bytes < 1024:
        return f"{size_bytes} bytes"
    for unit in ['KB', 'MB', 'GB', 'TB']:
        size_bytes /= 1024.0
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
    # The loop leaves the value expressed in TB; one more division is
    # needed before it can be labelled as PB.
    return f"{size_bytes / 1024.0:.1f} PB"
92
+
93
def get_domain(url):
    """Return the network-location part of a URL.

    Args:
        url: The URL to inspect.

    Returns:
        The ``netloc`` component as produced by ``urlparse`` (it includes
        any port or credentials present), or ``""`` when the URL has none.
    """
    return urlparse(url).netloc
97
+
98
def is_valid_file_url(url, extensions):
    """Check whether a URL points at a file with one of the given extensions.

    The comparison is case-insensitive on both the URL and the extensions.
    A URL matches when either the whole URL or just its path component
    ends with one of the extensions, so a trailing query string or
    fragment (``.../a.pdf?token=1``) no longer hides the extension.

    Args:
        url: The URL to test.
        extensions: Iterable of extensions including the dot, e.g.
            ``[".pdf", ".zip"]``.

    Returns:
        ``True`` if the URL matches any extension, ``False`` otherwise
        (including when ``extensions`` is empty).
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    if not suffixes:
        return False

    url_lower = url.lower()
    if url_lower.endswith(suffixes):
        return True

    # Also look at the path alone, ignoring any query string or fragment.
    try:
        path_lower = urlparse(url_lower).path
    except ValueError:
        # urlparse rejects some malformed URLs; treat those as no match.
        return False
    return path_lower.endswith(suffixes)
101
+
102
def detect_captcha(html_content):
    """Report whether HTML text contains a common captcha marker.

    The check is a case-insensitive substring search against a fixed list
    of marker phrases.

    Args:
        html_content: HTML source as a string.

    Returns:
        ``True`` if any marker occurs in the text, ``False`` otherwise.
    """
    haystack = html_content.lower()
    markers = (
        'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
        'challenge', 'solve the following', 'verify you are human',
    )
    for marker in markers:
        if marker in haystack:
            return True
    return False
110
+
111
def is_download_link(url):
    """Heuristically decide whether a URL looks like a download link.

    The checks run in this order and the first hit returns ``True``:

    1. a download-related keyword anywhere in the lower-cased URL,
    2. a known download-script pattern anywhere in the lower-cased URL,
    3. a common file extension anywhere in the lower-cased path,
    4. a query parameter whose name suggests a file reference,
    5. the literal markers ``Action=downloadfile`` or ``fname=``.

    All text checks are substring matches, so the heuristic is
    deliberately permissive.

    Args:
        url: The URL to classify.

    Returns:
        ``True`` if any check matches, ``False`` otherwise.
    """
    lowered = url.lower()

    def mentions_any(fragments, text):
        # True when at least one fragment occurs as a substring of text.
        return any(fragment in text for fragment in fragments)

    # 1. Generic download-related keywords.
    keywords = (
        'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
        'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document',
    )
    if mentions_any(keywords, lowered):
        return True

    # 2. Typical server-side download script names and query prefixes.
    # 'Action=downloadfile' has a capital letter, so it cannot match the
    # lower-cased URL; 'downloadfile' in the same list covers that case.
    script_markers = (
        'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
        'download.aspx', 'getfile.aspx', 'file.aspx',
        'downloadhandler', 'filehandler', 'filedownload',
        'download.jsp', 'download.cgi', 'download.do',
        'download-file', 'get-file',
        'downloadfile', 'getfile', 'viewfile',
        'Action=downloadfile', 'action=download', 'action=view',
        'download?', 'file?', 'get?', 'view?',
    )
    if mentions_any(script_markers, lowered):
        return True

    # 3. Common file extensions anywhere in the path component.
    parts = urlparse(url)
    known_extensions = (
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
        '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov',
    )
    if mentions_any(known_extensions, parts.path.lower()):
        return True

    # 4. Query parameter names that usually carry a file reference.
    file_param_names = ('file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid')
    for param_name in parse_qs(parts.query):
        if param_name.lower() in file_param_names:
            return True

    # 5. Case-sensitive markers checked against the original URL.
    return 'Action=downloadfile' in url or 'fname=' in url
158
+
159
def normalize_download_url(url):
    """Return a cleaned-up form of a download URL.

    URLs containing ``fname=``, or both ``Action=downloadfile`` and
    ``file=``, are returned untouched. Otherwise, if the path contains a
    space and no ``%`` character, the path is percent-encoded with
    ``quote`` and the URL is rebuilt from its parsed parts.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL. If anything goes wrong, the error is logged
        and the input is returned as given.
    """
    try:
        parts = urlparse(url)

        # Special encoded download URLs are passed through as they are.
        is_action_download = 'Action=downloadfile' in url and 'file=' in url
        if is_action_download or 'fname=' in url:
            return url

        # Quote only when there are raw spaces and no sign of existing
        # percent-encoding, so already-encoded paths are not encoded twice.
        raw_path = parts.path
        needs_quoting = ' ' in raw_path and '%' not in raw_path
        safe_path = quote(raw_path) if needs_quoting else raw_path

        return parts._replace(path=safe_path).geturl()
    except Exception as e:
        logger.error(f"Error normalizing URL {url}: {e}")
        return url
184
+
185
def show_user_friendly_error(error_type, details, suggestion=None):
    """Show an error in the Streamlit UI, optionally with a suggestion.

    Args:
        error_type: Short label for the error; rendered in bold.
        details: Content passed to ``st.write`` below the headline.
        suggestion: Optional hint; when truthy it is rendered with
            ``st.info`` prefixed by "Suggestion".
    """
    # Imported inside the function, so importing this module does not
    # itself import Streamlit.
    import streamlit as st

    headline = f"**{error_type}**"
    # NOTE(review): the st.error(...) result is used as a context manager
    # here - confirm this renders as intended in the Streamlit version in use.
    with st.error(headline):
        st.write(details)
        if suggestion:
            st.info(f"**Suggestion**: {suggestion}")