# craw_web/utils.py
import os
import random
import re
import zipfile
import datetime
import logging
import mimetypes
from urllib.parse import urlparse, parse_qs, quote, unquote
import requests
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('app.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# User agent list
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0'
]
# Stealth browser settings
STEALTH_SETTINGS = {
"hardware_concurrency": 4,
"device_memory": 8,
"webgl_vendor": "Google Inc. (Intel)",
"webgl_renderer": "Intel Iris OpenGL Engine",
"languages": ["en-US", "en"],
"disable_webrtc": True,
"navigator_platform": "Win32",
"touch_support": False
}
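# These values are intended to be applied to the crawler's browser context to
# reduce automation fingerprinting; the dict is not consumed within this module
# (assumed usage, inferred from the setting names).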
# Proxy rotation configuration
PROXY_ROTATION_CONFIG = {
"enabled": False,
"rotation_interval": 10,
"proxies": []
}
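# Illustrative sketch (hypothetical caller-side code, not used in this module):
# rotate to the next proxy every `rotation_interval` requests, where
# `request_count` is a counter the caller maintains.
#
#   cfg = PROXY_ROTATION_CONFIG
#   if cfg["enabled"] and cfg["proxies"]:
#       proxy = cfg["proxies"][(request_count // cfg["rotation_interval"]) % len(cfg["proxies"])]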
def get_random_user_agent():
"""Return a random user agent from the list"""
return random.choice(USER_AGENTS)
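# Example usage (illustrative): pair a random User-Agent with a requests call.
#
#   headers = {'User-Agent': get_random_user_agent()}
#   response = requests.get(url, headers=headers, timeout=30)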
def sizeof_fmt(num, suffix='B'):
"""Format file size in human-readable format"""
for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
if abs(num) < 1024.0:
return f"{num:3.1f}{unit}{suffix}"
num /= 1024.0
return f"{num:.1f}Y{suffix}"
def create_zip_file(file_paths, output_dir):
"""Create a ZIP file containing the given files"""
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
with zipfile.ZipFile(zip_path, 'w') as zipf:
for file_path in file_paths:
zipf.write(file_path, os.path.basename(file_path))
return zip_path
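# Example usage (illustrative paths):
#
#   zip_path = create_zip_file(['downloads/a.pdf', 'downloads/b.pdf'], 'downloads')
#   # -> 'downloads/downloads_<timestamp>.zip' containing a.pdf and b.pdf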
def get_file_extension(url, default='.pdf'):
"""Extract file extension from URL or filename"""
path = urlparse(url).path
ext = os.path.splitext(path)[1].lower()
if not ext:
return default
return ext
def humanize_file_size(size_bytes):
"""Format file size in human-readable format"""
if size_bytes < 1024:
return f"{size_bytes} bytes"
    for unit in ['KB', 'MB', 'GB', 'TB']:
        size_bytes /= 1024.0
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
    # Values still >= 1024 TB need one more division before being labelled PB
    return f"{size_bytes / 1024.0:.1f} PB"
def get_domain(url):
"""Extract domain from URL"""
parsed = urlparse(url)
return parsed.netloc
def is_valid_file_url(url, extensions):
"""Check if URL is a valid file URL based on extension"""
return any(url.lower().endswith(ext) for ext in extensions)
def detect_captcha(html_content):
"""Detect common captcha patterns in HTML content"""
captcha_patterns = [
'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
'challenge', 'solve the following', 'verify you are human'
]
html_lower = html_content.lower()
return any(pattern in html_lower for pattern in captcha_patterns)
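# Example (illustrative): detect_captcha('<div class="g-recaptcha">...</div>') returns True.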
def is_download_link(url):
"""Enhanced function to detect if a URL is likely a download link"""
url_lower = url.lower()
# Check for common download-related terms
download_terms = [
'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document'
]
if any(term in url_lower for term in download_terms):
return True
# Check for common download script patterns
script_patterns = [
'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
'download.aspx', 'getfile.aspx', 'file.aspx',
'downloadhandler', 'filehandler', 'filedownload',
'download.jsp', 'download.cgi', 'download.do',
'download-file', 'get-file',
'downloadfile', 'getfile', 'viewfile',
        'action=downloadfile', 'action=download', 'action=view',
'download?', 'file?', 'get?', 'view?'
]
if any(pattern in url_lower for pattern in script_patterns):
return True
# Check for common file extensions
path = urlparse(url).path
common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
'.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
'.png', '.gif', '.mp3', '.mp4', '.avi', '.mov']
if any(ext in path.lower() for ext in common_extensions):
return True
# Check for file parameters in URL
params = parse_qs(urlparse(url).query)
param_keys = params.keys()
file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid']
if any(key.lower() in file_param_indicators for key in param_keys):
return True
# Check for complex encoding patterns
    if 'action=downloadfile' in url_lower or 'fname=' in url_lower:
        return True
return False
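# Examples (illustrative):
#   is_download_link('https://example.com/report.pdf')          -> True  (file extension)
#   is_download_link('https://example.com/download.php?id=42')  -> True  (download keyword/script)
#   is_download_link('https://example.com/about')                -> False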
def normalize_download_url(url):
"""Normalize download URLs to handle various formats and encodings"""
try:
parsed = urlparse(url)
        # Handle phpMyAdmin-style encoded URLs: keep them as-is so the encoded
        # file parameter is not re-quoted (matched case-insensitively)
        if 'action=downloadfile' in url.lower() and 'file=' in url.lower():
            return url
        # Handle URLs with an fname parameter the same way
        if 'fname=' in url.lower():
            return url
# Quote the path portion if needed
path = parsed.path
if '%' not in path and ' ' in path:
path = quote(path)
# Reconstruct the URL
normalized = parsed._replace(path=path).geturl()
return normalized
except Exception as e:
logger.error(f"Error normalizing URL {url}: {e}")
return url
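# Example (illustrative): a path containing spaces gets percent-encoded, e.g.
#   normalize_download_url('https://example.com/my files/report.pdf')
#   -> 'https://example.com/my%20files/report.pdf'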
def show_user_friendly_error(error_type, details, suggestion=None):
    """Display a user-friendly error message with an optional suggestion"""
    import streamlit as st
    st.error(f"**{error_type}**\n\n{details}")
    if suggestion:
        st.info(f"**Suggestion**: {suggestion}")
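# Example usage (illustrative message text):
#   show_user_friendly_error("Download failed", "The server returned HTTP 403.",
#                            suggestion="Try enabling proxy rotation or a different user agent.")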