"""
Configuration settings for AI Web Scraper
Centralized configuration management for security, performance, and features
"""
import os
from typing import Dict, List, Optional
from dataclasses import dataclass


@dataclass
class SecurityConfig:
    """Security-related configuration"""
    # URL validation settings
    allowed_schemes: Optional[List[str]] = None
    blocked_domains: Optional[List[str]] = None
    max_url_length: int = 2048

    # Rate limiting
    requests_per_minute: int = 30
    requests_per_hour: int = 500

    # Content safety
    max_content_size: int = 10 * 1024 * 1024  # 10MB
    max_processing_time: int = 60  # seconds

    def __post_init__(self):
        if self.allowed_schemes is None:
            self.allowed_schemes = ['http', 'https']
        if self.blocked_domains is None:
            # Loopback addresses plus the RFC 1918 private ranges
            # (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16), matched as
            # hostname prefixes in Config.is_url_allowed()
            self.blocked_domains = [
                'localhost', '127.0.0.1', '0.0.0.0',
                '192.168.', '10.', '172.16.', '172.17.',
                '172.18.', '172.19.', '172.20.', '172.21.',
                '172.22.', '172.23.', '172.24.', '172.25.',
                '172.26.', '172.27.', '172.28.', '172.29.',
                '172.30.', '172.31.'
            ]


@dataclass
class ModelConfig:
    """AI model configuration"""
    # Primary summarization model
    primary_model: str = "facebook/bart-large-cnn"

    # Fallback model for faster processing
    fallback_model: str = "sshleifer/distilbart-cnn-12-6"

    # Model parameters
    max_input_length: int = 1024
    max_summary_length: int = 500
    min_summary_length: int = 30

    # Performance settings
    device: str = "auto"  # auto, cpu, cuda
    batch_size: int = 1
    use_fast_tokenizer: bool = True
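

# A minimal sketch of how ModelConfig could drive a Hugging Face summarization
# pipeline (an assumption about the consuming code, not part of this module;
# `article_text` is a placeholder, and `transformers` must be installed):
#
#   from transformers import pipeline
#   summarizer = pipeline(
#       "summarization",
#       model=config.models.primary_model,
#       device=0 if config.get_model_device() == "cuda" else -1,
#   )
#   summary = summarizer(
#       article_text,
#       max_length=config.models.max_summary_length,
#       min_length=config.models.min_summary_length,
#   )[0]["summary_text"]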


@dataclass
class ScrapingConfig:
    """Web scraping configuration"""
    # Request settings
    timeout: int = 15
    max_retries: int = 3
    retry_delay: int = 1

    # User agent string
    user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)"

    # Content extraction
    min_content_length: int = 100
    max_content_length: int = 100000

    # Robots.txt settings
    respect_robots_txt: bool = True
    robots_cache_duration: int = 3600  # seconds


@dataclass
class UIConfig:
    """User interface configuration"""
    # Default values
    default_summary_length: int = 300
    max_summary_length: int = 500
    min_summary_length: int = 100

    # Interface settings
    enable_batch_processing: bool = True
    max_batch_size: int = 10
    show_advanced_options: bool = False

    # Export settings
    supported_export_formats: Optional[List[str]] = None

    def __post_init__(self):
        if self.supported_export_formats is None:
            self.supported_export_formats = ["CSV", "JSON"]


class Config:
    """Main configuration class"""

    def __init__(self):
        self.security = SecurityConfig()
        self.models = ModelConfig()
        self.scraping = ScrapingConfig()
        self.ui = UIConfig()

        # Load from environment variables if available
        self._load_from_env()

    def _load_from_env(self):
        """Load configuration from environment variables"""
        # Security settings
        if os.getenv('MAX_REQUESTS_PER_MINUTE'):
            self.security.requests_per_minute = int(os.getenv('MAX_REQUESTS_PER_MINUTE'))
        if os.getenv('MAX_CONTENT_SIZE'):
            self.security.max_content_size = int(os.getenv('MAX_CONTENT_SIZE'))

        # Model settings
        if os.getenv('PRIMARY_MODEL'):
            self.models.primary_model = os.getenv('PRIMARY_MODEL')
        if os.getenv('FALLBACK_MODEL'):
            self.models.fallback_model = os.getenv('FALLBACK_MODEL')
        if os.getenv('DEVICE'):
            self.models.device = os.getenv('DEVICE')

        # Scraping settings
        if os.getenv('REQUEST_TIMEOUT'):
            self.scraping.timeout = int(os.getenv('REQUEST_TIMEOUT'))
        if os.getenv('USER_AGENT'):
            self.scraping.user_agent = os.getenv('USER_AGENT')
        if os.getenv('RESPECT_ROBOTS_TXT'):
            self.scraping.respect_robots_txt = os.getenv('RESPECT_ROBOTS_TXT').lower() == 'true'
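
    # Example (hypothetical values) of overriding these defaults from a shell
    # before launching the app:
    #
    #   export PRIMARY_MODEL="sshleifer/distilbart-cnn-12-6"
    #   export REQUEST_TIMEOUT=20
    #   export RESPECT_ROBOTS_TXT=true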

    def get_model_device(self) -> str:
        """Get the appropriate device for model inference"""
        if self.models.device == "auto":
            try:
                import torch
                return "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                return "cpu"
        return self.models.device

    def is_url_allowed(self, url: str) -> bool:
        """Check if URL is allowed based on security settings"""
        from urllib.parse import urlparse
        try:
            parsed = urlparse(url)

            # Check scheme
            if parsed.scheme not in self.security.allowed_schemes:
                return False

            # Check blocked domains; entries are matched as hostname prefixes
            # (plain substring matching would also reject legitimate hosts
            # such as 'ftp10.example.com' via the '10.' entry)
            hostname = parsed.hostname or ''
            for blocked in self.security.blocked_domains:
                if hostname.startswith(blocked):
                    return False

            # Check URL length
            if len(url) > self.security.max_url_length:
                return False

            return True
        except Exception:
            return False
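
    # Illustrative results with the default SecurityConfig (example URLs only):
    #
    #   config.is_url_allowed("https://example.com/article")  -> True
    #   config.is_url_allowed("http://192.168.1.5/admin")     -> False (private range)
    #   config.is_url_allowed("ftp://example.com/file.txt")   -> False (scheme not allowed)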

    def get_request_headers(self) -> Dict[str, str]:
        """Get standard request headers"""
        return {
            'User-Agent': self.scraping.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
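

# A minimal sketch of how the scraping settings and headers would typically be
# applied to an outgoing request (assuming the scraper uses the `requests`
# library, which this module does not mandate):
#
#   import requests
#   response = requests.get(
#       url,
#       headers=config.get_request_headers(),
#       timeout=config.scraping.timeout,
#   )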


# Global configuration instance
config = Config()

# Environment-specific overrides for Hugging Face Spaces
if os.getenv('SPACE_ID'):
    # Running on Hugging Face Spaces
    config.models.device = "auto"
    config.security.requests_per_minute = 20  # More conservative on shared infrastructure
    config.scraping.timeout = 10  # Shorter timeout on shared infrastructure

    # Enable GPU if available
    if os.getenv('CUDA_VISIBLE_DEVICES'):
        config.models.device = "cuda"

# Development mode overrides
if os.getenv('ENVIRONMENT') == 'development':
    config.security.requests_per_minute = 100
    config.scraping.timeout = 30
    config.ui.show_advanced_options = True
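

# Quick usage check: run this module directly to print the resolved settings
# (illustrative URLs only)
if __name__ == "__main__":
    print("Inference device:", config.get_model_device())
    print("User-Agent:", config.get_request_headers()["User-Agent"])
    print("https://example.com allowed:", config.is_url_allowed("https://example.com"))
    print("http://127.0.0.1 allowed:", config.is_url_allowed("http://127.0.0.1"))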