""" Configuration settings for AI Web Scraper Centralized configuration management for security, performance, and features """ import os from typing import Dict, List, Optional from dataclasses import dataclass @dataclass class SecurityConfig: """Security-related configuration""" # URL validation settings allowed_schemes: List[str] = None blocked_domains: List[str] = None max_url_length: int = 2048 # Rate limiting requests_per_minute: int = 30 requests_per_hour: int = 500 # Content safety max_content_size: int = 10 * 1024 * 1024 # 10MB max_processing_time: int = 60 # seconds def __post_init__(self): if self.allowed_schemes is None: self.allowed_schemes = ['http', 'https'] if self.blocked_domains is None: self.blocked_domains = [ 'localhost', '127.0.0.1', '0.0.0.0', '192.168.', '10.', '172.16.', '172.17.', '172.18.', '172.19.', '172.20.', '172.21.', '172.22.', '172.23.', '172.24.', '172.25.', '172.26.', '172.27.', '172.28.', '172.29.', '172.30.', '172.31.' ] @dataclass class ModelConfig: """AI model configuration""" # Primary summarization model primary_model: str = "facebook/bart-large-cnn" # Fallback model for faster processing fallback_model: str = "sshleifer/distilbart-cnn-12-6" # Model parameters max_input_length: int = 1024 max_summary_length: int = 500 min_summary_length: int = 30 # Performance settings device: str = "auto" # auto, cpu, cuda batch_size: int = 1 use_fast_tokenizer: bool = True @dataclass class ScrapingConfig: """Web scraping configuration""" # Request settings timeout: int = 15 max_retries: int = 3 retry_delay: int = 1 # User agent string user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)" # Content extraction min_content_length: int = 100 max_content_length: int = 100000 # Robots.txt settings respect_robots_txt: bool = True robots_cache_duration: int = 3600 # seconds @dataclass class UIConfig: """User interface configuration""" # Default values default_summary_length: int = 300 max_summary_length: int = 500 min_summary_length: int = 100 # Interface settings enable_batch_processing: bool = True max_batch_size: int = 10 show_advanced_options: bool = False # Export settings supported_export_formats: List[str] = None def __post_init__(self): if self.supported_export_formats is None: self.supported_export_formats = ["CSV", "JSON"] class Config: """Main configuration class""" def __init__(self): self.security = SecurityConfig() self.models = ModelConfig() self.scraping = ScrapingConfig() self.ui = UIConfig() # Load from environment variables if available self._load_from_env() def _load_from_env(self): """Load configuration from environment variables""" # Security settings if os.getenv('MAX_REQUESTS_PER_MINUTE'): self.security.requests_per_minute = int(os.getenv('MAX_REQUESTS_PER_MINUTE')) if os.getenv('MAX_CONTENT_SIZE'): self.security.max_content_size = int(os.getenv('MAX_CONTENT_SIZE')) # Model settings if os.getenv('PRIMARY_MODEL'): self.models.primary_model = os.getenv('PRIMARY_MODEL') if os.getenv('FALLBACK_MODEL'): self.models.fallback_model = os.getenv('FALLBACK_MODEL') if os.getenv('DEVICE'): self.models.device = os.getenv('DEVICE') # Scraping settings if os.getenv('REQUEST_TIMEOUT'): self.scraping.timeout = int(os.getenv('REQUEST_TIMEOUT')) if os.getenv('USER_AGENT'): self.scraping.user_agent = os.getenv('USER_AGENT') if os.getenv('RESPECT_ROBOTS_TXT'): self.scraping.respect_robots_txt = os.getenv('RESPECT_ROBOTS_TXT').lower() == 'true' def get_model_device(self) -> str: """Get the appropriate device for model inference""" if 
self.models.device == "auto": try: import torch return "cuda" if torch.cuda.is_available() else "cpu" except ImportError: return "cpu" return self.models.device def is_url_allowed(self, url: str) -> bool: """Check if URL is allowed based on security settings""" from urllib.parse import urlparse try: parsed = urlparse(url) # Check scheme if parsed.scheme not in self.security.allowed_schemes: return False # Check blocked domains hostname = parsed.hostname or '' for blocked in self.security.blocked_domains: if blocked in hostname: return False # Check URL length if len(url) > self.security.max_url_length: return False return True except Exception: return False def get_request_headers(self) -> Dict[str, str]: """Get standard request headers""" return { 'User-Agent': self.scraping.user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', } # Global configuration instance config = Config() # Environment-specific overrides for Hugging Face Spaces if os.getenv('SPACE_ID'): # Running on Hugging Face Spaces config.models.device = "auto" config.security.requests_per_minute = 20 # More conservative on shared infrastructure config.scraping.timeout = 10 # Shorter timeout on shared infrastructure # Enable GPU if available if os.getenv('CUDA_VISIBLE_DEVICES'): config.models.device = "cuda" # Development mode overrides if os.getenv('ENVIRONMENT') == 'development': config.security.requests_per_minute = 100 config.scraping.timeout = 30 config.ui.show_advanced_options = True
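

# Illustrative usage (a minimal sketch, not part of the module's public API):
# how application code might consult the global `config` instance. The URLs
# below are hypothetical examples, not endpoints the scraper targets.
if __name__ == "__main__":
    print("Model device:", config.get_model_device())
    print("Request headers:", config.get_request_headers())

    for url in ("https://example.com/article", "http://192.168.1.1/admin"):
        verdict = "allowed" if config.is_url_allowed(url) else "blocked"
        print(f"{url} -> {verdict}")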