"""
Configuration settings for AI Web Scraper
Centralized configuration management for security, performance, and features
"""
import os
from typing import Dict, List, Optional
from dataclasses import dataclass


@dataclass
class SecurityConfig:
    """Security-related configuration"""
    # URL validation settings
    allowed_schemes: Optional[List[str]] = None
    blocked_domains: Optional[List[str]] = None
    max_url_length: int = 2048

    # Rate limiting
    requests_per_minute: int = 30
    requests_per_hour: int = 500

    # Content safety
    max_content_size: int = 10 * 1024 * 1024  # 10MB
    max_processing_time: int = 60  # seconds

    def __post_init__(self):
        if self.allowed_schemes is None:
            self.allowed_schemes = ['http', 'https']
        if self.blocked_domains is None:
            # Loopback addresses plus the RFC 1918 private ranges
            # (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16), matched as
            # hostname prefixes in Config.is_url_allowed()
            self.blocked_domains = [
                'localhost', '127.0.0.1', '0.0.0.0',
                '192.168.', '10.', '172.16.', '172.17.',
                '172.18.', '172.19.', '172.20.', '172.21.',
                '172.22.', '172.23.', '172.24.', '172.25.',
                '172.26.', '172.27.', '172.28.', '172.29.',
                '172.30.', '172.31.'
            ]


@dataclass
class ModelConfig:
    """AI model configuration"""
    # Primary summarization model
    primary_model: str = "facebook/bart-large-cnn"

    # Fallback model for faster processing
    fallback_model: str = "sshleifer/distilbart-cnn-12-6"

    # Model parameters
    max_input_length: int = 1024
    max_summary_length: int = 500
    min_summary_length: int = 30

    # Performance settings
    device: str = "auto"  # auto, cpu, cuda
    batch_size: int = 1
    use_fast_tokenizer: bool = True
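

# A minimal sketch of how ModelConfig could drive a Hugging Face summarization
# pipeline (an assumption about the consuming code, not part of this module;
# `article_text` is a placeholder, and `transformers` must be installed):
#
#   from transformers import pipeline
#   summarizer = pipeline(
#       "summarization",
#       model=config.models.primary_model,
#       device=0 if config.get_model_device() == "cuda" else -1,
#   )
#   summary = summarizer(
#       article_text,
#       max_length=config.models.max_summary_length,
#       min_length=config.models.min_summary_length,
#   )[0]["summary_text"]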


@dataclass
class ScrapingConfig:
    """Web scraping configuration"""
    # Request settings
    timeout: int = 15
    max_retries: int = 3
    retry_delay: int = 1

    # User agent string
    user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)"

    # Content extraction
    min_content_length: int = 100
    max_content_length: int = 100000

    # Robots.txt settings
    respect_robots_txt: bool = True
    robots_cache_duration: int = 3600  # seconds


@dataclass
class UIConfig:
    """User interface configuration"""
    # Default values
    default_summary_length: int = 300
    max_summary_length: int = 500
    min_summary_length: int = 100

    # Interface settings
    enable_batch_processing: bool = True
    max_batch_size: int = 10
    show_advanced_options: bool = False

    # Export settings
    supported_export_formats: Optional[List[str]] = None

    def __post_init__(self):
        if self.supported_export_formats is None:
            self.supported_export_formats = ["CSV", "JSON"]


class Config:
    """Main configuration class"""

    def __init__(self):
        self.security = SecurityConfig()
        self.models = ModelConfig()
        self.scraping = ScrapingConfig()
        self.ui = UIConfig()

        # Load from environment variables if available
        self._load_from_env()

    def _load_from_env(self):
        """Load configuration from environment variables"""
        # Security settings
        if os.getenv('MAX_REQUESTS_PER_MINUTE'):
            self.security.requests_per_minute = int(os.getenv('MAX_REQUESTS_PER_MINUTE'))
        if os.getenv('MAX_CONTENT_SIZE'):
            self.security.max_content_size = int(os.getenv('MAX_CONTENT_SIZE'))

        # Model settings
        if os.getenv('PRIMARY_MODEL'):
            self.models.primary_model = os.getenv('PRIMARY_MODEL')
        if os.getenv('FALLBACK_MODEL'):
            self.models.fallback_model = os.getenv('FALLBACK_MODEL')
        if os.getenv('DEVICE'):
            self.models.device = os.getenv('DEVICE')

        # Scraping settings
        if os.getenv('REQUEST_TIMEOUT'):
            self.scraping.timeout = int(os.getenv('REQUEST_TIMEOUT'))
        if os.getenv('USER_AGENT'):
            self.scraping.user_agent = os.getenv('USER_AGENT')
        if os.getenv('RESPECT_ROBOTS_TXT'):
            self.scraping.respect_robots_txt = os.getenv('RESPECT_ROBOTS_TXT').lower() == 'true'
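
    # Example (hypothetical values) of overriding these defaults from a shell
    # before launching the app:
    #
    #   export PRIMARY_MODEL="sshleifer/distilbart-cnn-12-6"
    #   export REQUEST_TIMEOUT=20
    #   export RESPECT_ROBOTS_TXT=true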

    def get_model_device(self) -> str:
        """Get the appropriate device for model inference"""
        if self.models.device == "auto":
            try:
                import torch
                return "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                return "cpu"
        return self.models.device

    def is_url_allowed(self, url: str) -> bool:
        """Check if URL is allowed based on security settings"""
        from urllib.parse import urlparse
        try:
            parsed = urlparse(url)

            # Check scheme
            if parsed.scheme not in self.security.allowed_schemes:
                return False

            # Check blocked domains; entries are matched as hostname prefixes
            # (plain substring matching would also reject legitimate hosts
            # such as 'ftp10.example.com' via the '10.' entry)
            hostname = parsed.hostname or ''
            for blocked in self.security.blocked_domains:
                if hostname.startswith(blocked):
                    return False

            # Check URL length
            if len(url) > self.security.max_url_length:
                return False

            return True
        except Exception:
            return False
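
    # Illustrative results with the default SecurityConfig (example URLs only):
    #
    #   config.is_url_allowed("https://example.com/article")  -> True
    #   config.is_url_allowed("http://192.168.1.5/admin")     -> False (private range)
    #   config.is_url_allowed("ftp://example.com/file.txt")   -> False (scheme not allowed)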

    def get_request_headers(self) -> Dict[str, str]:
        """Get standard request headers"""
        return {
            'User-Agent': self.scraping.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
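

# A minimal sketch of how the scraping settings and headers would typically be
# applied to an outgoing request (assuming the scraper uses the `requests`
# library, which this module does not mandate):
#
#   import requests
#   response = requests.get(
#       url,
#       headers=config.get_request_headers(),
#       timeout=config.scraping.timeout,
#   )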


# Global configuration instance
config = Config()

# Environment-specific overrides for Hugging Face Spaces
if os.getenv('SPACE_ID'):
    # Running on Hugging Face Spaces
    config.models.device = "auto"
    config.security.requests_per_minute = 20  # More conservative on shared infrastructure
    config.scraping.timeout = 10  # Shorter timeout on shared infrastructure

    # Enable GPU if available
    if os.getenv('CUDA_VISIBLE_DEVICES'):
        config.models.device = "cuda"

# Development mode overrides
if os.getenv('ENVIRONMENT') == 'development':
    config.security.requests_per_minute = 100
    config.scraping.timeout = 30
    config.ui.show_advanced_options = True
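

# Quick usage check: run this module directly to print the resolved settings
# (illustrative URLs only)
if __name__ == "__main__":
    print("Inference device:", config.get_model_device())
    print("User-Agent:", config.get_request_headers()["User-Agent"])
    print("https://example.com allowed:", config.is_url_allowed("https://example.com"))
    print("http://127.0.0.1 allowed:", config.is_url_allowed("http://127.0.0.1"))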