"""
Configuration settings for AI Web Scraper

Centralized configuration management for security, performance, and features
"""
|
|
|
import os |
|
from typing import Dict, List, Optional |
|
from dataclasses import dataclass |
|
|
|
@dataclass
class SecurityConfig:
    """Security-related limits and URL filtering rules.

    Attributes:
        allowed_schemes: URL schemes accepted for scraping. ``None`` (the
            default) is replaced with ``['http', 'https']``.
        blocked_domains: Host names and address prefixes that must not be
            fetched. ``None`` (the default) is replaced with loopback,
            private-range and link-local entries.
        max_url_length: Longest URL, in characters, that is accepted.
        requests_per_minute: Request budget per minute.
        requests_per_hour: Request budget per hour.
        max_content_size: Upper bound on content size (presumably bytes --
            confirm with the code that enforces it).
        max_processing_time: Upper bound on processing time (presumably
            seconds -- confirm with the code that enforces it).
    """

    # URL validation. None is a sentinel meaning "use the built-in defaults";
    # a literal list default would be shared between instances.
    allowed_schemes: Optional[List[str]] = None
    blocked_domains: Optional[List[str]] = None
    max_url_length: int = 2048

    # Rate limiting
    requests_per_minute: int = 30
    requests_per_hour: int = 500

    # Resource caps
    max_content_size: int = 10 * 1024 * 1024
    max_processing_time: int = 60

    def __post_init__(self) -> None:
        """Replace ``None`` list fields with freshly built default lists."""
        if self.allowed_schemes is None:
            self.allowed_schemes = ['http', 'https']

        if self.blocked_domains is None:
            self.blocked_domains = [
                'localhost', '127.0.0.1', '0.0.0.0',
                '192.168.', '10.',
                # 172.16.0.0/12 covers second octets 16 through 31.
                *(f'172.{octet}.' for octet in range(16, 32)),
                # Link-local range; cloud metadata services live here
                # (e.g. 169.254.169.254), a common SSRF target.
                '169.254.',
            ]
|
|
|
@dataclass
class ModelConfig:
    """Settings for the AI models.

    Field order is part of the generated constructor signature, so fields
    are only grouped and commented here, never reordered.
    """

    # Model identifiers: the preferred model and the one to fall back to.
    primary_model: str = "facebook/bart-large-cnn"
    fallback_model: str = "sshleifer/distilbart-cnn-12-6"

    # Length limits (units are presumably tokens -- confirm with the caller).
    max_input_length: int = 1024
    max_summary_length: int = 500
    min_summary_length: int = 30

    # Runtime options. "auto" is resolved by Config.get_model_device().
    device: str = "auto"
    batch_size: int = 1
    use_fast_tokenizer: bool = True
|
|
|
@dataclass
class ScrapingConfig:
    """Settings for the web scraping layer.

    Field order is part of the generated constructor signature, so fields
    are only grouped and commented here, never reordered.
    """

    # Request behaviour (timeout/delay units are presumably seconds --
    # confirm with the HTTP client code).
    timeout: int = 15
    max_retries: int = 3
    retry_delay: int = 1

    # Value sent as the User-Agent header (see Config.get_request_headers).
    user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)"

    # Bounds on extracted content length.
    min_content_length: int = 100
    max_content_length: int = 100000

    # robots.txt handling (cache duration presumably in seconds -- confirm).
    respect_robots_txt: bool = True
    robots_cache_duration: int = 3600
|
|
|
@dataclass
class UIConfig:
    """Settings for the user interface.

    Attributes:
        default_summary_length: Summary length preselected in the UI.
        max_summary_length: Largest summary length offered.
        min_summary_length: Smallest summary length offered.
        enable_batch_processing: Whether batch processing is enabled.
        max_batch_size: Largest batch accepted.
        show_advanced_options: Whether advanced options are shown.
        supported_export_formats: Export format names. ``None`` (the
            default) is replaced with ``["CSV", "JSON"]``.
    """

    # Summary length controls
    default_summary_length: int = 300
    max_summary_length: int = 500
    min_summary_length: int = 100

    # Feature toggles
    enable_batch_processing: bool = True
    max_batch_size: int = 10
    show_advanced_options: bool = False

    # None is a sentinel for "use the built-in list"; the annotation is
    # Optional because None really is an accepted value here.
    supported_export_formats: Optional[List[str]] = None

    def __post_init__(self) -> None:
        """Replace a ``None`` format list with a freshly built default."""
        if self.supported_export_formats is None:
            self.supported_export_formats = ["CSV", "JSON"]
|
|
|
class Config:
    """Aggregate application configuration.

    Holds one instance each of the security, model, scraping and UI
    settings and applies environment-variable overrides on construction.
    """

    # RESPECT_ROBOTS_TXT values (compared case-insensitively) that turn the
    # flag on. Any other non-empty value turns it off.
    _TRUTHY_ENV_VALUES = frozenset({'1', 'true', 'yes', 'on'})

    def __init__(self):
        self.security = SecurityConfig()
        self.models = ModelConfig()
        self.scraping = ScrapingConfig()
        self.ui = UIConfig()

        # Environment variables take precedence over the class defaults.
        self._load_from_env()

    @staticmethod
    def _env_int(name: str) -> Optional[int]:
        """Return environment variable *name* as an int.

        Returns:
            The parsed integer, or ``None`` when the variable is unset or
            empty.

        Raises:
            ValueError: If the variable holds a non-integer value. The
                message names the variable so the bad setting is easy to
                find.
        """
        raw = os.getenv(name)
        if not raw:
            return None
        try:
            return int(raw)
        except ValueError as err:
            raise ValueError(
                f"Environment variable {name} must be an integer, got {raw!r}"
            ) from err

    def _load_from_env(self):
        """Load configuration from environment variables.

        Unset or empty variables leave the current value untouched.

        Raises:
            ValueError: If MAX_REQUESTS_PER_MINUTE, MAX_CONTENT_SIZE or
                REQUEST_TIMEOUT is set to a non-integer value.
        """
        # (variable, target object, attribute, reader)
        overrides = (
            ('MAX_REQUESTS_PER_MINUTE', self.security,
             'requests_per_minute', self._env_int),
            ('MAX_CONTENT_SIZE', self.security,
             'max_content_size', self._env_int),
            ('PRIMARY_MODEL', self.models, 'primary_model', os.getenv),
            ('FALLBACK_MODEL', self.models, 'fallback_model', os.getenv),
            ('DEVICE', self.models, 'device', os.getenv),
            ('REQUEST_TIMEOUT', self.scraping, 'timeout', self._env_int),
            ('USER_AGENT', self.scraping, 'user_agent', os.getenv),
        )
        for env_name, target, attr, read in overrides:
            value = read(env_name)
            # Compare against None/'' explicitly so an integer 0 still counts
            # as a real override.
            if value is None or value == '':
                continue
            setattr(target, attr, value)

        robots = os.getenv('RESPECT_ROBOTS_TXT')
        if robots:
            # Accept the usual truthy spellings; matching only 'true' would
            # make RESPECT_ROBOTS_TXT=1 silently disable robots.txt handling.
            self.scraping.respect_robots_txt = (
                robots.strip().lower() in self._TRUTHY_ENV_VALUES
            )

    def get_model_device(self) -> str:
        """Get the appropriate device for model inference.

        Returns:
            The configured device name. When it is "auto", returns "cuda"
            if torch can be imported and reports CUDA as available, and
            "cpu" otherwise (including when torch is not installed).
        """
        if self.models.device != "auto":
            return self.models.device

        try:
            import torch
        except ImportError:
            return "cpu"
        return "cuda" if torch.cuda.is_available() else "cpu"

    @staticmethod
    def _host_matches(hostname: str, blocked: str) -> bool:
        """Return True when *hostname* falls under the *blocked* entry.

        Entries ending in '.' (such as '192.168.') are treated as address
        prefixes and match hosts that start with them. Any other entry
        matches that exact host or one of its subdomains. Substring
        matching is deliberately avoided: it rejects unrelated hosts such
        as 'top10.com' or '110.20.30.40' because they contain '10.'.
        """
        blocked = blocked.strip().lower()
        if not blocked:
            # An empty entry carries no information; ignore it.
            return False
        if blocked.endswith('.'):
            return hostname.startswith(blocked)
        return hostname == blocked or hostname.endswith('.' + blocked)

    def is_url_allowed(self, url: str) -> bool:
        """Check if URL is allowed based on security settings.

        A URL is allowed when it is no longer than ``max_url_length``, its
        scheme is in ``allowed_schemes``, it has a host, and that host does
        not match any entry in ``blocked_domains``.

        Returns:
            True if the URL passes every check, False otherwise. Any error
            while checking also yields False.
        """
        from urllib.parse import urlparse

        try:
            if len(url) > self.security.max_url_length:
                return False

            parsed = urlparse(url)
            if parsed.scheme not in self.security.allowed_schemes:
                return False

            # urlparse lower-cases the host; drop a trailing root dot so
            # 'localhost.' is treated the same as 'localhost'.
            hostname = (parsed.hostname or '').rstrip('.')
            if not hostname:
                # No host means there is nothing to fetch.
                return False

            return not any(
                self._host_matches(hostname, blocked)
                for blocked in self.security.blocked_domains
            )
        except Exception:
            # Fail closed: this is a security gate, so any unexpected error
            # (malformed URL, wrong argument type) denies the URL.
            return False

    def get_request_headers(self) -> Dict[str, str]:
        """Get standard request headers.

        Returns:
            A new dict of HTTP headers whose User-Agent comes from the
            scraping configuration.
        """
        return {
            'User-Agent': self.scraping.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
|
|
|
|
|
# Shared, module-level configuration instance.
config = Config()

# NOTE: the presets below run after Config() has already applied its
# environment-variable overrides, so any value they assign replaces the
# corresponding override.

# Preset applied when SPACE_ID is set (presumably a hosted Hugging Face
# Space -- confirm): automatic device selection with tighter limits.
if os.getenv('SPACE_ID'):
    config.models.device = "auto"
    config.security.requests_per_minute = 20
    config.scraping.timeout = 10

# Use the GPU when CUDA devices are exposed to the process. The value "-1"
# is the conventional way to hide every device, so it must not select "cuda".
_cuda_visible = os.getenv('CUDA_VISIBLE_DEVICES', '').strip()
if _cuda_visible and _cuda_visible != '-1':
    config.models.device = "cuda"

# Development preset: relaxed limits and advanced UI options.
if os.getenv('ENVIRONMENT') == 'development':
    config.security.requests_per_minute = 100
    config.scraping.timeout = 30
    config.ui.show_advanced_options = True