"""
⚙️ Configuration settings for AI Dataset Studio with Perplexity integration
"""

import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration.

    Holds API connection details, rate-limit numbers, source-count bounds,
    quality thresholds and the prompt templates used to search for dataset
    sources.

    Notes:
        * ``api_key`` is read from the ``PERPLEXITY_API_KEY`` environment
          variable each time an instance is created (unless a value is passed
          explicitly) and is excluded from ``repr()`` so the secret is not
          written to logs by accident.
        * ``search_templates`` may be passed explicitly; when left as ``None``
          it is filled with the built-in templates in ``__post_init__``. Every
          built-in template uses the ``{max_sources}`` and
          ``{project_description}`` placeholders.
    """

    # API Configuration
    api_key: Optional[str] = field(
        default_factory=lambda: os.getenv('PERPLEXITY_API_KEY'),
        repr=False,  # never print the secret
    )
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"

    # Rate Limiting
    requests_per_minute: int = 30
    request_timeout: int = 30
    max_retries: int = 3
    min_request_interval: float = 1.0  # seconds

    # Search Configuration
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5

    # Quality Thresholds
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000  # 10MB

    # Search Templates (None -> built-in templates, see __post_init__)
    search_templates: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Install the built-in search templates when none were supplied."""
        if self.search_templates is None:
            self.search_templates = self._default_search_templates()

    @staticmethod
    def _default_search_templates() -> Dict[str, str]:
        """Return a fresh dict of the built-in templates, keyed by task type."""
        return {
            "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",

            "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",

            "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",

            "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",

            "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",

            "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
""",
        }

@dataclass
class ScrapingConfig:
    """Settings controlling how web pages are requested and filtered.

    ``user_agents`` and ``blocked_domains`` default to ``None``; any of the
    two still ``None`` after construction is replaced with a built-in list in
    ``__post_init__``, so each instance gets its own list objects.
    """

    # --- HTTP request behaviour ---
    timeout: int = 15
    max_retries: int = 3
    retry_delay: float = 1.0

    # --- Throttling ---
    requests_per_second: float = 0.5  # deliberately conservative
    burst_requests: int = 5

    # --- Accepted page size ---
    min_content_length: int = 100
    max_content_length: int = 1_000_000  # 1MB per page

    # --- User-Agent strings available for rotation (None -> built-in list) ---
    user_agents: Optional[List[str]] = None

    # --- Host fragments that must not be fetched (None -> built-in list) ---
    blocked_domains: Optional[List[str]] = None

    # --- Content extraction switches ---
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False

    def __post_init__(self):
        """Replace list fields that were left as ``None`` with their defaults."""
        built_in = {
            'user_agents': [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            ],
            # Loopback names, private-range prefixes and internal-looking
            # sub-domain prefixes.
            'blocked_domains': [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.',
            ],
        }
        for attr_name, fallback in built_in.items():
            if getattr(self, attr_name) is None:
                setattr(self, attr_name, fallback)

@dataclass
class ModelConfig:
    """Model identifiers and inference settings for the AI models.

    Plain value holder: it performs no validation and loads nothing itself.
    """

    # --- Primary model identifiers, one per task ---
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # --- Lighter / faster alternatives for the same tasks ---
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"

    # --- Hardware selection ---
    device: str = "auto"  # one of: auto, cpu, cuda
    use_gpu: bool = True
    max_memory_mb: int = 4000

    # --- Inference parameters ---
    max_sequence_length: int = 512
    batch_size: int = 8
    confidence_threshold: float = 0.7

    # --- Model caching ---
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"

@dataclass
class ExportConfig:
    """Configuration for dataset export.

    Notes:
        ``hf_token`` is read from the ``HF_TOKEN`` environment variable each
        time an instance is created (unless a value is passed explicitly) and
        is excluded from ``repr()`` so the token is not written to logs by
        accident.
    """

    # File settings
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"

    # Format-specific settings
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # csv.QUOTE_ALL

    # HuggingFace dataset settings
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"  # expects a `timestamp` format field
    hf_private: bool = True
    hf_token: Optional[str] = field(
        default_factory=lambda: os.getenv('HF_TOKEN'),
        repr=False,  # never print the secret
    )

    # Metadata inclusion
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True

@dataclass
class SecurityConfig:
    """Safety-related switches and limits.

    Plain value holder; the flags take effect only where other code reads
    them.
    """

    # --- Which URL targets are acceptable ---
    allow_local_urls: bool = False
    allow_private_ips: bool = False
    max_redirects: int = 5

    # --- Content screening ---
    filter_adult_content: bool = True
    filter_spam: bool = True
    max_duplicate_content: float = 0.8  # similarity threshold

    # --- Politeness towards remote sites ---
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True

    # --- Additional safety checks ---
    scan_for_malware: bool = False  # needs extra dependencies
    validate_ssl: bool = True

@dataclass
class UIConfig:
    """Presentation options for the user interface.

    Plain value holder with no behaviour of its own.
    """

    # --- Look and feel ---
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False

    # --- Preview and progress display ---
    max_preview_items: int = 10
    preview_text_length: int = 200
    show_progress_bars: bool = True

    # --- Optional / advanced features ---
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True

# Top-level configuration container (the shared instance is created after the class)
class Config:
    """Main configuration class combining all settings.

    Bundles one instance of each per-area configuration object together with
    application-level settings. ``debug`` and ``log_level`` are read from the
    ``DEBUG`` and ``LOG_LEVEL`` environment variables at construction time.
    """

    def __init__(self):
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()

        # Application settings
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        # Only the literal string "true" (any letter case) enables debug.
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'

        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'

    def is_perplexity_enabled(self) -> bool:
        """Return True when a non-empty Perplexity API key is configured."""
        return bool(self.perplexity.api_key)

    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Return the Perplexity search template for *template_type*, formatted.

        Args:
            template_type: Key into ``self.perplexity.search_templates``.
            **kwargs: Values for the template's ``str.format`` placeholders.

        Returns:
            The formatted template, or ``""`` when the key is unknown or the
            stored template is empty.

        Raises:
            KeyError: If the template uses a placeholder not given in *kwargs*.
        """
        template = self.perplexity.search_templates.get(template_type, "")
        if not template:
            return ""
        return template.format(**kwargs)

    def _blocked_host_patterns(self) -> List[str]:
        """Collect lower-cased, non-empty blocked-host patterns.

        The list lives on the scraping config. A ``blocked_domains`` attribute
        on the security config is honoured too when one exists, so code that
        attaches such a list at runtime is respected.
        """
        collected = []
        for section in (self.security, self.scraping):
            collected.extend(getattr(section, 'blocked_domains', None) or [])
        return [entry.lower() for entry in collected if entry]

    @staticmethod
    def _host_matches(host: str, pattern: str, is_ip: bool) -> bool:
        """Tell whether *pattern* matches *host* on a label boundary.

        A pattern ending in ``.`` (``'10.'``, ``'staging.'``) is a prefix: it
        matches at the start of the host or, for DNS names, at the start of any
        label. Any other pattern must equal the host or, for DNS names, a whole
        run of labels inside it. Matching never starts mid-label, so ``'10.'``
        does not hit ``example10.com`` and ``'test.'`` does not hit
        ``latest.example.com``.
        """
        if pattern.endswith('.'):
            if host.startswith(pattern):
                return True
            return not is_ip and f'.{pattern}' in host
        if host == pattern:
            return True
        return not is_ip and f'.{pattern}.' in f'.{host}.'

    def validate_url(self, url: str) -> bool:
        """Validate URL against security settings.

        A URL is accepted when all of the following hold:

        * it is a string with an ``http`` or ``https`` scheme and a hostname;
        * its hostname matches no blocked-host pattern (this check is
          independent of the ``allow_*`` flags);
        * unless ``security.allow_local_urls`` is set, the host is not
          ``localhost`` / ``*.localhost`` nor a loopback or unspecified IP;
        * unless ``security.allow_private_ips`` is set, the host is not a
          private or link-local IP literal.

        The check is purely textual: no DNS lookup is made, so a public name
        that resolves to a private address is not detected here.

        Args:
            url: The URL to check.

        Returns:
            True if the URL passes every check, otherwise False. Malformed
            input never raises.
        """
        from ipaddress import ip_address
        from urllib.parse import urlparse

        if not isinstance(url, str):
            return False

        try:
            parsed = urlparse(url)
            # ``hostname`` drops userinfo and port and is already lower-cased.
            host = (parsed.hostname or '').rstrip('.')
        except ValueError:
            # e.g. malformed bracketed IPv6 literal
            return False

        if parsed.scheme not in ('http', 'https') or not host:
            return False

        try:
            address = ip_address(host)
        except ValueError:
            address = None  # a DNS name rather than an IP literal
        else:
            # Judge IPv4-mapped IPv6 (::ffff:a.b.c.d) by the embedded IPv4.
            mapped = getattr(address, 'ipv4_mapped', None)
            if mapped is not None:
                address = mapped

        is_ip = address is not None
        for pattern in self._blocked_host_patterns():
            if self._host_matches(host, pattern, is_ip):
                return False

        points_at_self = is_ip and (address.is_loopback or address.is_unspecified)

        if not self.security.allow_local_urls:
            if host == 'localhost' or host.endswith('.localhost') or points_at_self:
                return False

        if not self.security.allow_private_ips:
            # Loopback/unspecified are governed by allow_local_urls above.
            if is_ip and not points_at_self and (address.is_private or address.is_link_local):
                return False

        return True

# Module-wide shared configuration object
config = Config()

# Shortcuts to the individual sections of the shared configuration
(
    PERPLEXITY_CONFIG,
    SCRAPING_CONFIG,
    MODEL_CONFIG,
    EXPORT_CONFIG,
    SECURITY_CONFIG,
    UI_CONFIG,
) = (
    config.perplexity,
    config.scraping,
    config.models,
    config.export,
    config.security,
    config.ui,
)