"""
⚙️ Configuration settings for AI Dataset Studio with Perplexity integration
"""

import ipaddress
import os
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Union
from urllib.parse import urlparse

_IPAddress = Union[ipaddress.IPv4Address, ipaddress.IPv6Address]

# Built-in Perplexity prompt templates, keyed by task type. Each template is
# rendered with ``str.format`` and expects the ``max_sources`` and
# ``project_description`` placeholders.
_DEFAULT_SEARCH_TEMPLATES: Dict[str, str] = {
    "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",
    "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",
    "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",
    "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",
    "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",
    "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
""",
}

# Default User-Agent headers offered for rotation by the scraper.
_DEFAULT_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
)

# Default host patterns that are never fetched. Entries ending in "." are
# prefixes (IP prefix or leading host label); other entries are exact hosts.
_DEFAULT_BLOCKED_DOMAINS = (
    'localhost',
    '127.0.0.1',
    '0.0.0.0',
    '10.',
    '172.',
    '192.168.',
    'internal.',
    'staging.',
    'test.',
    'dev.',
)


def _parse_ip(host: str) -> Optional[_IPAddress]:
    """Return *host* parsed as an IP address, or ``None`` if it is not a literal IP."""
    try:
        return ipaddress.ip_address(host)
    except ValueError:
        return None


def _host_is_blocked(host: str, patterns: Iterable[str]) -> bool:
    """Return True if the lowercase *host* matches any blocklist pattern.

    Patterns ending in ``.`` are prefixes: they match at the start of the host
    and, for non-IP hosts, at any label boundary (``test.`` matches
    ``test.example.com`` and ``api.test.example.com`` but not
    ``latest.example.com``). Other patterns match the host exactly or, for
    non-IP hosts, as a parent domain. Empty patterns are ignored.
    """
    host_is_ip = _parse_ip(host) is not None
    for raw_pattern in patterns:
        pattern = raw_pattern.lower()
        if not pattern:
            continue
        if pattern.endswith("."):
            if host.startswith(pattern):
                return True
            # Inner-label matches only make sense for names; on an IP literal
            # ".10." would wrongly hit addresses such as 8.8.10.1.
            if not host_is_ip and f".{pattern}" in host:
                return True
        elif host == pattern or (not host_is_ip and host.endswith(f".{pattern}")):
            return True
    return False


@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration."""

    # API Configuration
    # Read from the environment when the instance is created, not at import time.
    api_key: Optional[str] = field(
        default_factory=lambda: os.getenv('PERPLEXITY_API_KEY')
    )
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"

    # Rate Limiting
    requests_per_minute: int = 30
    request_timeout: int = 30
    max_retries: int = 3
    min_request_interval: float = 1.0  # seconds

    # Search Configuration
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5

    # Quality Thresholds
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000  # 10MB

    # Search Templates (None means "use the built-in templates")
    search_templates: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Install a private copy of the built-in templates when none were given."""
        if self.search_templates is None:
            self.search_templates = dict(_DEFAULT_SEARCH_TEMPLATES)


@dataclass
class ScrapingConfig:
    """Configuration for web scraping."""

    # Request settings
    timeout: int = 15
    max_retries: int = 3
    retry_delay: float = 1.0

    # Rate limiting
    requests_per_second: float = 0.5  # Conservative rate limiting
    burst_requests: int = 5

    # Content filtering
    min_content_length: int = 100
    max_content_length: int = 1_000_000  # 1MB per page

    # User agent rotation (None means "use the built-in list")
    user_agents: Optional[List[str]] = None

    # Blocked host patterns, consulted by Config.validate_url
    # (None means "use the built-in list")
    blocked_domains: Optional[List[str]] = None

    # Content extraction settings
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False

    def __post_init__(self):
        """Fill in default lists; each instance gets its own mutable copy."""
        if self.user_agents is None:
            self.user_agents = list(_DEFAULT_USER_AGENTS)
        if self.blocked_domains is None:
            self.blocked_domains = list(_DEFAULT_BLOCKED_DOMAINS)


@dataclass
class ModelConfig:
    """Configuration for AI models."""

    # Model selection
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # Fallback models (lighter/faster)
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"

    # Device configuration
    device: str = "auto"  # auto, cpu, cuda
    use_gpu: bool = True
    max_memory_mb: int = 4000

    # Processing settings
    max_sequence_length: int = 512
    batch_size: int = 8
    confidence_threshold: float = 0.7

    # Cache settings
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"


@dataclass
class ExportConfig:
    """Configuration for dataset export."""

    # File settings
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"

    # Format-specific settings
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # csv.QUOTE_ALL

    # HuggingFace dataset settings
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    # Read from the environment when the instance is created, not at import time.
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv('HF_TOKEN'))

    # Metadata inclusion
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True


@dataclass
class SecurityConfig:
    """Security and safety configuration."""

    # URL validation
    allow_local_urls: bool = False
    allow_private_ips: bool = False
    max_redirects: int = 5

    # Content filtering
    filter_adult_content: bool = True
    filter_spam: bool = True
    max_duplicate_content: float = 0.8  # Similarity threshold

    # Rate limiting enforcement
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True

    # Safety checks
    scan_for_malware: bool = False  # Requires additional dependencies
    validate_ssl: bool = True


@dataclass
class UIConfig:
    """User interface configuration."""

    # Theme settings
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False

    # Interface settings
    max_preview_items: int = 10
    preview_text_length: int = 200
    show_progress_bars: bool = True

    # Advanced features
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True


class Config:
    """Main configuration class combining all settings."""

    def __init__(self):
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()

        # Application settings
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'

        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'

    def is_perplexity_enabled(self) -> bool:
        """Return True when a non-empty Perplexity API key is configured."""
        return bool(self.perplexity.api_key)

    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Return the Perplexity search template for *template_type*, formatted.

        Args:
            template_type: Key into ``perplexity.search_templates``.
            **kwargs: Values for the template placeholders; the built-in
                templates use ``max_sources`` and ``project_description``.

        Returns:
            The formatted template, or ``""`` when the key is unknown or the
            stored template is empty.

        Raises:
            KeyError: If the template has a placeholder missing from *kwargs*.
        """
        template = self.perplexity.search_templates.get(template_type, "")
        if not template:
            return ""
        return template.format(**kwargs)

    def validate_url(self, url: str) -> bool:
        """Return True if *url* is an http(s) URL allowed by the settings.

        A URL is rejected when any of the following holds:

        * it cannot be parsed, its scheme is not ``http``/``https``, or it has
          no host;
        * its host matches a pattern in ``scraping.blocked_domains``;
        * ``security.allow_local_urls`` is false and the host is ``localhost``
          (or a subdomain of it) or a loopback/unspecified IP literal;
        * ``security.allow_private_ips`` is false and the host is a private,
          non-loopback IP literal.

        Only the URL text is inspected; host names are not resolved.
        """
        try:
            parsed = urlparse(url)
            if parsed.scheme not in ("http", "https"):
                return False
            # hostname excludes port and userinfo; a trailing dot is dropped so
            # "localhost." is treated like "localhost".
            host = (parsed.hostname or "").lower().rstrip(".")
        except (ValueError, TypeError, AttributeError):
            return False

        if not host:
            return False

        # The blocklist is defined on ScrapingConfig; SecurityConfig has no
        # blocked_domains attribute.
        if _host_is_blocked(host, self.scraping.blocked_domains or ()):
            return False

        ip = _parse_ip(host)
        is_local_ip = ip is not None and (ip.is_loopback or ip.is_unspecified)

        if not self.security.allow_local_urls:
            if host == "localhost" or host.endswith(".localhost") or is_local_ip:
                return False

        if not self.security.allow_private_ips:
            # ipaddress counts loopback/unspecified as private; those are
            # governed by allow_local_urls above.
            if ip is not None and ip.is_private and not is_local_ip:
                return False

        return True


# Global configuration instance
config = Config()

# Export commonly used configurations
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui