# Source provenance: MagicMeWizard — "Update config.py" (commit ccc5d44, verified)
"""
⚙️ Configuration settings for AI Dataset Studio with Perplexity integration
"""
import ipaddress
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional
@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration.

    Holds API credentials, rate limits, search-quality thresholds, and the
    prompt templates used to build Perplexity search requests.  When the
    caller does not supply ``search_templates``, ``__post_init__`` installs
    the built-in defaults.
    """
    # --- API configuration ---
    # default_factory reads the environment at instance-creation time, so the
    # key is picked up even if PERPLEXITY_API_KEY is exported after import
    # (the original class-level os.getenv() froze the value at import time).
    api_key: Optional[str] = field(default_factory=lambda: os.getenv('PERPLEXITY_API_KEY'))
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"

    # --- Rate limiting ---
    requests_per_minute: int = 30
    request_timeout: int = 30          # seconds per HTTP request
    max_retries: int = 3
    min_request_interval: float = 1.0  # seconds between consecutive requests

    # --- Search configuration ---
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5

    # --- Quality thresholds ---
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000  # 10MB

    # Prompt templates keyed by task type; populated in __post_init__ when not
    # supplied.  (Annotation fixed: the default is None, so the declared type
    # must be Optional — the original `Dict[str, str] = None` was untrue.)
    search_templates: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Install the default search templates when none were provided."""
        if self.search_templates is None:
            self.search_templates = {
                "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:
PROJECT: {project_description}
REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)
SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts
OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",
                "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:
PROJECT: {project_description}
REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content
SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications
OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",
                "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:
PROJECT: {project_description}
REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context
SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications
OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",
                "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:
PROJECT: {project_description}
REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content
SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms
OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",
                "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:
PROJECT: {project_description}
REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects
SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses
OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",
                "translation": """
Find {max_sources} parallel or multilingual content for translation training:
PROJECT: {project_description}
REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification
SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations
OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
""",
            }
@dataclass
class ScrapingConfig:
    """Configuration for web scraping.

    Request/rate-limit settings, content-size filters, and the default
    user-agent and blocked-domain lists (filled in by ``__post_init__``).
    """
    # Request settings
    timeout: int = 15        # seconds per request
    max_retries: int = 3
    retry_delay: float = 1.0  # seconds between retries
    # Rate limiting
    requests_per_second: float = 0.5  # conservative rate limiting
    burst_requests: int = 5
    # Content filtering
    min_content_length: int = 100
    max_content_length: int = 1_000_000  # 1MB per page
    # User-agent rotation pool; defaulted in __post_init__.
    # (Annotation fixed: the default is None, so the type must be Optional —
    # the original `List[str] = None` was untrue.)
    user_agents: Optional[List[str]] = None
    # Blocked domains/prefixes (respect robots.txt); defaulted in __post_init__
    blocked_domains: Optional[List[str]] = None
    # Content extraction settings
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False

    def __post_init__(self):
        """Fill in default user agents and blocked domains when not supplied."""
        if self.user_agents is None:
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            ]
        if self.blocked_domains is None:
            # NOTE: consumers match these as substrings, so entries like '10.'
            # act as prefixes covering private-IP ranges.
            self.blocked_domains = [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.',
            ]
@dataclass
class ModelConfig:
    """Configuration for AI models.

    Hugging Face checkpoint identifiers for each task, lighter fallback
    checkpoints, and device/processing/cache settings.
    """
    # Model selection (primary checkpoints, one per task)
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"
    # Fallback models (lighter/faster alternatives to the above)
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"
    # Device configuration
    device: str = "auto"  # auto, cpu, cuda
    use_gpu: bool = True
    max_memory_mb: int = 4000
    # Processing settings
    max_sequence_length: int = 512  # tokens per input
    batch_size: int = 8
    confidence_threshold: float = 0.7
    # Cache settings
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"
@dataclass
class ExportConfig:
    """Configuration for dataset export (file limits, formats, HF Hub)."""
    # File settings
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"
    # Format-specific settings
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # csv.QUOTE_ALL
    # HuggingFace dataset settings
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    # default_factory reads the environment at instance-creation time, so a
    # token exported after import is still honored (the original class-level
    # os.getenv() froze the value at import time).
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv('HF_TOKEN'))
    # Metadata inclusion
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True
@dataclass
class SecurityConfig:
    """Security and safety configuration for URL validation and content checks."""
    # URL validation
    allow_local_urls: bool = False   # permit localhost / 0.0.0.0 targets
    allow_private_ips: bool = False  # permit private-range addresses
    max_redirects: int = 5
    # Content filtering
    filter_adult_content: bool = True
    filter_spam: bool = True
    max_duplicate_content: float = 0.8  # Similarity threshold
    # Rate limiting enforcement
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True
    # Safety checks
    scan_for_malware: bool = False  # Requires additional dependencies
    validate_ssl: bool = True
    # NOTE(review): the blocked-domain list is defined on ScrapingConfig, not
    # here, yet Config.validate_url reads `self.security.blocked_domains` —
    # confirm which class is meant to own that list.
@dataclass
class UIConfig:
    """User interface configuration (theme, preview limits, feature toggles)."""
    # Theme settings
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False
    # Interface settings
    max_preview_items: int = 10
    preview_text_length: int = 200  # characters shown per preview item
    show_progress_bars: bool = True
    # Advanced features
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True
# Global configuration instance
class Config:
    """Main configuration class combining all settings."""

    def __init__(self):
        # Sub-configurations
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()
        # Application settings
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'
        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'

    def is_perplexity_enabled(self) -> bool:
        """Check if Perplexity AI is properly configured (API key present)."""
        return bool(self.perplexity.api_key)

    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Return the formatted search template, or "" for an unknown type.

        Raises KeyError if the template references a placeholder not supplied
        in **kwargs.
        """
        template = self.perplexity.search_templates.get(template_type, "")
        if template:
            return template.format(**kwargs)
        return ""

    def validate_url(self, url: str) -> bool:
        """Validate a URL against the security settings.

        Returns True only for http(s) URLs that hit no blocked-domain
        substring and, per the security flags, are not local or private
        addresses.

        Bug fix: the blocked-domain list lives on ScrapingConfig, not
        SecurityConfig; the original `self.security.blocked_domains` raised
        AttributeError, which the broad `except` swallowed, so *every* URL
        was rejected.
        """
        from urllib.parse import urlparse
        try:
            parsed = urlparse(url)
            # Only plain web schemes are allowed
            if parsed.scheme not in ('http', 'https'):
                return False
            netloc = (parsed.netloc or '').lower()
            # hostname strips port/userinfo, so exact-match checks work
            hostname = (parsed.hostname or '').lower()
            # Blocked-domain substring check (list owned by ScrapingConfig)
            for blocked in self.scraping.blocked_domains:
                if blocked in netloc:
                    return False
            # Local hosts (exact hostname match, so ports don't evade it)
            if not self.security.allow_local_urls:
                if hostname in ('localhost', '127.0.0.1', '0.0.0.0'):
                    return False
            # Private/loopback IP literals, detected properly via ipaddress
            # instead of substring matching (which also hit e.g. '110.x').
            if not self.security.allow_private_ips:
                try:
                    ip = ipaddress.ip_address(hostname)
                    if ip.is_private or ip.is_loopback:
                        return False
                except ValueError:
                    pass  # hostname is not an IP literal — fine
            return True
        except (ValueError, AttributeError):
            # Malformed URL or missing config attribute: reject safely
            return False
# Create global config instance (module-level singleton; constructed once at
# import time)
config = Config()

# Export commonly used configurations as module-level aliases.  These are
# bound once at import and share state with `config` — mutating an alias
# mutates the singleton's sub-config too.
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui