|
""" |
|
⚙️ Configuration settings for AI Dataset Studio with Perplexity integration |
|
""" |
|
|
|
import os |
|
from dataclasses import dataclass |
|
from typing import List, Dict, Optional |
|
|
|
@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration"""

    # API credentials and endpoint. The key is read from the environment at
    # class-definition (import) time; a missing key leaves this None, which
    # callers can use to disable Perplexity features.
    api_key: Optional[str] = os.getenv('PERPLEXITY_API_KEY')
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"

    # Rate limiting and retry behaviour for API calls.
    requests_per_minute: int = 30
    request_timeout: int = 30          # seconds
    max_retries: int = 3
    min_request_interval: float = 1.0  # seconds between consecutive requests

    # Bounds on how many sources a discovery request may ask for.
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5

    # Quality filters applied to discovered sources (lengths in characters).
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000

    # Prompt templates keyed by task type. Defaulted to None and populated in
    # __post_init__ so every instance gets its own dict (a dict literal here
    # would be an illegal/shared mutable dataclass default).
    search_templates: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Initialize search templates after creation"""
        # Each template expects two placeholders filled via str.format:
        # {max_sources} and {project_description}.
        if self.search_templates is None:
            self.search_templates = {
                "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",

                "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",

                "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",

                "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",

                "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",

                "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
"""
            }
|
|
|
@dataclass
class ScrapingConfig:
    """Configuration for web scraping"""

    # HTTP request behaviour.
    timeout: int = 15         # seconds per request
    max_retries: int = 3
    retry_delay: float = 1.0  # seconds between retries

    # Politeness / rate limiting toward target sites.
    requests_per_second: float = 0.5
    burst_requests: int = 5

    # Content size sanity bounds (characters).
    min_content_length: int = 100
    max_content_length: int = 1_000_000

    # Rotating pool of browser User-Agent strings. Defaulted to None and
    # populated in __post_init__ because a list literal here would be an
    # illegal/shared mutable dataclass default.
    user_agents: Optional[List[str]] = None

    # Host fragments that must never be scraped (matched as substrings of
    # the URL's netloc elsewhere); populated in __post_init__.
    blocked_domains: Optional[List[str]] = None

    # Extraction options.
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False

    def __post_init__(self):
        """Initialize default values"""
        if self.user_agents is None:
            # Common desktop Chrome user agents (Windows / macOS / Linux).
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            ]

        if self.blocked_domains is None:
            # Loopback hosts, RFC 1918 private address prefixes, and common
            # internal-environment subdomain prefixes.
            self.blocked_domains = [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.'
            ]
|
|
|
@dataclass
class ModelConfig:
    """Configuration for AI models"""

    # Primary Hugging Face model checkpoints, one per task.
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # Smaller fallback checkpoints used if the primary models cannot load —
    # presumably on low-memory environments; confirm against the model loader.
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"

    # Hardware placement. "auto" presumably lets the loading code pick
    # CPU vs GPU — TODO confirm against the model-loading code.
    device: str = "auto"
    use_gpu: bool = True
    max_memory_mb: int = 4000

    # Inference parameters.
    max_sequence_length: int = 512   # tokens
    batch_size: int = 8
    confidence_threshold: float = 0.7

    # Local caching of downloaded model weights.
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"
|
|
|
@dataclass
class ExportConfig:
    """Configuration for dataset export"""

    # General output limits.
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"

    # Format-specific options.
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # 1 == csv.QUOTE_ALL

    # Hugging Face Hub export. The token is read from the environment at
    # class-definition (import) time; None when HF_TOKEN is unset.
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    hf_token: Optional[str] = os.getenv('HF_TOKEN')

    # Optional metadata columns to include in exported datasets.
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True
|
|
|
@dataclass
class SecurityConfig:
    """Security and safety configuration"""

    # URL validation policy (consumed by Config.validate_url).
    allow_local_urls: bool = False
    allow_private_ips: bool = False
    max_redirects: int = 5

    # Content filtering flags.
    filter_adult_content: bool = True
    filter_spam: bool = True
    # Presumably a similarity ratio above which content counts as a
    # duplicate — TODO confirm against the deduplication code.
    max_duplicate_content: float = 0.8

    # Crawl etiquette.
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True

    # Transport / payload safety.
    scan_for_malware: bool = False
    validate_ssl: bool = True
|
|
|
@dataclass
class UIConfig:
    """User interface configuration"""

    # Appearance.
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False

    # Dataset preview behaviour.
    max_preview_items: int = 10
    preview_text_length: int = 200  # characters shown per preview item
    show_progress_bars: bool = True

    # Feature toggles.
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True
|
|
|
|
|
class Config:
    """Main configuration class combining all settings"""

    def __init__(self):
        # Sub-configurations, grouped by concern.
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()

        # Application metadata.
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'

        # Logging configuration.
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'

    def is_perplexity_enabled(self) -> bool:
        """Check if Perplexity AI is properly configured (API key present)."""
        return bool(self.perplexity.api_key)

    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Get formatted search template for Perplexity.

        Returns an empty string for unknown template types. Raises KeyError
        if the template references a placeholder missing from kwargs.
        """
        template = self.perplexity.search_templates.get(template_type, "")
        if template:
            return template.format(**kwargs)
        return ""

    def validate_url(self, url: str) -> bool:
        """Validate URL against security settings.

        Returns False for non-HTTP(S) schemes, hosts matching the scraping
        blocklist, local/private hosts (per SecurityConfig flags), or any
        URL that cannot be parsed.
        """
        from urllib.parse import urlparse

        try:
            parsed = urlparse(url)

            # Only plain web URLs are allowed.
            if parsed.scheme not in ('http', 'https'):
                return False

            netloc = parsed.netloc.lower()

            # BUG FIX: the blocklist lives on ScrapingConfig, not
            # SecurityConfig. The old `self.security.blocked_domains`
            # raised AttributeError, which the broad `except` below turned
            # into "every URL is invalid".
            for blocked in self.scraping.blocked_domains:
                if blocked in netloc:
                    return False

            # NOTE(review): substring matching is intentionally broad and can
            # reject hosts that merely contain these fragments (e.g. '10.').
            if not self.security.allow_local_urls:
                if any(local in netloc for local in ('localhost', '127.0.0.1', '0.0.0.0')):
                    return False

            if not self.security.allow_private_ips:
                if any(private in netloc for private in ('10.', '172.', '192.168.')):
                    return False

            return True

        except Exception:
            # Malformed URLs are treated as invalid rather than raised.
            return False
|
|
|
|
|
# Module-level singleton instantiated at import time and shared by the
# whole application.
config = Config()

# Convenience aliases for direct access to each sub-configuration.
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui