File size: 14,105 Bytes
f9f65ef
ccc5d44
f9f65ef
 
 
 
ccc5d44
f9f65ef
 
ccc5d44
 
f9f65ef
ccc5d44
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
f9f65ef
 
ccc5d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9f65ef
 
 
ccc5d44
 
 
 
 
 
 
 
 
 
f9f65ef
 
 
 
ccc5d44
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
 
 
f9f65ef
 
ccc5d44
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
 
 
 
 
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
f9f65ef
ccc5d44
f9f65ef
 
ccc5d44
f9f65ef
ccc5d44
 
 
f9f65ef
 
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9f65ef
 
 
 
 
 
ccc5d44
f9f65ef
 
ccc5d44
 
f9f65ef
ccc5d44
f9f65ef
 
ccc5d44
 
 
 
 
 
 
 
f9f65ef
 
 
 
 
 
ccc5d44
f9f65ef
 
ccc5d44
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
"""
⚙️ Configuration settings for AI Dataset Studio with Perplexity integration
"""

import os
from dataclasses import dataclass, field
from typing import List, Dict, Optional

@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration.

    Note: ``api_key`` is resolved from the PERPLEXITY_API_KEY environment
    variable at *instantiation* time (via a default factory), so the key can
    be set or rotated after this module is imported.
    """
    
    # API Configuration
    # default_factory defers the env lookup to instance creation; a plain
    # default would freeze the value at class-definition (import) time.
    api_key: Optional[str] = field(default_factory=lambda: os.getenv('PERPLEXITY_API_KEY'))
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"
    
    # Rate Limiting
    requests_per_minute: int = 30
    request_timeout: int = 30  # seconds
    max_retries: int = 3
    min_request_interval: float = 1.0  # seconds
    
    # Search Configuration
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5
    
    # Quality Thresholds
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000  # 10MB
    
    # Search Templates — Optional because None is the sentinel meaning
    # "populate the defaults in __post_init__".
    search_templates: Optional[Dict[str, str]] = None
    
    def __post_init__(self):
        """Initialize search templates after creation"""
        if self.search_templates is None:
            self.search_templates = {
                "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",
                
                "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",
                
                "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",
                
                "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",
                
                "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",
                
                "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
"""
            }

@dataclass
class ScrapingConfig:
    """Configuration for web scraping.

    Request/retry behavior, rate limiting, content-size filters, and the
    user-agent / blocked-domain lists (defaulted in ``__post_init__``).
    """
    
    # Request settings
    timeout: int = 15         # per-request timeout, seconds
    max_retries: int = 3
    retry_delay: float = 1.0  # seconds between retries
    
    # Rate limiting
    requests_per_second: float = 0.5  # Conservative rate limiting
    burst_requests: int = 5
    
    # Content filtering
    min_content_length: int = 100
    max_content_length: int = 1_000_000  # 1MB per page
    
    # User agent rotation — Optional because None means "use the defaults
    # installed by __post_init__" (the annotation previously claimed a
    # plain List[str] despite the None default).
    user_agents: Optional[List[str]] = None
    
    # Blocked domains (respect robots.txt); same Optional/None convention.
    blocked_domains: Optional[List[str]] = None
    
    # Content extraction settings
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False
    
    def __post_init__(self):
        """Initialize default values"""
        if self.user_agents is None:
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            ]
        
        if self.blocked_domains is None:
            # Local hosts, private-network prefixes, and non-production
            # subdomain prefixes; matched by substring elsewhere.
            self.blocked_domains = [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.'
            ]

@dataclass
class ModelConfig:
    """Configuration for AI models.

    HuggingFace model identifiers for each NLP task, lighter fallback
    models, and device / batching / cache settings.
    """
    
    # Model selection (primary HuggingFace hub model IDs, one per task)
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"
    
    # Fallback models (lighter/faster)
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"
    
    # Device configuration
    device: str = "auto"  # auto, cpu, cuda
    use_gpu: bool = True  # presumably a request, honored only when hardware allows — TODO confirm against consumers
    max_memory_mb: int = 4000
    
    # Processing settings
    max_sequence_length: int = 512  # tokens per input sequence
    batch_size: int = 8
    confidence_threshold: float = 0.7  # NOTE(review): looks like a minimum prediction score — verify against callers
    
    # Cache settings
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"  # relative to the working directory

@dataclass
class ExportConfig:
    """Configuration for dataset export.

    File-size limits, per-format options, HuggingFace Hub publishing
    settings, and which metadata columns are included in exports.

    Note: ``hf_token`` is resolved from the HF_TOKEN environment variable
    at *instantiation* time (via a default factory), so the token can be
    set after this module is imported.
    """
    
    # File settings
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"
    
    # Format-specific settings
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # csv.QUOTE_ALL
    
    # HuggingFace dataset settings
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    # default_factory defers the env lookup to instance creation; a plain
    # default would freeze the value at class-definition (import) time.
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv('HF_TOKEN'))
    
    # Metadata inclusion
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True

@dataclass
class SecurityConfig:
    """Security and safety configuration.

    Flags consumed by URL validation, content filtering, and scraping
    politeness checks.
    """
    
    # URL validation
    allow_local_urls: bool = False   # permit localhost/127.0.0.1/0.0.0.0 targets
    allow_private_ips: bool = False  # permit private-network prefixes (10., 172., 192.168.)
    max_redirects: int = 5
    
    # Content filtering
    filter_adult_content: bool = True
    filter_spam: bool = True
    max_duplicate_content: float = 0.8  # Similarity threshold
    
    # Rate limiting enforcement
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True
    
    # Safety checks
    scan_for_malware: bool = False  # Requires additional dependencies
    validate_ssl: bool = True  # verify TLS certificates on outbound requests

@dataclass
class UIConfig:
    """User interface configuration.

    Theme, preview, and feature-toggle settings for the app UI.
    """
    
    # Theme settings
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False
    
    # Interface settings
    max_preview_items: int = 10     # rows shown in dataset previews
    preview_text_length: int = 200  # characters of text shown per preview item
    show_progress_bars: bool = True
    
    # Advanced features
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True

# Global configuration instance
class Config:
    """Main configuration class combining all settings.

    Aggregates one instance of each sub-configuration and exposes a few
    convenience helpers on top of them.
    """
    
    def __init__(self):
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()
        
        # Application settings
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'
        
        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'
    
    def is_perplexity_enabled(self) -> bool:
        """Check if Perplexity AI is properly configured (API key present)."""
        return bool(self.perplexity.api_key)
    
    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Get formatted search template for Perplexity.

        Returns "" when ``template_type`` is unknown; otherwise the template
        with ``kwargs`` substituted via ``str.format``.
        """
        template = self.perplexity.search_templates.get(template_type, "")
        if template:
            return template.format(**kwargs)
        return ""
    
    def validate_url(self, url: str) -> bool:
        """Validate a URL against the security and scraping settings.

        Rejects non-http(s) schemes, hosts matching the scraping
        blocked-domain list, and local/private hosts unless explicitly
        allowed. Malformed URLs return False rather than raising.
        """
        from urllib.parse import urlparse
        
        try:
            parsed = urlparse(url)
            
            # Only plain web schemes are allowed
            if parsed.scheme not in ['http', 'https']:
                return False
            
            # Check for blocked domains.
            # BUG FIX: blocked_domains is a ScrapingConfig field, not a
            # SecurityConfig one; the old `self.security.blocked_domains`
            # raised AttributeError, which the broad except below turned
            # into "every URL is invalid".
            netloc = parsed.netloc.lower()
            for blocked in self.scraping.blocked_domains:
                if blocked in netloc:
                    return False
            
            # Check for local/private IPs if not allowed
            if not self.security.allow_local_urls:
                if any(local in netloc for local in ['localhost', '127.0.0.1', '0.0.0.0']):
                    return False
            
            if not self.security.allow_private_ips:
                # NOTE: substring matching is coarse — '172.' can also match
                # public hostnames containing that fragment.
                if any(private in netloc for private in ['10.', '172.', '192.168.']):
                    return False
            
            return True
            
        except Exception:
            # Treat anything unparseable as invalid rather than propagating.
            return False

# Create global config instance — a module-level singleton constructed
# once at import time.
config = Config()

# Export commonly used configurations. These aliases are the *same
# objects* as the attributes on `config`, not copies, so mutations are
# visible through either name.
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui