File size: 14,105 Bytes
f9f65ef
ccc5d44
f9f65ef
 
 
 
ccc5d44
f9f65ef
 
ccc5d44
 
f9f65ef
ccc5d44
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
f9f65ef
 
ccc5d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9f65ef
 
 
ccc5d44
 
 
 
 
 
 
 
 
 
f9f65ef
 
 
 
ccc5d44
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
 
 
f9f65ef
 
ccc5d44
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
 
 
 
 
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
f9f65ef
ccc5d44
f9f65ef
ccc5d44
f9f65ef
 
ccc5d44
f9f65ef
ccc5d44
 
 
f9f65ef
 
ccc5d44
 
 
 
f9f65ef
ccc5d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9f65ef
 
 
 
 
 
ccc5d44
f9f65ef
 
ccc5d44
 
f9f65ef
ccc5d44
f9f65ef
 
ccc5d44
 
 
 
 
 
 
 
f9f65ef
 
 
 
 
 
ccc5d44
f9f65ef
 
ccc5d44
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
"""
⚙️ Configuration settings for AI Dataset Studio with Perplexity integration
"""

import os
from dataclasses import dataclass, field
from typing import List, Dict, Optional

@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration.

    Note: ``api_key`` is resolved from the PERPLEXITY_API_KEY environment
    variable at *instantiation* time (via a default factory), so the key can
    be set or rotated after this module is imported.
    """
    
    # API Configuration
    # default_factory defers the env lookup to instance creation; a plain
    # default would freeze the value at class-definition (import) time.
    api_key: Optional[str] = field(default_factory=lambda: os.getenv('PERPLEXITY_API_KEY'))
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"
    
    # Rate Limiting
    requests_per_minute: int = 30
    request_timeout: int = 30  # seconds
    max_retries: int = 3
    min_request_interval: float = 1.0  # seconds
    
    # Search Configuration
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5
    
    # Quality Thresholds
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000  # 10MB
    
    # Search Templates — Optional because None is the sentinel meaning
    # "populate the defaults in __post_init__".
    search_templates: Optional[Dict[str, str]] = None
    
    def __post_init__(self):
        """Initialize search templates after creation"""
        if self.search_templates is None:
            self.search_templates = {
                "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",
                
                "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",
                
                "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",
                
                "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",
                
                "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",
                
                "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
"""
            }

@dataclass
class ScrapingConfig:
    """Configuration for web scraping.

    Request/retry behavior, rate limiting, content-size filters, and the
    user-agent / blocked-domain lists (defaulted in ``__post_init__``).
    """
    
    # Request settings
    timeout: int = 15         # per-request timeout, seconds
    max_retries: int = 3
    retry_delay: float = 1.0  # seconds between retries
    
    # Rate limiting
    requests_per_second: float = 0.5  # Conservative rate limiting
    burst_requests: int = 5
    
    # Content filtering
    min_content_length: int = 100
    max_content_length: int = 1_000_000  # 1MB per page
    
    # User agent rotation — Optional because None means "use the defaults
    # installed by __post_init__" (the annotation previously claimed a
    # plain List[str] despite the None default).
    user_agents: Optional[List[str]] = None
    
    # Blocked domains (respect robots.txt); same Optional/None convention.
    blocked_domains: Optional[List[str]] = None
    
    # Content extraction settings
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False
    
    def __post_init__(self):
        """Initialize default values"""
        if self.user_agents is None:
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            ]
        
        if self.blocked_domains is None:
            # Local hosts, private-network prefixes, and non-production
            # subdomain prefixes; matched by substring elsewhere.
            self.blocked_domains = [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.'
            ]

@dataclass
class ModelConfig:
    """Configuration for AI models.

    HuggingFace model identifiers for each NLP task, lighter fallback
    models, and device / batching / cache settings.
    """
    
    # Model selection (primary HuggingFace hub model IDs, one per task)
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"
    
    # Fallback models (lighter/faster)
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"
    
    # Device configuration
    device: str = "auto"  # auto, cpu, cuda
    use_gpu: bool = True  # presumably a request, honored only when hardware allows — TODO confirm against consumers
    max_memory_mb: int = 4000
    
    # Processing settings
    max_sequence_length: int = 512  # tokens per input sequence
    batch_size: int = 8
    confidence_threshold: float = 0.7  # NOTE(review): looks like a minimum prediction score — verify against callers
    
    # Cache settings
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"  # relative to the working directory

@dataclass
class ExportConfig:
    """Configuration for dataset export.

    File-size limits, per-format options, HuggingFace Hub publishing
    settings, and which metadata columns are included in exports.

    Note: ``hf_token`` is resolved from the HF_TOKEN environment variable
    at *instantiation* time (via a default factory), so the token can be
    set after this module is imported.
    """
    
    # File settings
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"
    
    # Format-specific settings
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # csv.QUOTE_ALL
    
    # HuggingFace dataset settings
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    # default_factory defers the env lookup to instance creation; a plain
    # default would freeze the value at class-definition (import) time.
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv('HF_TOKEN'))
    
    # Metadata inclusion
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True

@dataclass
class SecurityConfig:
    """Security and safety configuration.

    Flags consumed by URL validation, content filtering, and scraping
    politeness checks.
    """
    
    # URL validation
    allow_local_urls: bool = False   # permit localhost/127.0.0.1/0.0.0.0 targets
    allow_private_ips: bool = False  # permit private-network prefixes (10., 172., 192.168.)
    max_redirects: int = 5
    
    # Content filtering
    filter_adult_content: bool = True
    filter_spam: bool = True
    max_duplicate_content: float = 0.8  # Similarity threshold
    
    # Rate limiting enforcement
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True
    
    # Safety checks
    scan_for_malware: bool = False  # Requires additional dependencies
    validate_ssl: bool = True  # verify TLS certificates on outbound requests

@dataclass
class UIConfig:
    """User interface configuration.

    Theme, preview, and feature-toggle settings for the app UI.
    """
    
    # Theme settings
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False
    
    # Interface settings
    max_preview_items: int = 10     # rows shown in dataset previews
    preview_text_length: int = 200  # characters of text shown per preview item
    show_progress_bars: bool = True
    
    # Advanced features
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True

# Global configuration instance
class Config:
    """Main configuration class combining all settings.

    Aggregates one instance of each sub-configuration and exposes a few
    convenience helpers on top of them.
    """
    
    def __init__(self):
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()
        
        # Application settings
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'
        
        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'
    
    def is_perplexity_enabled(self) -> bool:
        """Check if Perplexity AI is properly configured (API key present)."""
        return bool(self.perplexity.api_key)
    
    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Get formatted search template for Perplexity.

        Returns "" when ``template_type`` is unknown; otherwise the template
        with ``kwargs`` substituted via ``str.format``.
        """
        template = self.perplexity.search_templates.get(template_type, "")
        if template:
            return template.format(**kwargs)
        return ""
    
    def validate_url(self, url: str) -> bool:
        """Validate a URL against the security and scraping settings.

        Rejects non-http(s) schemes, hosts matching the scraping
        blocked-domain list, and local/private hosts unless explicitly
        allowed. Malformed URLs return False rather than raising.
        """
        from urllib.parse import urlparse
        
        try:
            parsed = urlparse(url)
            
            # Only plain web schemes are allowed
            if parsed.scheme not in ['http', 'https']:
                return False
            
            # Check for blocked domains.
            # BUG FIX: blocked_domains is a ScrapingConfig field, not a
            # SecurityConfig one; the old `self.security.blocked_domains`
            # raised AttributeError, which the broad except below turned
            # into "every URL is invalid".
            netloc = parsed.netloc.lower()
            for blocked in self.scraping.blocked_domains:
                if blocked in netloc:
                    return False
            
            # Check for local/private IPs if not allowed
            if not self.security.allow_local_urls:
                if any(local in netloc for local in ['localhost', '127.0.0.1', '0.0.0.0']):
                    return False
            
            if not self.security.allow_private_ips:
                # NOTE: substring matching is coarse — '172.' can also match
                # public hostnames containing that fragment.
                if any(private in netloc for private in ['10.', '172.', '192.168.']):
                    return False
            
            return True
            
        except Exception:
            # Treat anything unparseable as invalid rather than propagating.
            return False

# Create global config instance — a module-level singleton constructed
# once at import time.
config = Config()

# Export commonly used configurations. These aliases are the *same
# objects* as the attributes on `config`, not copies, so mutations are
# visible through either name.
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui