Spaces:

MagicMeWizard
/

AI_Powered_Web_Scraper

Running

App Files Files Community

MagicMeWizard committed 28 days ago

Commit

ccc5d44

verified ·

1 Parent(s): ed05d05

Update config.py

Browse files

Files changed (1) hide show

config.py +374 -141

config.py CHANGED Viewed

@@ -1,204 +1,437 @@
 """
-Configuration settings for AI Web Scraper
-Centralized configuration management for security, performance, and features
 """
 import os
-from typing import Dict, List, Optional
 from dataclasses import dataclass
 @dataclass
-class SecurityConfig:
-    """Security-related configuration"""
-    # URL validation settings
-    allowed_schemes: List[str] = None
-    blocked_domains: List[str] = None
-    max_url_length: int = 2048
-    # Rate limiting
     requests_per_minute: int = 30
-    requests_per_hour: int = 500
-    # Content safety
-    max_content_size: int = 10 * 1024 * 1024  # 10MB
-    max_processing_time: int = 60  # seconds
     def __post_init__(self):
-        if self.allowed_schemes is None:
-            self.allowed_schemes = ['http', 'https']
         if self.blocked_domains is None:
             self.blocked_domains = [
-                'localhost', '127.0.0.1', '0.0.0.0',
-                '192.168.', '10.', '172.16.', '172.17.',
-                '172.18.', '172.19.', '172.20.', '172.21.',
-                '172.22.', '172.23.', '172.24.', '172.25.',
-                '172.26.', '172.27.', '172.28.', '172.29.',
-                '172.30.', '172.31.'
             ]
 @dataclass
 class ModelConfig:
-    """AI model configuration"""
-    # Primary summarization model
-    primary_model: str = "facebook/bart-large-cnn"
-    # Fallback model for faster processing
-    fallback_model: str = "sshleifer/distilbart-cnn-12-6"
-    # Model parameters
-    max_input_length: int = 1024
-    max_summary_length: int = 500
-    min_summary_length: int = 30
-    # Performance settings
     device: str = "auto"  # auto, cpu, cuda
-    batch_size: int = 1
-    use_fast_tokenizer: bool = True
 @dataclass
-class ScrapingConfig:
-    """Web scraping configuration"""
-    # Request settings
-    timeout: int = 15
-    max_retries: int = 3
-    retry_delay: int = 1
-    # User agent string
-    user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)"
-    # Content extraction
-    min_content_length: int = 100
-    max_content_length: int = 100000
-    # Robots.txt settings
     respect_robots_txt: bool = True
-    robots_cache_duration: int = 3600  # seconds
 @dataclass
 class UIConfig:
     """User interface configuration"""
-    # Default values
-    default_summary_length: int = 300
-    max_summary_length: int = 500
-    min_summary_length: int = 100
-    # Interface settings
-    enable_batch_processing: bool = True
-    max_batch_size: int = 10
-    show_advanced_options: bool = False
-    # Export settings
-    supported_export_formats: List[str] = None
-    def __post_init__(self):
-        if self.supported_export_formats is None:
-            self.supported_export_formats = ["CSV", "JSON"]
 class Config:
-    """Main configuration class"""
     def __init__(self):
-        self.security = SecurityConfig()
-        self.models = ModelConfig()
         self.scraping = ScrapingConfig()
         self.ui = UIConfig()
-        # Load from environment variables if available
-        self._load_from_env()
-    def _load_from_env(self):
-        """Load configuration from environment variables"""
-        # Security settings
-        if os.getenv('MAX_REQUESTS_PER_MINUTE'):
-            self.security.requests_per_minute = int(os.getenv('MAX_REQUESTS_PER_MINUTE'))
-        if os.getenv('MAX_CONTENT_SIZE'):
-            self.security.max_content_size = int(os.getenv('MAX_CONTENT_SIZE'))
-        # Model settings
-        if os.getenv('PRIMARY_MODEL'):
-            self.models.primary_model = os.getenv('PRIMARY_MODEL')
-        if os.getenv('FALLBACK_MODEL'):
-            self.models.fallback_model = os.getenv('FALLBACK_MODEL')
-        if os.getenv('DEVICE'):
-            self.models.device = os.getenv('DEVICE')
-        # Scraping settings
-        if os.getenv('REQUEST_TIMEOUT'):
-            self.scraping.timeout = int(os.getenv('REQUEST_TIMEOUT'))
-        if os.getenv('USER_AGENT'):
-            self.scraping.user_agent = os.getenv('USER_AGENT')
-        if os.getenv('RESPECT_ROBOTS_TXT'):
-            self.scraping.respect_robots_txt = os.getenv('RESPECT_ROBOTS_TXT').lower() == 'true'
-    def get_model_device(self) -> str:
-        """Get the appropriate device for model inference"""
-        if self.models.device == "auto":
-            try:
-                import torch
-                return "cuda" if torch.cuda.is_available() else "cpu"
-            except ImportError:
-                return "cpu"
-        return self.models.device
-    def is_url_allowed(self, url: str) -> bool:
-        """Check if URL is allowed based on security settings"""
         from urllib.parse import urlparse
         try:
             parsed = urlparse(url)
             # Check scheme
-            if parsed.scheme not in self.security.allowed_schemes:
                 return False
-            # Check blocked domains
-            hostname = parsed.hostname or ''
             for blocked in self.security.blocked_domains:
-                if blocked in hostname:
                     return False
-            # Check URL length
-            if len(url) > self.security.max_url_length:
-                return False
             return True
         except Exception:
             return False
-    def get_request_headers(self) -> Dict[str, str]:
-        """Get standard request headers"""
-        return {
-            'User-Agent': self.scraping.user_agent,
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-        }
-# Global configuration instance
 config = Config()
-# Environment-specific overrides for Hugging Face Spaces
-if os.getenv('SPACE_ID'):
-    # Running on Hugging Face Spaces
-    config.models.device = "auto"
-    config.security.requests_per_minute = 20  # More conservative on shared infrastructure
-    config.scraping.timeout = 10  # Shorter timeout on shared infrastructure
-    # Enable GPU if available
-    if os.getenv('CUDA_VISIBLE_DEVICES'):
-        config.models.device = "cuda"
-# Development mode overrides
-if os.getenv('ENVIRONMENT') == 'development':
-    config.security.requests_per_minute = 100
-    config.scraping.timeout = 30
-    config.ui.show_advanced_options = True

 """
+⚙️ Configuration settings for AI Dataset Studio with Perplexity integration
 """
 import os
 from dataclasses import dataclass
+from typing import List, Dict, Optional
 @dataclass
+class PerplexityConfig:
+    """Configuration for Perplexity AI integration.
+
+    Field defaults are evaluated once, when this class body runs (module
+    import), so ``api_key`` captures ``PERPLEXITY_API_KEY`` as it was at
+    import time.  ``search_templates`` defaults to ``None`` and is filled with
+    the built-in prompt templates by ``__post_init__``.
+    """
+    # API Configuration
+    # None when PERPLEXITY_API_KEY is unset in the environment.
+    api_key: Optional[str] = os.getenv('PERPLEXITY_API_KEY')
+    base_url: str = "https://api.perplexity.ai"
+    model: str = "llama-3.1-sonar-large-128k-online"
+    # Rate Limiting
     requests_per_minute: int = 30
+    request_timeout: int = 30  # presumably seconds, like min_request_interval -- confirm
+    max_retries: int = 3
+    min_request_interval: float = 1.0  # seconds
+    # Search Configuration
+    default_max_sources: int = 20
+    max_sources_limit: int = 50
+    min_sources: int = 5
+    # Quality Thresholds
+    min_relevance_score: float = 3.0
+    min_content_length: int = 100
+    max_content_length: int = 10_000_000  # 10MB
+    # Search Templates
+    # None is a sentinel: dataclasses reject a mutable dict as a direct
+    # default, so the real value is assigned in __post_init__.
+    search_templates: Dict[str, str] = None
     def __post_init__(self):
+        """Install the built-in search templates when none were supplied.
+
+        Every template contains the ``{max_sources}`` and
+        ``{project_description}`` placeholders and no other braces, so both
+        values must be provided when a template is rendered with
+        ``str.format``.
+        """
+        if self.search_templates is None:
+            # Keys are the task types a template can be requested for.
+            self.search_templates = {
+                "sentiment_analysis": """
+Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:
+PROJECT: {project_description}
+REQUIREMENTS:
+- Sources with clear positive, negative, or neutral sentiment
+- Text suitable for sentiment classification training
+- Diverse content types (reviews, social media, news, forums)
+- Avoid heavily biased or extreme content
+- Include metadata when possible (ratings, timestamps, etc.)
+SEARCH FOCUS:
+- Product reviews and customer feedback
+- Social media posts and comments
+- News articles with opinion content
+- Blog posts with clear sentiment
+- Forum discussions and community posts
+OUTPUT FORMAT:
+For each source provide:
+1. **URL**: Direct link to content
+2. **Title**: Clear, descriptive title
+3. **Description**: Why this source is good for sentiment analysis
+4. **Content Type**: [review/social/news/blog/forum]
+5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
+6. **Quality Score**: 1-10 rating for ML training suitability
+""",
+                "text_classification": """
+Find {max_sources} diverse, well-categorized sources for text classification training:
+PROJECT: {project_description}
+REQUIREMENTS:
+- Sources with clear, distinct categories or topics
+- Consistent content structure within categories
+- Sufficient variety within each category
+- Professional or semi-professional content quality
+- Avoid overly niche or specialized content
+SEARCH FOCUS:
+- News articles with clear sections (politics, sports, technology, etc.)
+- Academic papers with subject classifications
+- E-commerce product descriptions with categories
+- Blog posts with clear topical focus
+- Government documents with departmental classifications
+OUTPUT FORMAT:
+For each source provide:
+1. **URL**: Direct link to content
+2. **Title**: Clear, descriptive title
+3. **Description**: Content type and classification scheme
+4. **Categories Available**: List of categories/classes present
+5. **Content Volume**: Estimated amount of data per category
+6. **Quality Score**: 1-10 rating for classification training
+""",
+                "named_entity_recognition": """
+Find {max_sources} text-rich sources with clear named entities for NER training:
+PROJECT: {project_description}
+REQUIREMENTS:
+- Rich in named entities (people, places, organizations, dates, etc.)
+- Clear, well-written text (not fragmented or poorly formatted)
+- Diverse entity types and contexts
+- Professional writing quality
+- Entities are clearly identifiable in context
+SEARCH FOCUS:
+- News articles and press releases
+- Biographical content and profiles
+- Business and financial reports
+- Historical documents and articles
+- Academic papers and research
+- Government publications
+OUTPUT FORMAT:
+For each source provide:
+1. **URL**: Direct link to content
+2. **Title**: Clear, descriptive title
+3. **Description**: Types of entities commonly found
+4. **Entity Density**: Expected frequency of named entities
+5. **Text Quality**: Assessment of writing clarity
+6. **Quality Score**: 1-10 rating for NER training
+""",
+                "question_answering": """
+Find {max_sources} sources with clear question-answer patterns for QA training:
+PROJECT: {project_description}
+REQUIREMENTS:
+- Explicit Q&A format OR clear factual content suitable for QA generation
+- Questions and answers are clearly delineated
+- Factual, verifiable information
+- Diverse question types (factual, definitional, procedural, etc.)
+- Professional quality content
+SEARCH FOCUS:
+- FAQ pages and help documentation
+- Interview transcripts and Q&A sessions
+- Educational content with questions
+- Technical documentation with examples
+- Customer support knowledge bases
+- Stack Overflow and similar Q&A platforms
+OUTPUT FORMAT:
+For each source provide:
+1. **URL**: Direct link to content
+2. **Title**: Clear, descriptive title
+3. **Description**: Q&A format type and subject matter
+4. **Question Types**: Types of questions typically found
+5. **Answer Quality**: Assessment of answer completeness
+6. **Quality Score**: 1-10 rating for QA training
+""",
+                "text_summarization": """
+Find {max_sources} sources with substantial, well-structured content for summarization training:
+PROJECT: {project_description}
+REQUIREMENTS:
+- Long-form content (articles, reports, papers)
+- Clear structure with main points
+- Professional writing quality
+- Self-contained content (doesn't rely heavily on external references)
+- Diverse content types and subjects
+SEARCH FOCUS:
+- News articles and investigative reports
+- Research papers and academic articles
+- Long-form blog posts and essays
+- Government reports and white papers
+- Industry analysis and market reports
+- Review articles and meta-analyses
+OUTPUT FORMAT:
+For each source provide:
+1. **URL**: Direct link to content
+2. **Title**: Clear, descriptive title
+3. **Description**: Content length and structure
+4. **Main Topics**: Key subjects covered
+5. **Summarization Potential**: How well-suited for summary generation
+6. **Quality Score**: 1-10 rating for summarization training
+""",
+                "translation": """
+Find {max_sources} parallel or multilingual content for translation training:
+PROJECT: {project_description}
+REQUIREMENTS:
+- Content available in multiple languages
+- High translation quality (professional or native-level)
+- Parallel content alignment when possible
+- Diverse domains and text types
+- Clear source and target language identification
+SEARCH FOCUS:
+- Multilingual news websites
+- International organization publications
+- Government documents in multiple languages
+- Educational content with translations
+- Software documentation with localization
+- Cultural and literary translations
+OUTPUT FORMAT:
+For each source provide:
+1. **URL**: Direct link to content
+2. **Title**: Clear, descriptive title
+3. **Description**: Languages available and content type
+4. **Language Pairs**: Specific language combinations
+5. **Translation Quality**: Assessment of translation accuracy
+6. **Quality Score**: 1-10 rating for translation training
+"""
+            }
+@dataclass
+class ScrapingConfig:
+    """Configuration for web scraping.
+
+    ``user_agents`` and ``blocked_domains`` default to ``None`` and are
+    replaced with built-in lists in ``__post_init__``; passing an explicit
+    list (even an empty one) keeps the caller's value.
+    """
+    # Request settings
+    timeout: int = 15
+    max_retries: int = 3
+    retry_delay: float = 1.0
+    # Rate limiting
+    requests_per_second: float = 0.5  # Conservative rate limiting
+    burst_requests: int = 5
+    # Content filtering
+    min_content_length: int = 100
+    max_content_length: int = 1_000_000  # 1MB per page
+    # User agent rotation
+    user_agents: List[str] = None
+    # Host patterns that must not be fetched: loopback, private-network
+    # prefixes and internal-environment names. This list is unrelated to
+    # robots.txt handling.
+    blocked_domains: List[str] = None
+    # Content extraction settings
+    extract_metadata: bool = True
+    clean_html: bool = True
+    preserve_structure: bool = False
+    def __post_init__(self):
+        """Fill in the default user-agent and blocked-host lists when unset."""
+        if self.user_agents is None:
+            self.user_agents = [
+                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            ]
         if self.blocked_domains is None:
+            # Entries ending in "." are fragments rather than full host names;
+            # how they are matched is decided by the code that consumes them.
+            # NOTE(review): '172.' covers all of 172.0.0.0/8, which is wider
+            # than the private 172.16.0.0/12 range -- confirm this is intended.
             self.blocked_domains = [
+                'localhost',
+                '127.0.0.1',
+                '0.0.0.0',
+                '10.',
+                '172.',
+                '192.168.',
+                'internal.',
+                'staging.',
+                'test.',
+                'dev.'
             ]
 @dataclass
 class ModelConfig:
+    """Configuration for AI models.
+
+    Holds model identifiers (one primary and one lighter fallback per task)
+    plus device, batching and cache settings. This class only stores values;
+    loading the models happens elsewhere.
+    """
+    # Model selection
+    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+    summarization_model: str = "facebook/bart-large-cnn"
+    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"
+    # Fallback models (lighter/faster)
+    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
+    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
+    ner_fallback: str = "distilbert-base-cased"
+    # Device configuration
     device: str = "auto"  # auto, cpu, cuda
+    # NOTE(review): how use_gpu interacts with device="auto" is not defined in
+    # this file -- confirm against the model-loading code.
+    use_gpu: bool = True
+    max_memory_mb: int = 4000
+    # Processing settings
+    max_sequence_length: int = 512
+    batch_size: int = 8
+    confidence_threshold: float = 0.7
+    # Cache settings
+    cache_models: bool = True
+    model_cache_dir: str = "./model_cache"
 @dataclass
+class ExportConfig:
+    """Configuration for dataset export.
+
+    ``hf_token`` is read from the ``HF_TOKEN`` environment variable once, when
+    this class body is executed at import time; it is ``None`` when the
+    variable is unset.
+    """
+    # File settings
+    max_file_size_mb: int = 100
+    compression: bool = True
+    encoding: str = "utf-8"
+    # Format-specific settings
+    json_indent: int = 2
+    csv_delimiter: str = ","
+    # Stored as the raw integer so this module need not import csv.
+    csv_quoting: int = 1  # csv.QUOTE_ALL
+    # HuggingFace dataset settings
+    # Contains a {timestamp} placeholder; the code that fills it in is not
+    # part of this file.
+    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
+    hf_private: bool = True
+    hf_token: Optional[str] = os.getenv('HF_TOKEN')
+    # Metadata inclusion
+    include_source_urls: bool = True
+    include_timestamps: bool = True
+    include_processing_info: bool = True
+    include_confidence_scores: bool = True
+@dataclass
+class SecurityConfig:
+    """Security and safety configuration.
+
+    A plain collection of switches and limits; every field has a default, so
+    ``SecurityConfig()`` yields the most restrictive URL settings (local and
+    private addresses disallowed).
+    """
+    # URL validation
+    allow_local_urls: bool = False
+    allow_private_ips: bool = False
+    max_redirects: int = 5
+    # Content filtering
+    filter_adult_content: bool = True
+    filter_spam: bool = True
+    max_duplicate_content: float = 0.8  # Similarity threshold
+    # Rate limiting enforcement
+    enforce_rate_limits: bool = True
     respect_robots_txt: bool = True
+    # Safety checks
+    scan_for_malware: bool = False  # Requires additional dependencies
+    validate_ssl: bool = True
 @dataclass
 class UIConfig:
     """User interface configuration"""
+    # All fields are simple display preferences with defaults; nothing here is
+    # read from the environment.
+    # Theme settings
+    theme: str = "soft"  # presumably a UI-framework theme name -- confirm against the app code
+    custom_css: bool = True
+    dark_mode: bool = False
+    # Interface settings
+    max_preview_items: int = 10
+    preview_text_length: int = 200
+    show_progress_bars: bool = True
+    # Advanced features
+    enable_debug_mode: bool = False
+    show_model_info: bool = True
+    enable_export_preview: bool = True
+# Top-level configuration container (the shared instance is created at the bottom of this module)
 class Config:
+    """Main configuration class combining all settings.
+
+    Each of ``perplexity``, ``scraping``, ``models``, ``export``, ``security``
+    and ``ui`` holds a default-constructed settings object.  ``debug`` and
+    ``log_level`` are read from the ``DEBUG`` and ``LOG_LEVEL`` environment
+    variables when the instance is created.
+    """
     def __init__(self):
+        self.perplexity = PerplexityConfig()
         self.scraping = ScrapingConfig()
+        self.models = ModelConfig()
+        self.export = ExportConfig()
+        self.security = SecurityConfig()
         self.ui = UIConfig()
+        # Application settings
+        self.app_name = "AI Dataset Studio"
+        self.version = "2.0.0"
+        # Only the literal string "true" (any case) enables debug mode.
+        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'
+        # Logging
+        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
+        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'
+    def is_perplexity_enabled(self) -> bool:
+        """Return True when a non-empty Perplexity API key is configured."""
+        return bool(self.perplexity.api_key)
+    def get_search_template(self, template_type: str, **kwargs) -> str:
+        """Return the search template for ``template_type`` with ``kwargs`` filled in.
+
+        Returns an empty string when the template type is unknown.  Raises
+        ``KeyError`` (from ``str.format``) when a placeholder used by the
+        template is missing from ``kwargs``.
+        """
+        template = self.perplexity.search_templates.get(template_type, "")
+        if template:
+            return template.format(**kwargs)
+        return ""
+    def validate_url(self, url: str) -> bool:
+        """Return True when ``url`` is an http(s) URL that may be fetched.
+
+        Checks, in order:
+
+        1. The URL parses and its scheme is ``http`` or ``https``.
+        2. It has a host name.
+        3. The host does not match ``self.scraping.blocked_domains``.  An
+           entry ending in ``.`` is a host-name prefix (``'10.'`` matches
+           ``10.0.0.1``); any other entry matches that exact host or any
+           sub-domain of it.
+        4. Loopback/unspecified addresses and ``localhost`` names are allowed
+           only when ``self.security.allow_local_urls`` is set.
+        5. Other private or link-local IP literals are allowed only when
+           ``self.security.allow_private_ips`` is set.
+
+        Host names are not resolved, so a public name that points at a
+        private address is not detected here.
+        """
+        import ipaddress
         from urllib.parse import urlparse
+        # Guard only the parse: a wider handler would also hide configuration
+        # mistakes (such as a missing attribute) by rejecting every URL.
         try:
             parsed = urlparse(url)
+        except (ValueError, TypeError, AttributeError):
+            return False
+        # Check scheme
+        if parsed.scheme not in ('http', 'https'):
+            return False
+        # Match against the host only (already lower-cased by urlparse), so
+        # credentials and port numbers cannot trigger or mask a match.
+        host = (parsed.hostname or '').rstrip('.')
+        if not host:
+            return False
+        # Check for blocked domains; the list is defined on ScrapingConfig.
+        for blocked in self.scraping.blocked_domains:
+            entry = blocked.lower()
+            if entry.endswith('.'):
+                # Appending '.' lets a bare host such as "internal" match "internal.".
+                if (host + '.').startswith(entry):
+                    return False
+            elif host == entry or host.endswith('.' + entry):
+                return False
+        # Classify IP literals by address range rather than by substring.
+        try:
+            address = ipaddress.ip_address(host)
+        except ValueError:
+            address = None  # an ordinary host name
+        is_local = (
+            host == 'localhost'
+            or host.endswith('.localhost')
+            or (address is not None and (address.is_loopback or address.is_unspecified))
+        )
+        # Local addresses are decided first because ipaddress also reports
+        # loopback addresses as private.
+        if is_local:
+            return self.security.allow_local_urls
+        if address is not None and (address.is_private or address.is_link_local):
+            return self.security.allow_private_ips
+        return True
+# Create global config instance
+# Built once at import time; Config() reads DEBUG and LOG_LEVEL from the
+# environment at this point.
 config = Config()
+# Export commonly used configurations
+# Each alias is bound to the same object as the matching attribute of
+# `config`, so a change made through either name is visible through the other.
+PERPLEXITY_CONFIG = config.perplexity
+SCRAPING_CONFIG = config.scraping
+MODEL_CONFIG = config.models
+EXPORT_CONFIG = config.export
+SECURITY_CONFIG = config.security
+UI_CONFIG = config.ui