MagicMeWizard committed
Commit 4fc0c1e · verified · 1 Parent(s): 135516a

Create utils.py

Files changed (1)
  1. utils.py +462 -0
utils.py ADDED
@@ -0,0 +1,462 @@
"""
Utility functions for AI Dataset Studio
Common helpers for text processing, validation, and data manipulation
"""

import re
import hashlib
import json
import csv
import io
from typing import List, Dict, Any, Optional, Tuple, Union
from urllib.parse import urlparse, urljoin
from datetime import datetime
import logging

logger = logging.getLogger(__name__)

def clean_text(text: str, aggressive: bool = False) -> str:
    """
    Clean text content with various strategies

    Args:
        text: Input text to clean
        aggressive: Whether to apply aggressive cleaning

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Basic cleaning
    text = text.strip()

    # Collapse excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove URLs and email addresses if aggressive
    if aggressive:
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)  # Email addresses

    # Fix common encoding issues (mojibake from mis-decoded UTF-8)
    text = text.replace('â€™', "'")
    text = text.replace('â€œ', '"')
    text = text.replace('â€"', '—')  # em dash; handle before the shorter 'â€' sequence
    text = text.replace('â€', '"')

    # Collapse excessive punctuation
    text = re.sub(r'[!?]{3,}', '!!!', text)
    text = re.sub(r'\.{4,}', '...', text)

    # Normalize curly quotes and apostrophes
    text = re.sub(r'[\u201c\u201d\u201e]', '"', text)
    text = re.sub(r'[\u2018\u2019\u201a]', "'", text)

    return text.strip()

def extract_urls_from_text(text: str) -> List[str]:
    """Extract URLs from text content"""
    url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[-\w/_.])*(?:\?(?:[-\w&=%.])*)?(?:#(?:[-\w.])*)?)?'
    urls = re.findall(url_pattern, text)
    return list(set(urls))  # Remove duplicates

def validate_url(url: str) -> Tuple[bool, str]:
    """
    Validate URL format and basic security checks

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        if not url or not url.strip():
            return False, "Empty URL"

        url = url.strip()

        # Basic format check
        parsed = urlparse(url)

        if not parsed.scheme:
            return False, "Missing scheme (http:// or https://)"

        if parsed.scheme not in ['http', 'https']:
            return False, f"Invalid scheme: {parsed.scheme}"

        if not parsed.netloc:
            return False, "Invalid domain"

        # Reject localhost and private network addresses (basic SSRF guard)
        suspicious_patterns = [
            r'^localhost$',
            r'^127\.0\.0\.1$',
            r'^192\.168\.',
            r'^10\.',
            r'^172\.(1[6-9]|2[0-9]|3[01])\.'
        ]

        hostname = parsed.hostname or ''
        for pattern in suspicious_patterns:
            if re.search(pattern, hostname, re.IGNORECASE):
                return False, "Access to internal networks not allowed"

        return True, "Valid URL"

    except Exception as e:
        return False, f"URL validation error: {str(e)}"

def parse_urls_from_file(file_content: bytes, filename: str) -> List[str]:
    """
    Parse URLs from uploaded file content

    Args:
        file_content: File content as bytes
        filename: Original filename for format detection

    Returns:
        List of extracted URLs
    """
    try:
        # Decode content, falling back to Latin-1 if UTF-8 fails
        try:
            content = file_content.decode('utf-8')
        except UnicodeDecodeError:
            content = file_content.decode('latin-1')

        urls = []

        # Handle different file formats
        if filename.lower().endswith('.csv'):
            # Try to parse as CSV
            reader = csv.DictReader(io.StringIO(content))
            for row in reader:
                # Look for a URL column (flexible naming)
                url_columns = ['url', 'URL', 'link', 'Link', 'href', 'address']
                for col in url_columns:
                    if col in row and row[col]:
                        urls.append(row[col].strip())
                        break
        else:
            # Treat as plain text (one URL per line)
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                if line and not line.startswith('#'):  # Skip comments
                    # Extract URLs from the line
                    extracted = extract_urls_from_text(line)
                    if extracted:
                        urls.extend(extracted)
                    elif validate_url(line)[0]:  # Check if the line itself is a URL
                        urls.append(line)

        # Remove duplicates while preserving order
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)

        return unique_urls

    except Exception as e:
        logger.error(f"Error parsing URLs from file: {e}")
        return []

def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using Jaccard similarity on words

    Returns:
        Similarity score between 0 and 1
    """
    if not text1 or not text2:
        return 0.0

    # Normalize for comparison
    text1 = text1.lower().strip()
    text2 = text2.lower().strip()

    if text1 == text2:
        return 1.0

    # Jaccard similarity on word sets
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    return intersection / union if union > 0 else 0.0

def detect_content_type(text: str) -> str:
    """
    Detect the type of content based on text analysis

    Returns:
        Content type string
    """
    if not text:
        return "empty"

    text_lower = text.lower()

    # Check for common patterns
    if any(word in text_lower for word in ['abstract:', 'introduction:', 'conclusion:', 'references:']):
        return "academic"
    elif any(word in text_lower for word in ['news', 'reported', 'according to', 'sources say']):
        return "news"
    elif any(word in text_lower for word in ['review', 'rating', 'stars', 'recommend']):
        return "review"
    elif any(word in text_lower for word in ['blog', 'posted by', 'share this']):
        return "blog"
    elif re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text):
        return "dated_content"
    else:
        return "general"

def extract_metadata_from_text(text: str) -> Dict[str, Any]:
    """
    Extract metadata from text content

    Returns:
        Dictionary of extracted metadata
    """
    metadata = {}

    # Extract dates
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'
    ]

    dates = []
    for pattern in date_patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))

    if dates:
        metadata['extracted_dates'] = dates[:5]  # Limit to first 5

    # Extract numbers and statistics
    numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', text)
    if numbers:
        metadata['numbers'] = numbers[:10]  # Limit to first 10

    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    if emails:
        metadata['emails'] = emails[:5]

    # Extract phone numbers (basic North American pattern)
    phones = re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text)
    if phones:
        metadata['phones'] = phones[:5]

    # Extract capitalized words (potential names/entities)
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', text)
    if capitalized:
        # Filter common sentence-starting words
        common_words = {'The', 'This', 'That', 'There', 'Then', 'They', 'These', 'Those'}
        filtered = [word for word in capitalized if word not in common_words]
        metadata['capitalized_terms'] = list(set(filtered))[:20]

    return metadata

def generate_content_hash(text: str) -> str:
    """Generate a hash for content deduplication"""
    # Normalize text for hashing (MD5 is used for dedup only, not security)
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    return hashlib.md5(normalized.encode('utf-8')).hexdigest()

def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable form"""
    if size_bytes == 0:
        return "0 B"

    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1

    return f"{size_bytes:.1f} {size_names[i]}"

def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))

def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to a maximum length, appending a suffix"""
    if len(text) <= max_length:
        return text

    return text[:max_length - len(suffix)] + suffix

def create_filename_safe_string(text: str, max_length: int = 50) -> str:
    """Create a filesystem-safe string from text"""
    # Replace characters that are problematic in filenames
    safe_text = re.sub(r'[<>:"/\\|?*]', '_', text)
    safe_text = re.sub(r'\s+', '_', safe_text)
    safe_text = safe_text.strip('._')

    # Truncate if too long
    if len(safe_text) > max_length:
        safe_text = safe_text[:max_length].rstrip('_')

    return safe_text or "untitled"

def validate_dataset_format(data: List[Dict[str, Any]], required_fields: List[str]) -> Tuple[bool, List[str]]:
    """
    Validate dataset format against required fields

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    if not data:
        errors.append("Dataset is empty")
        return False, errors

    # Spot-check the first 10 items
    for i, item in enumerate(data[:10]):
        if not isinstance(item, dict):
            errors.append(f"Item {i} is not a dictionary")
            continue

        # Check required fields
        for field in required_fields:
            if field not in item:
                errors.append(f"Item {i} missing required field: {field}")
            elif item[field] in (None, ''):  # Empty values (0 and False are allowed)
                errors.append(f"Item {i} has empty value for field: {field}")

    return len(errors) == 0, errors

def create_progress_message(current: int, total: int, operation: str = "Processing") -> str:
    """Create a formatted progress message"""
    percentage = (current / total * 100) if total > 0 else 0
    return f"{operation} {current}/{total} ({percentage:.1f}%)"

def sanitize_text_for_json(text: str) -> str:
    """Sanitize text for safe JSON serialization"""
    if not text:
        return ""

    # Remove null bytes and replace other control characters with spaces
    text = text.replace('\x00', '')
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)

    return text

def extract_domain_from_url(url: str) -> str:
    """Extract the domain from a URL"""
    try:
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        return "unknown"

def analyze_text_quality(text: str) -> Dict[str, Any]:
    """
    Analyze text quality and return metrics

    Returns:
        Dictionary with quality metrics
    """
    if not text:
        return {'score': 0.0, 'issues': ['Empty text']}

    issues = []
    score = 1.0

    # Length checks
    word_count = len(text.split())
    if word_count < 10:
        issues.append('Too short (< 10 words)')
        score -= 0.3
    elif word_count < 50:
        score -= 0.1

    # Character checks
    if len(text) < 100:
        issues.append('Very short content')
        score -= 0.2

    # Language quality checks
    uppercase_ratio = sum(1 for c in text if c.isupper()) / len(text)
    if uppercase_ratio > 0.3:
        issues.append('Excessive uppercase')
        score -= 0.2

    # Punctuation checks
    sentence_endings = text.count('.') + text.count('!') + text.count('?')
    if word_count > 50 and sentence_endings < 2:
        issues.append('Few sentence endings')
        score -= 0.1

    # Excessive repetition check
    words = text.lower().split()
    if len(words) > 10:
        unique_words = set(words)
        if len(unique_words) / len(words) < 0.5:
            issues.append('High word repetition')
            score -= 0.2

    # Special character checks
    special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
    if special_char_ratio > 0.1:
        issues.append('Many special characters')
        score -= 0.1

    return {
        'score': max(0.0, score),
        'word_count': word_count,
        'char_count': len(text),
        'uppercase_ratio': uppercase_ratio,
        'special_char_ratio': special_char_ratio,
        'issues': issues
    }

# Dataset template utilities
def create_classification_example(text: str, label: str, confidence: float = 1.0) -> Dict[str, Any]:
    """Create a text classification example"""
    return {
        'text': text,
        'label': label,
        'confidence': confidence
    }

def create_ner_example(text: str, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create a named entity recognition example"""
    return {
        'text': text,
        'entities': entities
    }

def create_qa_example(context: str, question: str, answer: str, answer_start: Optional[int] = None) -> Dict[str, Any]:
    """Create a question answering example"""
    example = {
        'context': context,
        'question': question,
        'answer': answer
    }

    if answer_start is not None:
        example['answer_start'] = answer_start

    return example

def create_summarization_example(text: str, summary: str) -> Dict[str, Any]:
    """Create a text summarization example"""
    return {
        'text': text,
        'summary': summary
    }
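
A minimal usage sketch of how these helpers compose, assuming utils.py is importable from the working directory; the URLs, sample text, and the "news" label below are made up purely for illustration and are not part of the commit:

# usage_sketch.py — illustrative only
from utils import (
    clean_text, validate_url, parse_urls_from_file,
    analyze_text_quality, generate_content_hash, create_classification_example,
)

# Validate a couple of candidate URLs (internal hosts are rejected)
for candidate in ["https://example.com/articles/1", "http://127.0.0.1/admin"]:
    ok, message = validate_url(candidate)
    print(candidate, "->", ok, message)

# Parse URLs from an uploaded plain-text file (bytes, one URL per line)
uploaded = b"# seed list\nhttps://example.com/a\nhttps://example.com/a\nhttps://example.org/b\n"
print(parse_urls_from_file(uploaded, "seeds.txt"))  # duplicates removed, comment line skipped

# Clean scraped text, score it, and build a classification example
raw = "Breaking news!!!!   According to sources, the launch    was a success..."
text = clean_text(raw, aggressive=True)
quality = analyze_text_quality(text)
if quality['score'] >= 0.5:
    example = create_classification_example(text, label="news", confidence=quality['score'])
    example['content_hash'] = generate_content_hash(text)  # for deduplication downstream
    print(example)

The quality gate and the content hash are the intended glue: low-scoring text is dropped before it becomes a dataset row, and the hash lets a later pass discard near-identical rows cheaply.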