MagicMeWizard committed
Commit 135516a · verified · 1 Parent(s): 23fb9fd

Delete utils.py

Files changed (1)
  utils.py +0 -462
utils.py DELETED
@@ -1,462 +0,0 @@
"""
Utility functions for AI Dataset Studio
Common helpers for text processing, validation, and data manipulation
"""

import re
import hashlib
import json
import csv
import io
from typing import List, Dict, Any, Optional, Tuple, Union
from urllib.parse import urlparse, urljoin
from datetime import datetime
import logging

logger = logging.getLogger(__name__)

def clean_text(text: str, aggressive: bool = False) -> str:
    """
    Clean text content with various strategies

    Args:
        text: Input text to clean
        aggressive: Whether to apply aggressive cleaning

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Basic cleaning
    text = text.strip()

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove URLs if aggressive
    if aggressive:
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)  # Email addresses

    # Fix common encoding issues; the longer mojibake sequence is replaced
    # first so 'â€"' is not clobbered by the 'â€' replacement
    text = text.replace('’', "'")
    text = text.replace('“', '"')
    text = text.replace('â€"', '—')
    text = text.replace('â€', '"')

    # Remove excessive punctuation
    text = re.sub(r'[!?]{3,}', '!!!', text)
    text = re.sub(r'\.{4,}', '...', text)

    # Clean up remaining curly quotes and apostrophes
    text = re.sub(r'["“”]', '"', text)
    text = re.sub(r"['‘’]", "'", text)

    return text.strip()

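# Usage sketch (doctest-style; outputs follow from the substitutions above —
# whitespace collapses before URL removal, and runs of !/? shrink to three):
#   >>> clean_text("Hello    world!!!!!")
#   'Hello world!!!'
#   >>> clean_text("Visit http://example.com now", aggressive=True)
#   'Visit  now'
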
def extract_urls_from_text(text: str) -> List[str]:
    """Extract URLs from text content"""
    url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
    urls = re.findall(url_pattern, text)
    return list(set(urls))  # Remove duplicates

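# Usage sketch; note the set() deduplication does not preserve order:
#   >>> sorted(extract_urls_from_text("see https://a.com and https://b.org/x"))
#   ['https://a.com', 'https://b.org/x']
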
def validate_url(url: str) -> Tuple[bool, str]:
    """
    Validate URL format and basic security checks

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        if not url or not url.strip():
            return False, "Empty URL"

        url = url.strip()

        # Basic format check
        parsed = urlparse(url)

        if not parsed.scheme:
            return False, "Missing scheme (http:// or https://)"

        if parsed.scheme not in ['http', 'https']:
            return False, f"Invalid scheme: {parsed.scheme}"

        if not parsed.netloc:
            return False, "Invalid domain"

        # Check for suspicious patterns
        suspicious_patterns = [
            r'localhost',
            r'127\.0\.0\.1',
            r'192\.168\.',
            r'10\.',
            r'172\.(1[6-9]|2[0-9]|3[01])\.'
        ]

        for pattern in suspicious_patterns:
            if re.search(pattern, parsed.netloc, re.IGNORECASE):
                return False, "Access to internal networks not allowed"

        return True, "Valid URL"

    except Exception as e:
        return False, f"URL validation error: {str(e)}"

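# Usage sketch; the private-range patterns are a coarse guard only — they
# match the hostname string, not resolved IP addresses:
#   >>> validate_url("https://example.com/page")
#   (True, 'Valid URL')
#   >>> validate_url("http://192.168.1.10/admin")
#   (False, 'Access to internal networks not allowed')
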
def parse_urls_from_file(file_content: bytes, filename: str) -> List[str]:
    """
    Parse URLs from uploaded file content

    Args:
        file_content: File content as bytes
        filename: Original filename for format detection

    Returns:
        List of extracted URLs
    """
    try:
        # Decode content
        try:
            content = file_content.decode('utf-8')
        except UnicodeDecodeError:
            content = file_content.decode('latin-1')

        urls = []

        # Handle different file formats
        if filename.lower().endswith('.csv'):
            # Try to parse as CSV
            reader = csv.DictReader(io.StringIO(content))
            for row in reader:
                # Look for URL column (flexible naming)
                url_columns = ['url', 'URL', 'link', 'Link', 'href', 'address']
                for col in url_columns:
                    if col in row and row[col]:
                        urls.append(row[col].strip())
                        break
        else:
            # Treat as plain text (one URL per line)
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                if line and not line.startswith('#'):  # Skip comments
                    # Extract URLs from line
                    extracted = extract_urls_from_text(line)
                    if extracted:
                        urls.extend(extracted)
                    elif validate_url(line)[0]:  # Check if line itself is a URL
                        urls.append(line)

        # Remove duplicates while preserving order
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)

        return unique_urls

    except Exception as e:
        logger.error(f"Error parsing URLs from file: {e}")
        return []

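# Usage sketch with hypothetical file contents:
#   >>> parse_urls_from_file(b"url\nhttps://example.com\n", "links.csv")
#   ['https://example.com']
#   >>> parse_urls_from_file(b"# seeds\nhttps://a.com\n", "seeds.txt")
#   ['https://a.com']
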
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using word-level Jaccard overlap

    Returns:
        Similarity score between 0 and 1
    """
    if not text1 or not text2:
        return 0.0

    # Normalize for comparison
    text1 = text1.lower().strip()
    text2 = text2.lower().strip()

    if text1 == text2:
        return 1.0

    # Jaccard similarity on words
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    return intersection / union if union > 0 else 0.0

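# Usage sketch: Jaccard = |intersection| / |union| over the word sets.
#   >>> calculate_text_similarity("the quick brown fox", "the quick red fox")
#   0.6
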
def detect_content_type(text: str) -> str:
    """
    Detect the type of content based on text analysis

    Returns:
        Content type string
    """
    if not text:
        return "empty"

    text_lower = text.lower()

    # Check for common patterns
    if any(word in text_lower for word in ['abstract:', 'introduction:', 'conclusion:', 'references:']):
        return "academic"
    elif any(word in text_lower for word in ['news', 'reported', 'according to', 'sources say']):
        return "news"
    elif any(word in text_lower for word in ['review', 'rating', 'stars', 'recommend']):
        return "review"
    elif any(word in text_lower for word in ['blog', 'posted by', 'share this']):
        return "blog"
    elif re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text):
        return "dated_content"
    else:
        return "general"

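# Usage sketch; checks run top-down, so the first matching category wins:
#   >>> detect_content_type("According to officials, the event was reported.")
#   'news'
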
def extract_metadata_from_text(text: str) -> Dict[str, Any]:
    """
    Extract metadata from text content

    Returns:
        Dictionary of extracted metadata
    """
    metadata = {}

    # Extract dates
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'
    ]

    dates = []
    for pattern in date_patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))

    if dates:
        metadata['extracted_dates'] = dates[:5]  # Limit to first 5

    # Extract numbers and statistics
    numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', text)
    if numbers:
        metadata['numbers'] = numbers[:10]  # Limit to first 10

    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    if emails:
        metadata['emails'] = emails[:5]

    # Extract phone numbers (basic pattern)
    phones = re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text)
    if phones:
        metadata['phones'] = phones[:5]

    # Extract capitalized words (potential names/entities)
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', text)
    if capitalized:
        # Filter common words
        common_words = {'The', 'This', 'That', 'There', 'Then', 'They', 'These', 'Those'}
        filtered = [word for word in capitalized if word not in common_words]
        metadata['capitalized_terms'] = list(set(filtered))[:20]

    return metadata

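# Usage sketch (illustrative input):
#   >>> meta = extract_metadata_from_text("Email john@example.com by 12/31/2024.")
#   >>> meta['emails'], meta['extracted_dates']
#   (['john@example.com'], ['12/31/2024'])
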
def generate_content_hash(text: str) -> str:
    """Generate a hash for content deduplication"""
    # Normalize text for hashing
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    # MD5 is acceptable here: the digest is used for deduplication, not security
    return hashlib.md5(normalized.encode('utf-8')).hexdigest()

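# Usage sketch: hashes match after case/whitespace normalization.
#   >>> generate_content_hash("Hello   World") == generate_content_hash("hello world")
#   True
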
def format_file_size(size_bytes: int) -> str:
    """Format file size in human readable format"""
    if size_bytes == 0:
        return "0 B"

    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1

    return f"{size_bytes:.1f} {size_names[i]}"

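# Usage sketch:
#   >>> format_file_size(1536)
#   '1.5 KB'
#   >>> format_file_size(10_485_760)
#   '10.0 MB'
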
def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))

def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to maximum length with suffix"""
    if len(text) <= max_length:
        return text

    return text[:max_length - len(suffix)] + suffix

def create_filename_safe_string(text: str, max_length: int = 50) -> str:
    """Create a filesystem-safe string from text"""
    # Remove/replace problematic characters
    safe_text = re.sub(r'[<>:"/\\|?*]', '_', text)
    safe_text = re.sub(r'\s+', '_', safe_text)
    safe_text = safe_text.strip('._')

    # Truncate if too long
    if len(safe_text) > max_length:
        safe_text = safe_text[:max_length].rstrip('_')

    return safe_text or "untitled"

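# Usage sketch: the colon and each space become underscores.
#   >>> create_filename_safe_string("My Report: 2024")
#   'My_Report__2024'
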
def validate_dataset_format(data: List[Dict[str, Any]], required_fields: List[str]) -> Tuple[bool, List[str]]:
    """
    Validate dataset format against required fields

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    if not data:
        errors.append("Dataset is empty")
        return False, errors

    # Check each item
    for i, item in enumerate(data[:10]):  # Check first 10 items
        if not isinstance(item, dict):
            errors.append(f"Item {i} is not a dictionary")
            continue

        # Check required fields
        for field in required_fields:
            if field not in item:
                errors.append(f"Item {i} missing required field: {field}")
            elif not item[field]:  # Check for empty values
                errors.append(f"Item {i} has empty value for field: {field}")

    return len(errors) == 0, errors

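# Usage sketch; only the first 10 items are inspected, so a clean result does
# not guarantee the whole dataset is valid:
#   >>> validate_dataset_format([{'text': 'hi', 'label': 'pos'}], ['text', 'label'])
#   (True, [])
#   >>> validate_dataset_format([{'text': 'hi'}], ['text', 'label'])
#   (False, ['Item 0 missing required field: label'])
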
def create_progress_message(current: int, total: int, operation: str = "Processing") -> str:
    """Create a formatted progress message"""
    percentage = (current / total * 100) if total > 0 else 0
    return f"{operation} {current}/{total} ({percentage:.1f}%)"

def sanitize_text_for_json(text: str) -> str:
    """Sanitize text for safe JSON serialization"""
    if not text:
        return ""

    # Replace problematic characters
    text = text.replace('\x00', '')  # Remove null bytes
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)  # Replace control characters with spaces

    return text

def extract_domain_from_url(url: str) -> str:
    """Extract domain from URL"""
    try:
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        return "unknown"

def analyze_text_quality(text: str) -> Dict[str, Any]:
    """
    Analyze text quality and return metrics

    Returns:
        Dictionary with quality metrics
    """
    if not text:
        return {'score': 0.0, 'issues': ['Empty text']}

    issues = []
    score = 1.0

    # Length checks
    word_count = len(text.split())
    if word_count < 10:
        issues.append('Too short (< 10 words)')
        score -= 0.3
    elif word_count < 50:
        score -= 0.1

    # Character checks
    if len(text) < 100:
        issues.append('Very short content')
        score -= 0.2

    # Language quality checks
    uppercase_ratio = sum(1 for c in text if c.isupper()) / len(text)
    if uppercase_ratio > 0.3:
        issues.append('Excessive uppercase')
        score -= 0.2

    # Punctuation checks
    sentence_endings = text.count('.') + text.count('!') + text.count('?')
    if word_count > 50 and sentence_endings < 2:
        issues.append('Few sentence endings')
        score -= 0.1

    # Excessive repetition check
    words = text.lower().split()
    if len(words) > 10:
        unique_words = set(words)
        if len(unique_words) / len(words) < 0.5:
            issues.append('High word repetition')
            score -= 0.2

    # Special character checks
    special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
    if special_char_ratio > 0.1:
        issues.append('Many special characters')
        score -= 0.1

    return {
        'score': max(0.0, score),
        'word_count': word_count,
        'char_count': len(text),
        'uppercase_ratio': uppercase_ratio,
        'special_char_ratio': special_char_ratio,
        'issues': issues
    }

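# Usage sketch: the score starts at 1.0 and is decremented per issue
# (rounded here because of floating-point subtraction):
#   >>> report = analyze_text_quality("Too short.")
#   >>> round(report['score'], 2), report['issues']
#   (0.5, ['Too short (< 10 words)', 'Very short content'])
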
# Dataset template utilities
def create_classification_example(text: str, label: str, confidence: float = 1.0) -> Dict[str, Any]:
    """Create a text classification example"""
    return {
        'text': text,
        'label': label,
        'confidence': confidence
    }

def create_ner_example(text: str, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create a named entity recognition example"""
    return {
        'text': text,
        'entities': entities
    }

def create_qa_example(context: str, question: str, answer: str, answer_start: Optional[int] = None) -> Dict[str, Any]:
    """Create a question answering example"""
    example = {
        'context': context,
        'question': question,
        'answer': answer
    }

    if answer_start is not None:
        example['answer_start'] = answer_start

    return example

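# Usage sketch; answer_start is the character offset of the answer within the
# context, as in SQuAD-style datasets:
#   >>> create_qa_example("Paris is the capital.", "What is the capital?", "Paris", answer_start=0)
#   {'context': 'Paris is the capital.', 'question': 'What is the capital?', 'answer': 'Paris', 'answer_start': 0}
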
def create_summarization_example(text: str, summary: str) -> Dict[str, Any]:
    """Create a text summarization example"""
    return {
        'text': text,
        'summary': summary
    }
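
Taken together, the deleted helpers formed a small ingestion pipeline: validate a URL, clean the fetched text, score its quality, deduplicate by normalized hash, and wrap the result in a dataset template. A minimal sketch of that flow, assuming the module above is importable as utils (the sample data and the 'unlabeled' label are hypothetical; this code was not part of the deleted file):

    from utils import (clean_text, validate_url, analyze_text_quality,
                       generate_content_hash, create_classification_example)

    # Hypothetical (url, raw_text) pairs from a scraper
    scraped_items = [("https://example.com/a", "Some scraped article text ...")]

    seen, examples = set(), []
    for url, raw_text in scraped_items:
        if not validate_url(url)[0]:
            continue  # skip malformed or internal-network URLs
        text = clean_text(raw_text, aggressive=True)
        if analyze_text_quality(text)['score'] < 0.5:
            continue  # drop low-quality content
        digest = generate_content_hash(text)
        if digest in seen:
            continue  # drop duplicates with identical normalized text
        seen.add(digest)
        examples.append(create_classification_example(text, label='unlabeled'))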