""" |
|
Utility functions for AI Dataset Studio |
|
Common helpers for text processing, validation, and data manipulation |
|
""" |
|
|
|
import re |
|
import hashlib |
|
import json |
|
import csv |
|
import io |
|
from typing import List, Dict, Any, Optional, Tuple, Union |
|
from urllib.parse import urlparse, urljoin |
|
from datetime import datetime |
|
import logging |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def clean_text(text: str, aggressive: bool = False) -> str:
    """
    Clean text content with various strategies

    Args:
        text: Input text to clean
        aggressive: Whether to apply aggressive cleaning

    Returns:
        Cleaned text
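
    Example (illustrative):
        >>> clean_text("  Hello    world!!!  ")
        'Hello world!!!'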
""" |
|
if not text: |
|
return "" |
|
|
|
|
|
text = text.strip() |
|
|
|
|
|
text = re.sub(r'\s+', ' ', text) |
|
|
|
|
|
if aggressive: |
|
text = re.sub(r'http\S+|www\.\S+', '', text) |
|
text = re.sub(r'\S+@\S+', '', text) |
|
|
|
|
|
text = text.replace('’', "'") |
|
text = text.replace('“', '"') |
|
text = text.replace('â€', '"') |
|
text = text.replace('â€"', '—') |
|
|
|
|
|
text = re.sub(r'[!?]{3,}', '!!!', text) |
|
text = re.sub(r'\.{4,}', '...', text) |
|
|
|
|
|
text = re.sub(r'["""]', '"', text) |
|
text = re.sub(r'[''']', "'", text) |
|
|
|
return text.strip() |
|
|
|
def extract_urls_from_text(text: str) -> List[str]:
    """Extract URLs from text content"""
    url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
    urls = re.findall(url_pattern, text)
    return list(set(urls))  # Remove duplicates


def validate_url(url: str) -> Tuple[bool, str]:
    """
    Validate URL format and basic security checks

    Returns:
        Tuple of (is_valid, error_message)
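
    Example (illustrative):
        >>> validate_url("https://example.com/page")
        (True, 'Valid URL')
        >>> validate_url("ftp://example.com")[0]
        False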
""" |
|
try: |
|
if not url or not url.strip(): |
|
return False, "Empty URL" |
|
|
|
url = url.strip() |
|
|
|
# Basic format check |
|
parsed = urlparse(url) |
|
|
|
if not parsed.scheme: |
|
return False, "Missing scheme (http:// or https://)" |
|
|
|
if parsed.scheme not in ['http', 'https']: |
|
return False, f"Invalid scheme: {parsed.scheme}" |
|
|
|
if not parsed.netloc: |
|
return False, "Invalid domain" |
|
|
|
# Check for suspicious patterns |
|
suspicious_patterns = [ |
|
r'localhost', |
|
r'127\.0\.0\.1', |
|
r'192\.168\.', |
|
r'10\.', |
|
r'172\.(1[6-9]|2[0-9]|3[01])\.' |
|
] |
|
|
|
for pattern in suspicious_patterns: |
|
if re.search(pattern, parsed.netloc, re.IGNORECASE): |
|
return False, "Access to internal networks not allowed" |
|
|
|
return True, "Valid URL" |
|
|
|
except Exception as e: |
|
return False, f"URL validation error: {str(e)}" |
|
|
|
def parse_urls_from_file(file_content: bytes, filename: str) -> List[str]:
    """
    Parse URLs from uploaded file content

    Args:
        file_content: File content as bytes
        filename: Original filename for format detection

    Returns:
        List of extracted URLs
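
    Example (illustrative):
        >>> parse_urls_from_file(b"https://example.com", "urls.txt")
        ['https://example.com']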
""" |
|
try: |
|
# Decode content |
|
try: |
|
content = file_content.decode('utf-8') |
|
except UnicodeDecodeError: |
|
content = file_content.decode('latin-1') |
|
|
|
urls = [] |
|
|
|
# Handle different file formats |
|
if filename.lower().endswith('.csv'): |
|
# Try to parse as CSV |
|
reader = csv.DictReader(io.StringIO(content)) |
|
for row in reader: |
|
# Look for URL column (flexible naming) |
|
url_columns = ['url', 'URL', 'link', 'Link', 'href', 'address'] |
|
for col in url_columns: |
|
if col in row and row[col]: |
|
urls.append(row[col].strip()) |
|
break |
|
else: |
|
# Treat as plain text (one URL per line) |
|
lines = content.split('\n') |
|
for line in lines: |
|
line = line.strip() |
|
if line and not line.startswith('#'): # Skip comments |
|
# Extract URLs from line |
|
extracted = extract_urls_from_text(line) |
|
if extracted: |
|
urls.extend(extracted) |
|
elif validate_url(line)[0]: # Check if line itself is a URL |
|
urls.append(line) |
|
|
|
# Remove duplicates while preserving order |
|
seen = set() |
|
unique_urls = [] |
|
for url in urls: |
|
if url not in seen: |
|
seen.add(url) |
|
unique_urls.append(url) |
|
|
|
return unique_urls |
|
|
|
except Exception as e: |
|
logger.error(f"Error parsing URLs from file: {e}") |
|
return [] |
|
|
|
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple methods

    Returns:
        Similarity score between 0 and 1
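
    Example (illustrative):
        >>> calculate_text_similarity("the quick brown fox", "the quick red fox")
        0.6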
""" |
|
if not text1 or not text2: |
|
return 0.0 |
|
|
|
# Simple character-level similarity |
|
text1 = text1.lower().strip() |
|
text2 = text2.lower().strip() |
|
|
|
if text1 == text2: |
|
return 1.0 |
|
|
|
# Jaccard similarity on words |
|
words1 = set(text1.split()) |
|
words2 = set(text2.split()) |
|
|
|
if not words1 and not words2: |
|
return 1.0 |
|
if not words1 or not words2: |
|
return 0.0 |
|
|
|
intersection = len(words1.intersection(words2)) |
|
union = len(words1.union(words2)) |
|
|
|
return intersection / union if union > 0 else 0.0 |
|
|
|
def detect_content_type(text: str) -> str:
    """
    Detect the type of content based on text analysis

    Returns:
        Content type string
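
    Example (illustrative):
        >>> detect_content_type("According to sources, the incident was reported on Friday.")
        'news'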
""" |
|
if not text: |
|
return "empty" |
|
|
|
text_lower = text.lower() |
|
|
|
# Check for common patterns |
|
if any(word in text_lower for word in ['abstract:', 'introduction:', 'conclusion:', 'references:']): |
|
return "academic" |
|
elif any(word in text_lower for word in ['news', 'reported', 'according to', 'sources say']): |
|
return "news" |
|
elif any(word in text_lower for word in ['review', 'rating', 'stars', 'recommend']): |
|
return "review" |
|
elif any(word in text_lower for word in ['blog', 'posted by', 'share this']): |
|
return "blog" |
|
elif re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text): |
|
return "dated_content" |
|
else: |
|
return "general" |
|
|
|
def extract_metadata_from_text(text: str) -> Dict[str, Any]:
    """
    Extract metadata from text content

    Returns:
        Dictionary of extracted metadata
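
    Example (illustrative):
        >>> meta = extract_metadata_from_text("Contact jane@example.com by 12/31/2024.")
        >>> meta['emails']
        ['jane@example.com']
        >>> meta['extracted_dates']
        ['12/31/2024']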
""" |
|
metadata = {} |
|
|
|
# Extract dates |
|
date_patterns = [ |
|
r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', |
|
r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b', |
|
r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b' |
|
] |
|
|
|
dates = [] |
|
for pattern in date_patterns: |
|
dates.extend(re.findall(pattern, text, re.IGNORECASE)) |
|
|
|
if dates: |
|
metadata['extracted_dates'] = dates[:5] # Limit to first 5 |
|
|
|
# Extract numbers and statistics |
|
numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', text) |
|
if numbers: |
|
metadata['numbers'] = numbers[:10] # Limit to first 10 |
|
|
|
# Extract email addresses |
|
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text) |
|
if emails: |
|
metadata['emails'] = emails[:5] |
|
|
|
# Extract phone numbers (basic pattern) |
|
phones = re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text) |
|
if phones: |
|
metadata['phones'] = phones[:5] |
|
|
|
# Extract capitalized words (potential names/entities) |
|
capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', text) |
|
if capitalized: |
|
# Filter common words |
|
common_words = {'The', 'This', 'That', 'There', 'Then', 'They', 'These', 'Those'} |
|
filtered = [word for word in capitalized if word not in common_words] |
|
metadata['capitalized_terms'] = list(set(filtered))[:20] |
|
|
|
return metadata |
|
|
|
def generate_content_hash(text: str) -> str:
    """Generate a hash for content deduplication"""
    # Normalize text for hashing
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    return hashlib.md5(normalized.encode('utf-8')).hexdigest()


def format_file_size(size_bytes: int) -> str:
    """Format file size in human readable format"""
    if size_bytes == 0:
        return "0 B"

    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1

    return f"{size_bytes:.1f} {size_names[i]}"


def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))


def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to maximum length with suffix"""
    if len(text) <= max_length:
        return text

    return text[:max_length - len(suffix)] + suffix


def create_filename_safe_string(text: str, max_length: int = 50) -> str:
    """Create a filesystem-safe string from text"""
    # Remove/replace problematic characters
    safe_text = re.sub(r'[<>:"/\\|?*]', '_', text)
    safe_text = re.sub(r'\s+', '_', safe_text)
    safe_text = safe_text.strip('._')

    # Truncate if too long
    if len(safe_text) > max_length:
        safe_text = safe_text[:max_length].rstrip('_')

    return safe_text or "untitled"


def validate_dataset_format(data: List[Dict[str, Any]], required_fields: List[str]) -> Tuple[bool, List[str]]:
    """
    Validate dataset format against required fields

    Returns:
        Tuple of (is_valid, list_of_errors)
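
    Example (illustrative):
        >>> validate_dataset_format([{'text': 'hi', 'label': 'pos'}], ['text', 'label'])
        (True, [])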
""" |
|
errors = [] |
|
|
|
if not data: |
|
errors.append("Dataset is empty") |
|
return False, errors |
|
|
|
# Check each item |
|
for i, item in enumerate(data[:10]): # Check first 10 items |
|
if not isinstance(item, dict): |
|
errors.append(f"Item {i} is not a dictionary") |
|
continue |
|
|
|
# Check required fields |
|
for field in required_fields: |
|
if field not in item: |
|
errors.append(f"Item {i} missing required field: {field}") |
|
elif not item[field]: # Check for empty values |
|
errors.append(f"Item {i} has empty value for field: {field}") |
|
|
|
return len(errors) == 0, errors |
|
|
|
def create_progress_message(current: int, total: int, operation: str = "Processing") -> str:
    """Create a formatted progress message"""
    percentage = (current / total * 100) if total > 0 else 0
    return f"{operation} {current}/{total} ({percentage:.1f}%)"


def sanitize_text_for_json(text: str) -> str:
    """Sanitize text for safe JSON serialization"""
    if not text:
        return ""

    # Replace problematic characters
    text = text.replace('\x00', '')  # Remove null bytes
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)  # Replace control characters with spaces

    return text


def extract_domain_from_url(url: str) -> str:
    """Extract domain from URL"""
    try:
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        return "unknown"


def analyze_text_quality(text: str) -> Dict[str, Any]:
    """
    Analyze text quality and return metrics

    Returns:
        Dictionary with quality metrics
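
    Example (illustrative):
        >>> report = analyze_text_quality("Short text.")
        >>> report['issues']
        ['Too short (< 10 words)', 'Very short content']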
""" |
|
if not text: |
|
return {'score': 0.0, 'issues': ['Empty text']} |
|
|
|
issues = [] |
|
score = 1.0 |
|
|
|
# Length checks |
|
word_count = len(text.split()) |
|
if word_count < 10: |
|
issues.append('Too short (< 10 words)') |
|
score -= 0.3 |
|
elif word_count < 50: |
|
score -= 0.1 |
|
|
|
# Character checks |
|
if len(text) < 100: |
|
issues.append('Very short content') |
|
score -= 0.2 |
|
|
|
# Language quality checks |
|
uppercase_ratio = sum(1 for c in text if c.isupper()) / len(text) |
|
if uppercase_ratio > 0.3: |
|
issues.append('Excessive uppercase') |
|
score -= 0.2 |
|
|
|
# Punctuation checks |
|
sentence_endings = text.count('.') + text.count('!') + text.count('?') |
|
if word_count > 50 and sentence_endings < 2: |
|
issues.append('Few sentence endings') |
|
score -= 0.1 |
|
|
|
# Excessive repetition check |
|
words = text.lower().split() |
|
if len(words) > 10: |
|
unique_words = set(words) |
|
if len(unique_words) / len(words) < 0.5: |
|
issues.append('High word repetition') |
|
score -= 0.2 |
|
|
|
# Special character checks |
|
special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text) |
|
if special_char_ratio > 0.1: |
|
issues.append('Many special characters') |
|
score -= 0.1 |
|
|
|
return { |
|
'score': max(0.0, score), |
|
'word_count': word_count, |
|
'char_count': len(text), |
|
'uppercase_ratio': uppercase_ratio, |
|
'special_char_ratio': special_char_ratio, |
|
'issues': issues |
|
} |
|
|
|
# Dataset template utilities

def create_classification_example(text: str, label: str, confidence: float = 1.0) -> Dict[str, Any]:
    """Create a text classification example"""
    return {
        'text': text,
        'label': label,
        'confidence': confidence
    }


def create_ner_example(text: str, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create a named entity recognition example"""
    return {
        'text': text,
        'entities': entities
    }


def create_qa_example(context: str, question: str, answer: str, answer_start: Optional[int] = None) -> Dict[str, Any]:
    """Create a question answering example"""
    example = {
        'context': context,
        'question': question,
        'answer': answer
    }

    if answer_start is not None:
        example['answer_start'] = answer_start

    return example


def create_summarization_example(text: str, summary: str) -> Dict[str, Any]:
    """Create a text summarization example"""
    return {
        'text': text,
        'summary': summary
    }
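

# Illustrative usage sketch (not part of the library API): run the module
# directly for a quick smoke test of a few helpers. The sample text and URL
# below are made up for demonstration purposes.
if __name__ == "__main__":
    sample = "  Breaking   news!!!  Visit https://example.com/article for details.  "

    print(clean_text(sample))
    print(extract_urls_from_text(sample))
    print(validate_url("https://example.com/article"))
    print(detect_content_type(sample))
    print(format_file_size(1536))
    print(create_progress_message(3, 12, "Scraping"))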