MagicMeWizard committed
Commit 4fc0c1e · verified · 1 Parent(s): 135516a

Create utils.py

Files changed (1)
  1. utils.py +462 -0
utils.py ADDED
@@ -0,0 +1,462 @@
"""
Utility functions for AI Dataset Studio
Common helpers for text processing, validation, and data manipulation
"""

import re
import hashlib
import json
import csv
import io
from typing import List, Dict, Any, Optional, Tuple, Union
from urllib.parse import urlparse, urljoin
from datetime import datetime
import logging

logger = logging.getLogger(__name__)

def clean_text(text: str, aggressive: bool = False) -> str:
    """
    Clean text content with various strategies

    Args:
        text: Input text to clean
        aggressive: Whether to apply aggressive cleaning

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Basic cleaning
    text = text.strip()

    # Collapse excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove URLs and email addresses if aggressive
    if aggressive:
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)  # Email addresses

    # Fix common encoding issues (mojibake from mis-decoded UTF-8)
    text = text.replace('â€™', "'")
    text = text.replace('â€œ', '"')
    text = text.replace('â€"', '—')  # em dash; handle before the shorter 'â€' sequence
    text = text.replace('â€', '"')

    # Collapse excessive punctuation
    text = re.sub(r'[!?]{3,}', '!!!', text)
    text = re.sub(r'\.{4,}', '...', text)

    # Normalize curly quotes and apostrophes
    text = re.sub(r'[\u201c\u201d\u201e]', '"', text)
    text = re.sub(r'[\u2018\u2019\u201a]', "'", text)

    return text.strip()

def extract_urls_from_text(text: str) -> List[str]:
    """Extract URLs from text content"""
    url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[-\w/_.])*(?:\?(?:[-\w&=%.])*)?(?:#(?:[-\w.])*)?)?'
    urls = re.findall(url_pattern, text)
    return list(set(urls))  # Remove duplicates

def validate_url(url: str) -> Tuple[bool, str]:
    """
    Validate URL format and basic security checks

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        if not url or not url.strip():
            return False, "Empty URL"

        url = url.strip()

        # Basic format check
        parsed = urlparse(url)

        if not parsed.scheme:
            return False, "Missing scheme (http:// or https://)"

        if parsed.scheme not in ['http', 'https']:
            return False, f"Invalid scheme: {parsed.scheme}"

        if not parsed.netloc:
            return False, "Invalid domain"

        # Reject localhost and private network addresses (basic SSRF guard)
        suspicious_patterns = [
            r'^localhost$',
            r'^127\.0\.0\.1$',
            r'^192\.168\.',
            r'^10\.',
            r'^172\.(1[6-9]|2[0-9]|3[01])\.'
        ]

        hostname = parsed.hostname or ''
        for pattern in suspicious_patterns:
            if re.search(pattern, hostname, re.IGNORECASE):
                return False, "Access to internal networks not allowed"

        return True, "Valid URL"

    except Exception as e:
        return False, f"URL validation error: {str(e)}"

def parse_urls_from_file(file_content: bytes, filename: str) -> List[str]:
    """
    Parse URLs from uploaded file content

    Args:
        file_content: File content as bytes
        filename: Original filename for format detection

    Returns:
        List of extracted URLs
    """
    try:
        # Decode content, falling back to Latin-1 if UTF-8 fails
        try:
            content = file_content.decode('utf-8')
        except UnicodeDecodeError:
            content = file_content.decode('latin-1')

        urls = []

        # Handle different file formats
        if filename.lower().endswith('.csv'):
            # Try to parse as CSV
            reader = csv.DictReader(io.StringIO(content))
            for row in reader:
                # Look for a URL column (flexible naming)
                url_columns = ['url', 'URL', 'link', 'Link', 'href', 'address']
                for col in url_columns:
                    if col in row and row[col]:
                        urls.append(row[col].strip())
                        break
        else:
            # Treat as plain text (one URL per line)
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                if line and not line.startswith('#'):  # Skip comments
                    # Extract URLs from the line
                    extracted = extract_urls_from_text(line)
                    if extracted:
                        urls.extend(extracted)
                    elif validate_url(line)[0]:  # Check if the line itself is a URL
                        urls.append(line)

        # Remove duplicates while preserving order
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)

        return unique_urls

    except Exception as e:
        logger.error(f"Error parsing URLs from file: {e}")
        return []

def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using Jaccard similarity on words

    Returns:
        Similarity score between 0 and 1
    """
    if not text1 or not text2:
        return 0.0

    # Normalize for comparison
    text1 = text1.lower().strip()
    text2 = text2.lower().strip()

    if text1 == text2:
        return 1.0

    # Jaccard similarity on word sets
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    return intersection / union if union > 0 else 0.0

def detect_content_type(text: str) -> str:
    """
    Detect the type of content based on text analysis

    Returns:
        Content type string
    """
    if not text:
        return "empty"

    text_lower = text.lower()

    # Check for common patterns
    if any(word in text_lower for word in ['abstract:', 'introduction:', 'conclusion:', 'references:']):
        return "academic"
    elif any(word in text_lower for word in ['news', 'reported', 'according to', 'sources say']):
        return "news"
    elif any(word in text_lower for word in ['review', 'rating', 'stars', 'recommend']):
        return "review"
    elif any(word in text_lower for word in ['blog', 'posted by', 'share this']):
        return "blog"
    elif re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text):
        return "dated_content"
    else:
        return "general"

def extract_metadata_from_text(text: str) -> Dict[str, Any]:
    """
    Extract metadata from text content

    Returns:
        Dictionary of extracted metadata
    """
    metadata = {}

    # Extract dates
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'
    ]

    dates = []
    for pattern in date_patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))

    if dates:
        metadata['extracted_dates'] = dates[:5]  # Limit to first 5

    # Extract numbers and statistics
    numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', text)
    if numbers:
        metadata['numbers'] = numbers[:10]  # Limit to first 10

    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    if emails:
        metadata['emails'] = emails[:5]

    # Extract phone numbers (basic North American pattern)
    phones = re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text)
    if phones:
        metadata['phones'] = phones[:5]

    # Extract capitalized words (potential names/entities)
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', text)
    if capitalized:
        # Filter common sentence-starting words
        common_words = {'The', 'This', 'That', 'There', 'Then', 'They', 'These', 'Those'}
        filtered = [word for word in capitalized if word not in common_words]
        metadata['capitalized_terms'] = list(set(filtered))[:20]

    return metadata

def generate_content_hash(text: str) -> str:
    """Generate a hash for content deduplication"""
    # Normalize text for hashing (MD5 is used for dedup only, not security)
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    return hashlib.md5(normalized.encode('utf-8')).hexdigest()

def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable form"""
    if size_bytes == 0:
        return "0 B"

    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1

    return f"{size_bytes:.1f} {size_names[i]}"

def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))

def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to a maximum length, appending a suffix"""
    if len(text) <= max_length:
        return text

    return text[:max_length - len(suffix)] + suffix

def create_filename_safe_string(text: str, max_length: int = 50) -> str:
    """Create a filesystem-safe string from text"""
    # Replace characters that are problematic in filenames
    safe_text = re.sub(r'[<>:"/\\|?*]', '_', text)
    safe_text = re.sub(r'\s+', '_', safe_text)
    safe_text = safe_text.strip('._')

    # Truncate if too long
    if len(safe_text) > max_length:
        safe_text = safe_text[:max_length].rstrip('_')

    return safe_text or "untitled"

def validate_dataset_format(data: List[Dict[str, Any]], required_fields: List[str]) -> Tuple[bool, List[str]]:
    """
    Validate dataset format against required fields

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    if not data:
        errors.append("Dataset is empty")
        return False, errors

    # Spot-check the first 10 items
    for i, item in enumerate(data[:10]):
        if not isinstance(item, dict):
            errors.append(f"Item {i} is not a dictionary")
            continue

        # Check required fields
        for field in required_fields:
            if field not in item:
                errors.append(f"Item {i} missing required field: {field}")
            elif item[field] in (None, ''):  # Empty values (0 and False are allowed)
                errors.append(f"Item {i} has empty value for field: {field}")

    return len(errors) == 0, errors

def create_progress_message(current: int, total: int, operation: str = "Processing") -> str:
    """Create a formatted progress message"""
    percentage = (current / total * 100) if total > 0 else 0
    return f"{operation} {current}/{total} ({percentage:.1f}%)"

def sanitize_text_for_json(text: str) -> str:
    """Sanitize text for safe JSON serialization"""
    if not text:
        return ""

    # Remove null bytes and replace other control characters with spaces
    text = text.replace('\x00', '')
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)

    return text

def extract_domain_from_url(url: str) -> str:
    """Extract the domain from a URL"""
    try:
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        return "unknown"

def analyze_text_quality(text: str) -> Dict[str, Any]:
    """
    Analyze text quality and return metrics

    Returns:
        Dictionary with quality metrics
    """
    if not text:
        return {'score': 0.0, 'issues': ['Empty text']}

    issues = []
    score = 1.0

    # Length checks
    word_count = len(text.split())
    if word_count < 10:
        issues.append('Too short (< 10 words)')
        score -= 0.3
    elif word_count < 50:
        score -= 0.1

    # Character checks
    if len(text) < 100:
        issues.append('Very short content')
        score -= 0.2

    # Language quality checks
    uppercase_ratio = sum(1 for c in text if c.isupper()) / len(text)
    if uppercase_ratio > 0.3:
        issues.append('Excessive uppercase')
        score -= 0.2

    # Punctuation checks
    sentence_endings = text.count('.') + text.count('!') + text.count('?')
    if word_count > 50 and sentence_endings < 2:
        issues.append('Few sentence endings')
        score -= 0.1

    # Excessive repetition check
    words = text.lower().split()
    if len(words) > 10:
        unique_words = set(words)
        if len(unique_words) / len(words) < 0.5:
            issues.append('High word repetition')
            score -= 0.2

    # Special character checks
    special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
    if special_char_ratio > 0.1:
        issues.append('Many special characters')
        score -= 0.1

    return {
        'score': max(0.0, score),
        'word_count': word_count,
        'char_count': len(text),
        'uppercase_ratio': uppercase_ratio,
        'special_char_ratio': special_char_ratio,
        'issues': issues
    }

# Dataset template utilities
def create_classification_example(text: str, label: str, confidence: float = 1.0) -> Dict[str, Any]:
    """Create a text classification example"""
    return {
        'text': text,
        'label': label,
        'confidence': confidence
    }

def create_ner_example(text: str, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create a named entity recognition example"""
    return {
        'text': text,
        'entities': entities
    }

def create_qa_example(context: str, question: str, answer: str, answer_start: Optional[int] = None) -> Dict[str, Any]:
    """Create a question answering example"""
    example = {
        'context': context,
        'question': question,
        'answer': answer
    }

    if answer_start is not None:
        example['answer_start'] = answer_start

    return example

def create_summarization_example(text: str, summary: str) -> Dict[str, Any]:
    """Create a text summarization example"""
    return {
        'text': text,
        'summary': summary
    }
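
A minimal usage sketch of how these helpers compose, assuming utils.py is importable from the working directory; the URLs, sample text, and the "news" label below are made up purely for illustration and are not part of the commit:

# usage_sketch.py — illustrative only
from utils import (
    clean_text, validate_url, parse_urls_from_file,
    analyze_text_quality, generate_content_hash, create_classification_example,
)

# Validate a couple of candidate URLs (internal hosts are rejected)
for candidate in ["https://example.com/articles/1", "http://127.0.0.1/admin"]:
    ok, message = validate_url(candidate)
    print(candidate, "->", ok, message)

# Parse URLs from an uploaded plain-text file (bytes, one URL per line)
uploaded = b"# seed list\nhttps://example.com/a\nhttps://example.com/a\nhttps://example.org/b\n"
print(parse_urls_from_file(uploaded, "seeds.txt"))  # duplicates removed, comment line skipped

# Clean scraped text, score it, and build a classification example
raw = "Breaking news!!!!   According to sources, the launch    was a success..."
text = clean_text(raw, aggressive=True)
quality = analyze_text_quality(text)
if quality['score'] >= 0.5:
    example = create_classification_example(text, label="news", confidence=quality['score'])
    example['content_hash'] = generate_content_hash(text)  # for deduplication downstream
    print(example)

The quality gate and the content hash are the intended glue: low-scoring text is dropped before it becomes a dataset row, and the hash lets a later pass discard near-identical rows cheaply.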