MagicMeWizard committed
Commit 135516a · verified · 1 Parent(s): 23fb9fd

Delete utils.py

Files changed (1)
  utils.py +0 -462
utils.py DELETED
@@ -1,462 +0,0 @@
"""
Utility functions for AI Dataset Studio
Common helpers for text processing, validation, and data manipulation
"""

import re
import hashlib
import json
import csv
import io
from typing import List, Dict, Any, Optional, Tuple, Union
from urllib.parse import urlparse, urljoin
from datetime import datetime
import logging

logger = logging.getLogger(__name__)

def clean_text(text: str, aggressive: bool = False) -> str:
    """
    Clean text content with various strategies

    Args:
        text: Input text to clean
        aggressive: Whether to apply aggressive cleaning

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Basic cleaning
    text = text.strip()

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove URLs if aggressive
    if aggressive:
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)  # Email addresses

    # Fix common encoding issues; the longer mojibake sequence is replaced
    # first so 'â€"' is not clobbered by the 'â€' replacement
    text = text.replace('’', "'")
    text = text.replace('“', '"')
    text = text.replace('â€"', '—')
    text = text.replace('â€', '"')

    # Remove excessive punctuation
    text = re.sub(r'[!?]{3,}', '!!!', text)
    text = re.sub(r'\.{4,}', '...', text)

    # Clean up remaining curly quotes and apostrophes
    text = re.sub(r'["“”]', '"', text)
    text = re.sub(r"['‘’]", "'", text)

    return text.strip()

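# Usage sketch (doctest-style; outputs follow from the substitutions above —
# whitespace collapses before URL removal, and runs of !/? shrink to three):
#   >>> clean_text("Hello    world!!!!!")
#   'Hello world!!!'
#   >>> clean_text("Visit http://example.com now", aggressive=True)
#   'Visit  now'
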
def extract_urls_from_text(text: str) -> List[str]:
    """Extract URLs from text content"""
    url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
    urls = re.findall(url_pattern, text)
    return list(set(urls))  # Remove duplicates

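# Usage sketch; note the set() deduplication does not preserve order:
#   >>> sorted(extract_urls_from_text("see https://a.com and https://b.org/x"))
#   ['https://a.com', 'https://b.org/x']
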
def validate_url(url: str) -> Tuple[bool, str]:
    """
    Validate URL format and basic security checks

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        if not url or not url.strip():
            return False, "Empty URL"

        url = url.strip()

        # Basic format check
        parsed = urlparse(url)

        if not parsed.scheme:
            return False, "Missing scheme (http:// or https://)"

        if parsed.scheme not in ['http', 'https']:
            return False, f"Invalid scheme: {parsed.scheme}"

        if not parsed.netloc:
            return False, "Invalid domain"

        # Check for suspicious patterns
        suspicious_patterns = [
            r'localhost',
            r'127\.0\.0\.1',
            r'192\.168\.',
            r'10\.',
            r'172\.(1[6-9]|2[0-9]|3[01])\.'
        ]

        for pattern in suspicious_patterns:
            if re.search(pattern, parsed.netloc, re.IGNORECASE):
                return False, "Access to internal networks not allowed"

        return True, "Valid URL"

    except Exception as e:
        return False, f"URL validation error: {str(e)}"

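# Usage sketch; the private-range patterns are a coarse guard only — they
# match the hostname string, not resolved IP addresses:
#   >>> validate_url("https://example.com/page")
#   (True, 'Valid URL')
#   >>> validate_url("http://192.168.1.10/admin")
#   (False, 'Access to internal networks not allowed')
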
def parse_urls_from_file(file_content: bytes, filename: str) -> List[str]:
    """
    Parse URLs from uploaded file content

    Args:
        file_content: File content as bytes
        filename: Original filename for format detection

    Returns:
        List of extracted URLs
    """
    try:
        # Decode content
        try:
            content = file_content.decode('utf-8')
        except UnicodeDecodeError:
            content = file_content.decode('latin-1')

        urls = []

        # Handle different file formats
        if filename.lower().endswith('.csv'):
            # Try to parse as CSV
            reader = csv.DictReader(io.StringIO(content))
            for row in reader:
                # Look for URL column (flexible naming)
                url_columns = ['url', 'URL', 'link', 'Link', 'href', 'address']
                for col in url_columns:
                    if col in row and row[col]:
                        urls.append(row[col].strip())
                        break
        else:
            # Treat as plain text (one URL per line)
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                if line and not line.startswith('#'):  # Skip comments
                    # Extract URLs from line
                    extracted = extract_urls_from_text(line)
                    if extracted:
                        urls.extend(extracted)
                    elif validate_url(line)[0]:  # Check if line itself is a URL
                        urls.append(line)

        # Remove duplicates while preserving order
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)

        return unique_urls

    except Exception as e:
        logger.error(f"Error parsing URLs from file: {e}")
        return []

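# Usage sketch with hypothetical file contents:
#   >>> parse_urls_from_file(b"url\nhttps://example.com\n", "links.csv")
#   ['https://example.com']
#   >>> parse_urls_from_file(b"# seeds\nhttps://a.com\n", "seeds.txt")
#   ['https://a.com']
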
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using word-level Jaccard overlap

    Returns:
        Similarity score between 0 and 1
    """
    if not text1 or not text2:
        return 0.0

    # Normalize for comparison
    text1 = text1.lower().strip()
    text2 = text2.lower().strip()

    if text1 == text2:
        return 1.0

    # Jaccard similarity on words
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    return intersection / union if union > 0 else 0.0

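# Usage sketch: Jaccard = |intersection| / |union| over the word sets.
#   >>> calculate_text_similarity("the quick brown fox", "the quick red fox")
#   0.6
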
def detect_content_type(text: str) -> str:
    """
    Detect the type of content based on text analysis

    Returns:
        Content type string
    """
    if not text:
        return "empty"

    text_lower = text.lower()

    # Check for common patterns
    if any(word in text_lower for word in ['abstract:', 'introduction:', 'conclusion:', 'references:']):
        return "academic"
    elif any(word in text_lower for word in ['news', 'reported', 'according to', 'sources say']):
        return "news"
    elif any(word in text_lower for word in ['review', 'rating', 'stars', 'recommend']):
        return "review"
    elif any(word in text_lower for word in ['blog', 'posted by', 'share this']):
        return "blog"
    elif re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text):
        return "dated_content"
    else:
        return "general"

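# Usage sketch; checks run top-down, so the first matching category wins:
#   >>> detect_content_type("According to officials, the event was reported.")
#   'news'
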
def extract_metadata_from_text(text: str) -> Dict[str, Any]:
    """
    Extract metadata from text content

    Returns:
        Dictionary of extracted metadata
    """
    metadata = {}

    # Extract dates
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'
    ]

    dates = []
    for pattern in date_patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))

    if dates:
        metadata['extracted_dates'] = dates[:5]  # Limit to first 5

    # Extract numbers and statistics
    numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', text)
    if numbers:
        metadata['numbers'] = numbers[:10]  # Limit to first 10

    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    if emails:
        metadata['emails'] = emails[:5]

    # Extract phone numbers (basic pattern)
    phones = re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text)
    if phones:
        metadata['phones'] = phones[:5]

    # Extract capitalized words (potential names/entities)
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', text)
    if capitalized:
        # Filter common words
        common_words = {'The', 'This', 'That', 'There', 'Then', 'They', 'These', 'Those'}
        filtered = [word for word in capitalized if word not in common_words]
        metadata['capitalized_terms'] = list(set(filtered))[:20]

    return metadata

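# Usage sketch (illustrative input):
#   >>> meta = extract_metadata_from_text("Email john@example.com by 12/31/2024.")
#   >>> meta['emails'], meta['extracted_dates']
#   (['john@example.com'], ['12/31/2024'])
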
def generate_content_hash(text: str) -> str:
    """Generate a hash for content deduplication"""
    # Normalize text for hashing
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    # MD5 is acceptable here: the digest is used for deduplication, not security
    return hashlib.md5(normalized.encode('utf-8')).hexdigest()

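# Usage sketch: hashes match after case/whitespace normalization.
#   >>> generate_content_hash("Hello   World") == generate_content_hash("hello world")
#   True
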
def format_file_size(size_bytes: int) -> str:
    """Format file size in human readable format"""
    if size_bytes == 0:
        return "0 B"

    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1

    return f"{size_bytes:.1f} {size_names[i]}"

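# Usage sketch:
#   >>> format_file_size(1536)
#   '1.5 KB'
#   >>> format_file_size(10_485_760)
#   '10.0 MB'
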
def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))

def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to maximum length with suffix"""
    if len(text) <= max_length:
        return text

    return text[:max_length - len(suffix)] + suffix

def create_filename_safe_string(text: str, max_length: int = 50) -> str:
    """Create a filesystem-safe string from text"""
    # Remove/replace problematic characters
    safe_text = re.sub(r'[<>:"/\\|?*]', '_', text)
    safe_text = re.sub(r'\s+', '_', safe_text)
    safe_text = safe_text.strip('._')

    # Truncate if too long
    if len(safe_text) > max_length:
        safe_text = safe_text[:max_length].rstrip('_')

    return safe_text or "untitled"

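# Usage sketch: the colon and each space become underscores.
#   >>> create_filename_safe_string("My Report: 2024")
#   'My_Report__2024'
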
def validate_dataset_format(data: List[Dict[str, Any]], required_fields: List[str]) -> Tuple[bool, List[str]]:
    """
    Validate dataset format against required fields

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    if not data:
        errors.append("Dataset is empty")
        return False, errors

    # Check each item
    for i, item in enumerate(data[:10]):  # Check first 10 items
        if not isinstance(item, dict):
            errors.append(f"Item {i} is not a dictionary")
            continue

        # Check required fields
        for field in required_fields:
            if field not in item:
                errors.append(f"Item {i} missing required field: {field}")
            elif not item[field]:  # Check for empty values
                errors.append(f"Item {i} has empty value for field: {field}")

    return len(errors) == 0, errors

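# Usage sketch; only the first 10 items are inspected, so a clean result does
# not guarantee the whole dataset is valid:
#   >>> validate_dataset_format([{'text': 'hi', 'label': 'pos'}], ['text', 'label'])
#   (True, [])
#   >>> validate_dataset_format([{'text': 'hi'}], ['text', 'label'])
#   (False, ['Item 0 missing required field: label'])
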
def create_progress_message(current: int, total: int, operation: str = "Processing") -> str:
    """Create a formatted progress message"""
    percentage = (current / total * 100) if total > 0 else 0
    return f"{operation} {current}/{total} ({percentage:.1f}%)"

def sanitize_text_for_json(text: str) -> str:
    """Sanitize text for safe JSON serialization"""
    if not text:
        return ""

    # Replace problematic characters
    text = text.replace('\x00', '')  # Remove null bytes
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)  # Replace control characters with spaces

    return text

def extract_domain_from_url(url: str) -> str:
    """Extract domain from URL"""
    try:
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        return "unknown"

def analyze_text_quality(text: str) -> Dict[str, Any]:
    """
    Analyze text quality and return metrics

    Returns:
        Dictionary with quality metrics
    """
    if not text:
        return {'score': 0.0, 'issues': ['Empty text']}

    issues = []
    score = 1.0

    # Length checks
    word_count = len(text.split())
    if word_count < 10:
        issues.append('Too short (< 10 words)')
        score -= 0.3
    elif word_count < 50:
        score -= 0.1

    # Character checks
    if len(text) < 100:
        issues.append('Very short content')
        score -= 0.2

    # Language quality checks
    uppercase_ratio = sum(1 for c in text if c.isupper()) / len(text)
    if uppercase_ratio > 0.3:
        issues.append('Excessive uppercase')
        score -= 0.2

    # Punctuation checks
    sentence_endings = text.count('.') + text.count('!') + text.count('?')
    if word_count > 50 and sentence_endings < 2:
        issues.append('Few sentence endings')
        score -= 0.1

    # Excessive repetition check
    words = text.lower().split()
    if len(words) > 10:
        unique_words = set(words)
        if len(unique_words) / len(words) < 0.5:
            issues.append('High word repetition')
            score -= 0.2

    # Special character checks
    special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
    if special_char_ratio > 0.1:
        issues.append('Many special characters')
        score -= 0.1

    return {
        'score': max(0.0, score),
        'word_count': word_count,
        'char_count': len(text),
        'uppercase_ratio': uppercase_ratio,
        'special_char_ratio': special_char_ratio,
        'issues': issues
    }

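# Usage sketch: the score starts at 1.0 and is decremented per issue
# (rounded here because of floating-point subtraction):
#   >>> report = analyze_text_quality("Too short.")
#   >>> round(report['score'], 2), report['issues']
#   (0.5, ['Too short (< 10 words)', 'Very short content'])
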
# Dataset template utilities
def create_classification_example(text: str, label: str, confidence: float = 1.0) -> Dict[str, Any]:
    """Create a text classification example"""
    return {
        'text': text,
        'label': label,
        'confidence': confidence
    }

def create_ner_example(text: str, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create a named entity recognition example"""
    return {
        'text': text,
        'entities': entities
    }

def create_qa_example(context: str, question: str, answer: str, answer_start: Optional[int] = None) -> Dict[str, Any]:
    """Create a question answering example"""
    example = {
        'context': context,
        'question': question,
        'answer': answer
    }

    if answer_start is not None:
        example['answer_start'] = answer_start

    return example

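# Usage sketch; answer_start is the character offset of the answer within the
# context, as in SQuAD-style datasets:
#   >>> create_qa_example("Paris is the capital.", "What is the capital?", "Paris", answer_start=0)
#   {'context': 'Paris is the capital.', 'question': 'What is the capital?', 'answer': 'Paris', 'answer_start': 0}
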
def create_summarization_example(text: str, summary: str) -> Dict[str, Any]:
    """Create a text summarization example"""
    return {
        'text': text,
        'summary': summary
    }
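
Taken together, the deleted helpers formed a small ingestion pipeline: validate a URL, clean the fetched text, score its quality, deduplicate by normalized hash, and wrap the result in a dataset template. A minimal sketch of that flow, assuming the module above is importable as utils (the sample data and the 'unlabeled' label are hypothetical; this code was not part of the deleted file):

    from utils import (clean_text, validate_url, analyze_text_quality,
                       generate_content_hash, create_classification_example)

    # Hypothetical (url, raw_text) pairs from a scraper
    scraped_items = [("https://example.com/a", "Some scraped article text ...")]

    seen, examples = set(), []
    for url, raw_text in scraped_items:
        if not validate_url(url)[0]:
            continue  # skip malformed or internal-network URLs
        text = clean_text(raw_text, aggressive=True)
        if analyze_text_quality(text)['score'] < 0.5:
            continue  # drop low-quality content
        digest = generate_content_hash(text)
        if digest in seen:
            continue  # drop duplicates with identical normalized text
        seen.add(digest)
        examples.append(create_classification_example(text, label='unlabeled'))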