MagicMeWizard committed on
Commit 6d85bb5 · verified · 1 Parent(s): f9f65ef

Update app.py

Files changed (1)
  1. app.py +383 -732

app.py CHANGED
@@ -1,16 +1,6 @@
  """
- AI Dataset Studio - Modern Web Scraping & Dataset Creation Platform
- A mini Scale AI for non-coders and vibe coders
-
- Features:
- - Intelligent web scraping with content extraction
- - Automated data cleaning and preprocessing
- - Interactive annotation tools
- - Template-based workflows for common ML tasks
- - High-quality dataset generation
- - Export to HuggingFace Hub and popular ML formats
- - Visual data quality metrics
- - No-code dataset creation workflows
  """

  import gradio as gr
@@ -31,12 +21,10 @@ import hashlib
  import time
  from collections import defaultdict
  import io
- import zipfile

  # Optional imports with fallbacks
  try:
  from transformers import pipeline, AutoTokenizer, AutoModel
- from sentence_transformers import SentenceTransformer
  HAS_TRANSFORMERS = True
  except ImportError:
  HAS_TRANSFORMERS = False
@@ -44,7 +32,6 @@ except ImportError:
  try:
  import nltk
  from nltk.tokenize import sent_tokenize, word_tokenize
- from nltk.corpus import stopwords
  HAS_NLTK = True
  except ImportError:
  HAS_NLTK = False
@@ -94,53 +81,65 @@ class DatasetTemplate:
  """Template for dataset creation"""
  name: str
  description: str
- task_type: str # classification, ner, qa, summarization, etc.
  required_fields: List[str]
  optional_fields: List[str]
  example_format: Dict[str, Any]
  instructions: str

  class WebScraperEngine:
- """Advanced web scraping engine with smart content extraction"""

  def __init__(self):
  self.session = requests.Session()
  self.session.headers.update({
- 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0; Research)',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.5',
- 'Accept-Encoding': 'gzip, deflate',
  'Connection': 'keep-alive',
  })
-
- # Initialize AI models if available
- self.content_classifier = None
- self.quality_scorer = None
- self._load_models()
-
- def _load_models(self):
- """Load AI models for content analysis"""
- if not HAS_TRANSFORMERS:
- logger.warning("⚠️ Transformers not available, using rule-based methods")
- return
-
- try:
- # Content quality assessment
- self.quality_scorer = pipeline(
- "text-classification",
- model="martin-ha/toxic-comment-model",
- return_all_scores=True
- )
- logger.info("✅ Quality assessment model loaded")
- except Exception as e:
- logger.warning(f"⚠️ Could not load quality model: {e}")

  def scrape_url(self, url: str) -> Optional[ScrapedItem]:
- """Scrape a single URL and return structured data"""
  try:
  # Validate URL
- if not self._is_valid_url(url):
- raise ValueError("Invalid URL provided")

  # Fetch content
  response = self.session.get(url, timeout=15)
@@ -149,12 +148,12 @@ class WebScraperEngine:
149
  # Parse HTML
150
  soup = BeautifulSoup(response.content, 'html.parser')
151
 
152
- # Extract structured data
153
  title = self._extract_title(soup)
154
  content = self._extract_content(soup)
155
  metadata = self._extract_metadata(soup, response)
156
 
157
- # Create scraped item
158
  item = ScrapedItem(
159
  id=str(uuid.uuid4()),
160
  url=url,
@@ -173,7 +172,7 @@ class WebScraperEngine:
173
  return None
174
 
175
  def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
176
- """Scrape multiple URLs with progress tracking"""
177
  results = []
178
  total = len(urls)
179
 
@@ -185,54 +184,32 @@ class WebScraperEngine:
185
  if item:
186
  results.append(item)
187
 
188
- # Rate limiting
189
- time.sleep(1)
190
 
191
  return results
192
 
193
- def _is_valid_url(self, url: str) -> bool:
194
- """Validate URL format and safety"""
195
- try:
196
- parsed = urlparse(url)
197
- return parsed.scheme in ['http', 'https'] and parsed.netloc
198
- except:
199
- return False
200
-
201
  def _extract_title(self, soup: BeautifulSoup) -> str:
202
  """Extract page title"""
203
- # Try multiple selectors
204
- selectors = [
205
- 'meta[property="og:title"]',
206
- 'meta[name="twitter:title"]',
207
- 'title',
208
- 'h1'
209
- ]
210
 
211
- for selector in selectors:
212
- element = soup.select_one(selector)
213
- if element:
214
- if element.name == 'meta':
215
- return element.get('content', '').strip()
216
- else:
217
- return element.get_text().strip()
218
 
219
  return "Untitled"
220
 
221
  def _extract_content(self, soup: BeautifulSoup) -> str:
222
- """Extract main content using multiple strategies"""
223
  # Remove unwanted elements
224
  for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
225
  element.decompose()
226
 
227
- # Try content-specific selectors
228
  content_selectors = [
229
- 'article',
230
- 'main',
231
- '.content',
232
- '.post-content',
233
- '.entry-content',
234
- '.article-body',
235
- '[role="main"]'
236
  ]
237
 
238
  for selector in content_selectors:
@@ -250,18 +227,16 @@ class WebScraperEngine:
250
  return self._clean_text(soup.get_text(separator=' ', strip=True))
251
 
252
  def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
253
- """Extract metadata from page"""
254
  metadata = {
255
  'domain': urlparse(response.url).netloc,
256
  'status_code': response.status_code,
257
- 'content_type': response.headers.get('content-type', ''),
258
  'extracted_at': datetime.now().isoformat()
259
  }
260
 
261
  # Extract meta tags
262
- meta_tags = ['description', 'keywords', 'author', 'published_time']
263
- for tag in meta_tags:
264
- element = soup.find('meta', attrs={'name': tag}) or soup.find('meta', attrs={'property': f'article:{tag}'})
265
  if element:
266
  metadata[tag] = element.get('content', '')
267
 
@@ -269,155 +244,97 @@ class WebScraperEngine:
269
 
270
  def _clean_text(self, text: str) -> str:
271
  """Clean extracted text"""
272
- # Remove extra whitespace
273
  text = re.sub(r'\s+', ' ', text)
274
-
275
- # Remove common patterns
276
- patterns = [
277
- r'Subscribe.*?newsletter',
278
- r'Click here.*?more',
279
- r'Advertisement',
280
- r'Share this.*?social',
281
- r'Follow us on.*?media'
282
- ]
283
-
284
- for pattern in patterns:
285
- text = re.sub(pattern, '', text, flags=re.IGNORECASE)
286
-
287
  return text.strip()
288
 
289
  def _assess_quality(self, content: str) -> float:
290
- """Assess content quality (0-1 score)"""
291
  if not content:
292
  return 0.0
293
 
294
  score = 0.0
295
-
296
- # Length check
297
  word_count = len(content.split())
 
298
  if word_count >= 50:
299
- score += 0.3
300
  elif word_count >= 20:
301
- score += 0.1
302
 
303
- # Structure check (sentences)
304
  sentence_count = len(re.split(r'[.!?]+', content))
305
  if sentence_count >= 3:
306
- score += 0.2
307
-
308
- # Language quality (basic)
309
- if re.search(r'[A-Z][a-z]+', content): # Proper capitalization
310
- score += 0.2
311
-
312
- if not re.search(r'[^\w\s]', content[:100]): # No weird characters at start
313
- score += 0.1
314
 
315
- # Readability (simple check)
316
- avg_word_length = np.mean([len(word) for word in content.split()])
317
- if 3 <= avg_word_length <= 8:
318
- score += 0.2
319
 
320
  return min(score, 1.0)
321
 
322
  class DataProcessor:
323
- """Advanced data processing and cleaning pipeline"""
324
 
325
  def __init__(self):
326
- self.language_detector = None
327
  self.sentiment_analyzer = None
328
  self.ner_model = None
329
  self._load_models()
330
 
331
  def _load_models(self):
332
- """Load NLP models for processing"""
333
  if not HAS_TRANSFORMERS:
 
334
  return
335
 
336
  try:
337
- # Sentiment analysis
338
  self.sentiment_analyzer = pipeline(
339
  "sentiment-analysis",
340
  model="cardiffnlp/twitter-roberta-base-sentiment-latest"
341
  )
342
-
343
- # Named Entity Recognition
344
- self.ner_model = pipeline(
345
- "ner",
346
- model="dbmdz/bert-large-cased-finetuned-conll03-english",
347
- aggregation_strategy="simple"
348
- )
349
-
350
- logger.info("✅ NLP models loaded successfully")
351
  except Exception as e:
352
- logger.warning(f"⚠️ Could not load NLP models: {e}")
353
 
354
- def process_items(self, items: List[ScrapedItem], processing_options: Dict[str, bool]) -> List[ScrapedItem]:
355
- """Process scraped items with various enhancement options"""
356
- processed_items = []
357
 
358
  for item in items:
359
- processed_item = self._process_single_item(item, processing_options)
360
- if processed_item:
361
- processed_items.append(processed_item)
362
 
363
- return processed_items
364
-
365
- def _process_single_item(self, item: ScrapedItem, options: Dict[str, bool]) -> Optional[ScrapedItem]:
366
- """Process a single item"""
367
- try:
368
- # Clean content
369
- if options.get('clean_text', True):
370
- item.content = self._clean_text_advanced(item.content)
371
-
372
- # Filter by quality
373
- if options.get('quality_filter', True) and item.quality_score < 0.3:
374
- return None
375
-
376
- # Add sentiment analysis
377
- if options.get('add_sentiment', False) and self.sentiment_analyzer:
378
- sentiment = self._analyze_sentiment(item.content)
379
- item.metadata['sentiment'] = sentiment
380
-
381
- # Add named entities
382
- if options.get('extract_entities', False) and self.ner_model:
383
- entities = self._extract_entities(item.content)
384
- item.metadata['entities'] = entities
385
-
386
- # Add language detection
387
- if options.get('detect_language', True):
388
- item.language = self._detect_language(item.content)
389
-
390
- return item
391
-
392
- except Exception as e:
393
- logger.error(f"Error processing item {item.id}: {e}")
394
- return None
395
 
396
  def _clean_text_advanced(self, text: str) -> str:
397
  """Advanced text cleaning"""
398
- # Remove URLs
399
  text = re.sub(r'http\S+|www\.\S+', '', text)
400
-
401
- # Remove email addresses
402
  text = re.sub(r'\S+@\S+', '', text)
403
-
404
- # Remove excessive punctuation
405
- text = re.sub(r'[!?]{2,}', '!', text)
406
- text = re.sub(r'\.{3,}', '...', text)
407
-
408
- # Normalize whitespace
409
  text = re.sub(r'\s+', ' ', text)
410
-
411
- # Remove very short paragraphs (likely navigation)
412
- paragraphs = text.split('\n')
413
- paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 20]
414
-
415
- return '\n'.join(paragraphs).strip()
416
 
417
  def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
418
- """Analyze sentiment of text"""
419
  try:
420
- # Truncate text for model limits
421
  text_sample = text[:512]
422
  result = self.sentiment_analyzer(text_sample)[0]
423
  return {
@@ -427,80 +344,56 @@ class DataProcessor:
427
  except:
428
  return {'label': 'UNKNOWN', 'score': 0.0}
429
 
430
- def _extract_entities(self, text: str) -> List[Dict[str, Any]]:
431
- """Extract named entities"""
432
- try:
433
- # Truncate text for model limits
434
- text_sample = text[:512]
435
- entities = self.ner_model(text_sample)
436
- return [
437
- {
438
- 'text': ent['word'],
439
- 'label': ent['entity_group'],
440
- 'confidence': ent['score']
441
- }
442
- for ent in entities
443
- ]
444
- except:
445
- return []
446
-
447
  def _detect_language(self, text: str) -> str:
448
  """Simple language detection"""
449
- # Basic heuristic - could be enhanced with proper language detection
450
  if re.search(r'[а-яё]', text.lower()):
451
  return 'ru'
452
  elif re.search(r'[ñáéíóúü]', text.lower()):
453
  return 'es'
454
- elif re.search(r'[àâäçéèêëïîôöùûüÿ]', text.lower()):
455
- return 'fr'
456
- else:
457
- return 'en'
458
 
459
  class AnnotationEngine:
460
- """Interactive annotation tools for dataset creation"""
461
 
462
  def __init__(self):
463
  self.templates = self._load_templates()
464
 
465
  def _load_templates(self) -> Dict[str, DatasetTemplate]:
466
- """Load predefined dataset templates"""
467
  templates = {
468
  'text_classification': DatasetTemplate(
469
  name="Text Classification",
470
- description="Classify text into predefined categories",
471
  task_type="classification",
472
  required_fields=["text", "label"],
473
  optional_fields=["confidence", "metadata"],
474
  example_format={"text": "Sample text", "label": "positive"},
475
- instructions="Label each text with the appropriate category"
476
  ),
477
  'sentiment_analysis': DatasetTemplate(
478
  name="Sentiment Analysis",
479
- description="Analyze emotional tone of text",
480
  task_type="classification",
481
  required_fields=["text", "sentiment"],
482
  optional_fields=["confidence", "aspects"],
483
  example_format={"text": "I love this!", "sentiment": "positive"},
484
- instructions="Classify the sentiment as positive, negative, or neutral"
485
  ),
486
  'named_entity_recognition': DatasetTemplate(
487
  name="Named Entity Recognition",
488
- description="Identify and classify named entities in text",
489
  task_type="ner",
490
  required_fields=["text", "entities"],
491
  optional_fields=["metadata"],
492
  example_format={
493
- "text": "John works at OpenAI in San Francisco",
494
- "entities": [
495
- {"text": "John", "label": "PERSON", "start": 0, "end": 4},
496
- {"text": "OpenAI", "label": "ORG", "start": 14, "end": 20}
497
- ]
498
  },
499
- instructions="Mark all named entities (people, organizations, locations, etc.)"
500
  ),
501
  'question_answering': DatasetTemplate(
502
  name="Question Answering",
503
- description="Create question-answer pairs from text",
504
  task_type="qa",
505
  required_fields=["context", "question", "answer"],
506
  optional_fields=["answer_start", "metadata"],
@@ -509,77 +402,45 @@ class AnnotationEngine:
509
  "question": "What is the capital of France?",
510
  "answer": "Paris"
511
  },
512
- instructions="Create meaningful questions and provide accurate answers"
513
  ),
514
  'summarization': DatasetTemplate(
515
  name="Text Summarization",
516
- description="Create concise summaries of longer texts",
517
  task_type="summarization",
518
  required_fields=["text", "summary"],
519
  optional_fields=["summary_type", "length"],
520
  example_format={
521
  "text": "Long article text...",
522
- "summary": "Brief summary of the main points"
523
  },
524
- instructions="Write clear, concise summaries capturing key information"
525
  )
526
  }
527
  return templates
528
-
529
- def create_annotation_interface(self, template_name: str, items: List[ScrapedItem]) -> Dict[str, Any]:
530
- """Create annotation interface for specific template"""
531
- template = self.templates.get(template_name)
532
- if not template:
533
- raise ValueError(f"Unknown template: {template_name}")
534
-
535
- # Prepare data for annotation
536
- annotation_data = []
537
- for item in items:
538
- annotation_data.append({
539
- 'id': item.id,
540
- 'text': item.content[:1000], # Truncate for UI
541
- 'title': item.title,
542
- 'url': item.url,
543
- 'annotations': {}
544
- })
545
-
546
- return {
547
- 'template': template,
548
- 'data': annotation_data,
549
- 'progress': 0,
550
- 'completed': 0
551
- }
552
 
553
  class DatasetExporter:
554
- """Export datasets in various formats for ML frameworks"""
555
 
556
  def __init__(self):
557
  self.supported_formats = [
558
- 'huggingface_datasets',
559
- 'json',
560
- 'csv',
561
- 'parquet',
562
- 'jsonl',
563
- 'pytorch',
564
- 'tensorflow'
565
  ]
566
 
567
  def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
568
  export_format: str, annotations: Dict[str, Any] = None) -> str:
569
- """Export annotated dataset in specified format"""
570
  try:
571
- # Prepare dataset
572
- dataset_data = self._prepare_dataset_data(items, template, annotations)
573
 
574
- # Export based on format
575
- if export_format == 'huggingface_datasets':
576
- return self._export_huggingface(dataset_data, template)
577
- elif export_format == 'json':
578
  return self._export_json(dataset_data)
579
  elif export_format == 'csv':
580
  return self._export_csv(dataset_data)
581
  elif export_format == 'jsonl':
582
  return self._export_jsonl(dataset_data)
 
 
583
  else:
584
  raise ValueError(f"Unsupported format: {export_format}")
585
 
@@ -587,13 +448,12 @@ class DatasetExporter:
587
  logger.error(f"Export failed: {e}")
588
  raise
589
 
590
- def _prepare_dataset_data(self, items: List[ScrapedItem], template: DatasetTemplate,
591
- annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
592
- """Prepare data according to template format"""
593
  dataset_data = []
594
 
595
  for item in items:
596
- # Base data from scraped item
597
  data_point = {
598
  'text': item.content,
599
  'title': item.title,
@@ -601,312 +461,240 @@ class DatasetExporter:
601
  'metadata': item.metadata
602
  }
603
 
604
- # Add annotations if available
605
  if annotations and item.id in annotations:
606
- item_annotations = annotations[item.id]
607
- data_point.update(item_annotations)
608
 
609
- # Format according to template
610
- formatted_point = self._format_for_template(data_point, template)
611
- if formatted_point:
612
- dataset_data.append(formatted_point)
613
 
614
  return dataset_data
615
 
616
  def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]:
617
- """Format data point according to template requirements"""
618
  formatted = {}
619
 
620
- # Ensure required fields are present
621
  for field in template.required_fields:
622
  if field in data_point:
623
  formatted[field] = data_point[field]
624
  elif field == 'text' and 'content' in data_point:
625
  formatted[field] = data_point['content']
626
  else:
627
- # Skip this data point if required field is missing
628
  return None
629
 
630
- # Add optional fields if present
631
  for field in template.optional_fields:
632
  if field in data_point:
633
  formatted[field] = data_point[field]
634
 
635
  return formatted
636
 
637
- def _export_huggingface(self, dataset_data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
638
- """Export as HuggingFace Dataset"""
639
- if not HAS_DATASETS:
640
- raise ImportError("datasets library not available")
641
-
642
- try:
643
- # Create dataset
644
- dataset = Dataset.from_list(dataset_data)
645
-
646
- # Create dataset card
647
- card_content = f"""
648
- # {template.name} Dataset
649
-
650
- ## Description
651
- {template.description}
652
-
653
- ## Task Type
654
- {template.task_type}
655
-
656
- ## Format
657
- {template.example_format}
658
-
659
- ## Instructions
660
- {template.instructions}
661
-
662
- ## Statistics
663
- - Total samples: {len(dataset_data)}
664
- - Created: {datetime.now().isoformat()}
665
-
666
- ## Usage
667
- ```python
668
- from datasets import load_dataset
669
- dataset = load_dataset('path/to/dataset')
670
- ```
671
- """
672
-
673
- # Save dataset
674
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
675
- dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
676
-
677
- # Save locally (would push to Hub in production)
678
- dataset.save_to_disk(dataset_name)
679
-
680
- # Create info file
681
- with open(f"{dataset_name}/README.md", "w") as f:
682
- f.write(card_content)
683
-
684
- return dataset_name
685
-
686
- except Exception as e:
687
- logger.error(f"HuggingFace export failed: {e}")
688
- raise
689
-
690
- def _export_json(self, dataset_data: List[Dict[str, Any]]) -> str:
691
- """Export as JSON file"""
692
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
693
  filename = f"dataset_{timestamp}.json"
694
 
695
  with open(filename, 'w', encoding='utf-8') as f:
696
- json.dump(dataset_data, f, indent=2, ensure_ascii=False)
697
 
698
  return filename
699
 
700
- def _export_csv(self, dataset_data: List[Dict[str, Any]]) -> str:
701
- """Export as CSV file"""
702
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
703
  filename = f"dataset_{timestamp}.csv"
704
 
705
- df = pd.DataFrame(dataset_data)
706
  df.to_csv(filename, index=False)
707
 
708
  return filename
709
 
710
- def _export_jsonl(self, dataset_data: List[Dict[str, Any]]) -> str:
711
- """Export as JSONL file"""
712
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
713
  filename = f"dataset_{timestamp}.jsonl"
714
 
715
  with open(filename, 'w', encoding='utf-8') as f:
716
- for item in dataset_data:
717
  f.write(json.dumps(item, ensure_ascii=False) + '\n')
718
 
719
  return filename
720
-
721
- def create_modern_interface():
722
- """Create modern, intuitive interface for AI Dataset Studio"""
723
-
724
- # Initialize the studio
725
- studio = DatasetStudio()
726
-
727
- # Custom CSS for modern appearance
728
- custom_css = """
729
- .gradio-container {
730
- max-width: 1400px;
731
- margin: auto;
732
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
733
- }
734
-
735
- .studio-header {
736
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
737
- color: white;
738
- padding: 2rem;
739
- border-radius: 15px;
740
- margin-bottom: 2rem;
741
- text-align: center;
742
- box-shadow: 0 8px 32px rgba(0,0,0,0.1);
743
- }
744
 
745
- .workflow-card {
746
- background: #f8f9ff;
747
- border: 2px solid #e1e5ff;
748
- border-radius: 12px;
749
- padding: 1.5rem;
750
- margin: 1rem 0;
751
- transition: all 0.3s ease;
752
- }
753
-
754
- .workflow-card:hover {
755
- border-color: #667eea;
756
- box-shadow: 0 4px 20px rgba(102, 126, 234, 0.1);
757
- }
758
-
759
- .step-header {
760
- display: flex;
761
- align-items: center;
762
- margin-bottom: 1rem;
763
- font-size: 1.2em;
764
- font-weight: 600;
765
- color: #4c51bf;
766
- }
767
-
768
- .step-number {
769
- background: #667eea;
770
- color: white;
771
- border-radius: 50%;
772
- width: 30px;
773
- height: 30px;
774
- display: flex;
775
- align-items: center;
776
- justify-content: center;
777
- margin-right: 1rem;
778
- font-weight: bold;
779
- }
780
-
781
- .feature-grid {
782
- display: grid;
783
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
784
- gap: 1rem;
785
- margin: 1rem 0;
786
- }
787
 
788
- .feature-item {
789
- background: white;
790
- border: 1px solid #e2e8f0;
791
- border-radius: 8px;
792
- padding: 1rem;
793
- text-align: center;
794
- }
795
 
796
- .stat-card {
797
- background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
798
- color: white;
799
- padding: 1rem;
800
- border-radius: 10px;
801
- text-align: center;
802
- margin: 0.5rem;
803
- }
804
 
805
- .progress-bar {
806
- background: #e2e8f0;
807
- border-radius: 10px;
808
- height: 8px;
809
- overflow: hidden;
810
- }
811
 
812
- .progress-fill {
813
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
814
- height: 100%;
815
- transition: width 0.3s ease;
816
- }
817
 
818
- .template-card {
819
- border: 2px solid #e2e8f0;
820
- border-radius: 10px;
821
- padding: 1rem;
822
- margin: 0.5rem;
823
- cursor: pointer;
824
- transition: all 0.3s ease;
825
- }
826
 
827
- .template-card:hover {
828
- border-color: #667eea;
829
- transform: translateY(-2px);
830
- box-shadow: 0 4px 12px rgba(0,0,0,0.1);
831
- }
832
 
833
- .template-selected {
834
- border-color: #667eea;
835
- background: #f7fafc;
836
- }
837
 
838
- .export-option {
839
- background: #f7fafc;
840
- border: 1px solid #e2e8f0;
841
- border-radius: 8px;
842
- padding: 1rem;
843
- margin: 0.5rem 0;
844
- cursor: pointer;
845
- }
846
 
847
- .export-option:hover {
848
- background: #edf2f7;
849
- border-color: #cbd5e0;
850
  }
851
-
852
- .success-message {
853
- background: #f0fff4;
854
- border: 1px solid #9ae6b4;
855
- color: #276749;
856
- padding: 1rem;
857
- border-radius: 8px;
858
- margin: 1rem 0;
859
  }
860
-
861
- .error-message {
862
- background: #fed7d7;
863
- border: 1px solid #feb2b2;
864
- color: #c53030;
865
- padding: 1rem;
866
- border-radius: 8px;
867
- margin: 1rem 0;
868
  }
869
  """
870
 
871
- # Project state for UI
872
  project_state = gr.State({})
873
 
874
- with gr.Blocks(css=custom_css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
875
 
876
  # Header
877
  gr.HTML("""
878
  <div class="studio-header">
879
  <h1>🚀 AI Dataset Studio</h1>
880
- <p>Create high-quality training datasets without coding - Your personal Scale AI</p>
881
- <p style="opacity: 0.9; font-size: 0.9em;">Web Scraping → Data Processing → Annotation → ML-Ready Datasets</p>
882
  </div>
883
  """)
884
 
885
- # Main workflow tabs
886
  with gr.Tabs() as main_tabs:
887
 
888
- # Tab 1: Project Setup
889
- with gr.Tab("🎯 Project Setup", id="setup"):
890
- gr.HTML('<div class="step-header"><div class="step-number">1</div>Start Your Dataset Project</div>')
891
 
892
  with gr.Row():
893
  with gr.Column(scale=2):
894
- gr.HTML("""
895
- <div class="workflow-card">
896
- <h3>📋 Project Configuration</h3>
897
- <p>Define your dataset project and choose the type of AI task you're building for.</p>
898
- </div>
899
- """)
900
-
901
  project_name = gr.Textbox(
902
  label="Project Name",
903
- placeholder="e.g., 'News Sentiment Analysis' or 'Product Review Classification'",
904
- value="My Dataset Project"
905
  )
906
 
907
- # Template selection with visual cards
908
- gr.HTML("<h4>🎨 Choose Your Dataset Template</h4>")
909
-
910
  template_choice = gr.Radio(
911
  choices=[
912
  ("📊 Text Classification", "text_classification"),
@@ -916,192 +704,97 @@ def create_modern_interface():
916
  ("📝 Text Summarization", "summarization")
917
  ],
918
  label="Dataset Type",
919
- value="text_classification",
920
- interactive=True
921
- )
922
-
923
- create_project_btn = gr.Button(
924
- "🚀 Create Project",
925
- variant="primary",
926
- size="lg"
927
  )
928
 
 
929
  project_status = gr.Markdown("")
930
 
931
  with gr.Column(scale=1):
932
  gr.HTML("""
933
  <div class="workflow-card">
934
  <h3>💡 Template Guide</h3>
935
- <div class="feature-grid">
936
- <div class="feature-item">
937
- <h4>📊 Text Classification</h4>
938
- <p>Categorize text into predefined labels</p>
939
- <small>Great for: Spam detection, topic classification</small>
940
- </div>
941
- <div class="feature-item">
942
- <h4>😊 Sentiment Analysis</h4>
943
- <p>Analyze emotional tone and opinions</p>
944
- <small>Great for: Review analysis, social media monitoring</small>
945
- </div>
946
- <div class="feature-item">
947
- <h4>👥 Named Entity Recognition</h4>
948
- <p>Identify people, places, organizations</p>
949
- <small>Great for: Information extraction, content tagging</small>
950
- </div>
951
- </div>
952
  </div>
953
  """)
954
 
955
- # Tab 2: Data Collection
956
- with gr.Tab("🕷️ Data Collection", id="collection"):
957
- gr.HTML('<div class="step-header"><div class="step-number">2</div>Collect Your Data</div>')
958
 
959
  with gr.Row():
960
  with gr.Column(scale=2):
961
- gr.HTML("""
962
- <div class="workflow-card">
963
- <h3>🌐 Web Scraping</h3>
964
- <p>Provide URLs to scrape content automatically. Our AI will extract clean, structured text.</p>
965
- </div>
966
- """)
967
-
968
- # URL input methods
969
- with gr.Tabs():
970
- with gr.Tab("📝 Manual Input"):
971
- urls_input = gr.Textbox(
972
- label="URLs to Scrape",
973
- placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
974
- lines=8,
975
- info="Enter one URL per line"
976
- )
977
-
978
- with gr.Tab("📎 File Upload"):
979
- urls_file = gr.File(
980
- label="Upload URL List",
981
- file_types=[".txt", ".csv"],
982
- info="Upload a text file with URLs (one per line) or CSV with 'url' column"
983
- )
984
-
985
- scrape_btn = gr.Button("🚀 Start Scraping", variant="primary", size="lg")
986
 
987
- # Progress tracking
988
- scraping_progress = gr.Progress()
989
  scraping_status = gr.Markdown("")
990
 
991
  with gr.Column(scale=1):
992
- gr.HTML("""
993
- <div class="workflow-card">
994
- <h3>⚡ Features</h3>
995
- <ul style="list-style: none; padding: 0;">
996
- <li>✅ Smart content extraction</li>
997
- <li>✅ Quality scoring</li>
998
- <li>✅ Duplicate detection</li>
999
- <li>✅ Security validation</li>
1000
- <li>✅ Metadata extraction</li>
1001
- <li>✅ Rate limiting</li>
1002
- </ul>
1003
- </div>
1004
- """)
1005
-
1006
- # Quick stats
1007
  collection_stats = gr.HTML("")
1008
 
1009
- # Tab 3: Data Processing
1010
- with gr.Tab("⚙️ Data Processing", id="processing"):
1011
- gr.HTML('<div class="step-header"><div class="step-number">3</div>Clean & Enhance Your Data</div>')
1012
 
1013
  with gr.Row():
1014
  with gr.Column(scale=2):
1015
- gr.HTML("""
1016
- <div class="workflow-card">
1017
- <h3>🔧 Processing Options</h3>
1018
- <p>Configure how to clean and enhance your scraped data with AI-powered analysis.</p>
1019
- </div>
1020
- """)
1021
-
1022
- # Processing options
1023
  with gr.Row():
1024
  with gr.Column():
1025
- clean_text = gr.Checkbox(label="🧹 Advanced Text Cleaning", value=True)
1026
- quality_filter = gr.Checkbox(label="🎯 Quality Filtering", value=True)
1027
  detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)
1028
 
1029
  with gr.Column():
1030
  add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
1031
  extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
1032
- deduplicate = gr.Checkbox(label="🔄 Remove Duplicates", value=True)
1033
 
1034
- process_btn = gr.Button("⚙️ Process Data", variant="primary", size="lg")
1035
  processing_status = gr.Markdown("")
1036
 
1037
  with gr.Column(scale=1):
1038
- gr.HTML("""
1039
- <div class="workflow-card">
1040
- <h3>📊 Processing Stats</h3>
1041
- <div id="processing-stats"></div>
1042
- </div>
1043
- """)
1044
-
1045
  processing_stats = gr.HTML("")
1046
 
1047
- # Tab 4: Data Preview
1048
- with gr.Tab("👀 Data Preview", id="preview"):
1049
- gr.HTML('<div class="step-header"><div class="step-number">4</div>Review Your Dataset</div>')
1050
 
1051
  with gr.Row():
1052
  with gr.Column(scale=2):
1053
- gr.HTML("""
1054
- <div class="workflow-card">
1055
- <h3>📋 Dataset Preview</h3>
1056
- <p>Review your processed data before annotation or export.</p>
1057
- </div>
1058
- """)
1059
-
1060
- refresh_preview_btn = gr.Button("🔄 Refresh Preview", variant="secondary")
1061
 
1062
- # Data preview table
1063
  data_preview = gr.DataFrame(
1064
- headers=["Title", "Content Preview", "Word Count", "Quality Score", "URL"],
1065
- label="Dataset Preview",
1066
- interactive=False
1067
  )
1068
 
1069
  with gr.Column(scale=1):
1070
- gr.HTML("""
1071
- <div class="workflow-card">
1072
- <h3>📈 Dataset Statistics</h3>
1073
- </div>
1074
- """)
1075
-
1076
  dataset_stats = gr.JSON(label="Statistics")
1077
 
1078
- # Tab 5: Export
1079
- with gr.Tab("📤 Export Dataset", id="export"):
1080
- gr.HTML('<div class="step-header"><div class="step-number">5</div>Export Your Dataset</div>')
1081
 
1082
  with gr.Row():
1083
  with gr.Column(scale=2):
1084
- gr.HTML("""
1085
- <div class="workflow-card">
1086
- <h3>💾 Export Options</h3>
1087
- <p>Export your dataset in various formats for different ML frameworks and platforms.</p>
1088
- </div>
1089
- """)
1090
-
1091
- # Export format selection
1092
  export_format = gr.Radio(
1093
  choices=[
1094
- ("🤗 HuggingFace Datasets", "huggingface_datasets"),
1095
  ("📄 JSON", "json"),
1096
  ("📊 CSV", "csv"),
1097
  ("📋 JSONL", "jsonl"),
1098
- (" Parquet", "parquet")
1099
  ],
1100
  label="Export Format",
1101
  value="json"
1102
  )
1103
 
1104
- # Template for export
1105
  export_template = gr.Dropdown(
1106
  choices=[
1107
  "text_classification",
@@ -1110,162 +803,126 @@ def create_modern_interface():
1110
  "question_answering",
1111
  "summarization"
1112
  ],
1113
- label="Dataset Template",
1114
  value="text_classification"
1115
  )
1116
 
1117
- export_btn = gr.Button("📤 Export Dataset", variant="primary", size="lg")
1118
-
1119
- # Export results
1120
  export_status = gr.Markdown("")
1121
- export_file = gr.File(label="Download Dataset", visible=False)
1122
 
1123
  with gr.Column(scale=1):
1124
  gr.HTML("""
1125
  <div class="workflow-card">
1126
- <h3>📋 Export Formats</h3>
1127
- <div class="feature-item">
1128
- <h4>🤗 HuggingFace</h4>
1129
- <p>Ready for transformers library</p>
1130
- </div>
1131
- <div class="feature-item">
1132
- <h4>📄 JSON/JSONL</h4>
1133
- <p>Universal format for any framework</p>
1134
- </div>
1135
- <div class="feature-item">
1136
- <h4>📊 CSV</h4>
1137
- <p>Easy analysis in Excel/Pandas</p>
1138
- </div>
1139
  </div>
1140
  """)
1141
 
1142
  # Event handlers
1143
  def create_project(name, template):
1144
- """Create new project"""
1145
  if not name.strip():
1146
  return "❌ Please enter a project name", {}
1147
 
1148
  project = studio.start_new_project(name.strip(), template)
1149
  status = f"""
1150
- ✅ **Project Created Successfully!**
1151
 
1152
- **Project:** {project['name']}
1153
  **Type:** {template.replace('_', ' ').title()}
1154
- **ID:** {project['id'][:8]}...
1155
- **Created:** {project['created_at'][:19]}
1156
 
1157
- 👉 **Next Step:** Go to the Data Collection tab to start scraping URLs
1158
  """
1159
  return status, project
1160
 
1161
- def scrape_urls_handler(urls_text, urls_file, project, progress=gr.Progress()):
1162
- """Handle URL scraping"""
1163
  if not project:
1164
- return "❌ Please create a project first", ""
1165
-
1166
- # Process URLs from text input or file
1167
- urls = []
1168
- if urls_text:
1169
- urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
1170
- elif urls_file:
1171
- # Handle file upload (simplified)
1172
- try:
1173
- content = urls_file.read().decode('utf-8')
1174
- urls = [url.strip() for url in content.split('\n') if url.strip()]
1175
- except:
1176
- return "❌ Error reading uploaded file", ""
1177
 
 
1178
  if not urls:
1179
  return "❌ No URLs provided", ""
1180
 
1181
- # Progress callback
1182
  def progress_callback(pct, msg):
1183
  progress(pct, desc=msg)
1184
 
1185
- # Scrape URLs
1186
- success_count, errors = studio.scrape_urls(urls, progress_callback)
1187
 
1188
- if success_count > 0:
1189
- stats_html = f"""
1190
- <div class="stat-card">
1191
  <h3>✅ Scraping Complete</h3>
1192
- <p><strong>{success_count}</strong> items collected</p>
1193
- <p><strong>{len(urls) - success_count}</strong> failed</p>
1194
  </div>
1195
  """
1196
 
1197
  status = f"""
1198
  ✅ **Scraping Complete!**
1199
 
1200
- **Successfully scraped:** {success_count} URLs
1201
- **Failed:** {len(urls) - success_count} URLs
1202
 
1203
- 👉 **Next Step:** Go to Data Processing tab to clean and enhance your data
1204
  """
1205
 
1206
- return status, stats_html
1207
  else:
1208
  return f"❌ Scraping failed: {', '.join(errors)}", ""
1209
 
1210
- def process_data_handler(clean_text, quality_filter, detect_language,
1211
- add_sentiment, extract_entities, deduplicate, project):
1212
- """Handle data processing"""
1213
  if not project:
1214
- return "❌ Please create a project first", ""
1215
 
1216
  if not studio.scraped_items:
1217
- return "❌ No scraped data to process. Please scrape URLs first.", ""
1218
 
1219
- # Configure processing options
1220
  options = {
1221
- 'clean_text': clean_text,
1222
- 'quality_filter': quality_filter,
1223
- 'detect_language': detect_language,
1224
- 'add_sentiment': add_sentiment,
1225
- 'extract_entities': extract_entities,
1226
- 'deduplicate': deduplicate
1227
  }
1228
 
1229
- # Process data
1230
- processed_count = studio.process_data(options)
1231
 
1232
- if processed_count > 0:
1233
  stats = studio.get_data_statistics()
1234
  stats_html = f"""
1235
- <div class="stat-card">
1236
  <h3>⚙️ Processing Complete</h3>
1237
- <p><strong>{processed_count}</strong> items processed</p>
1238
- <p>Avg Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
1239
- <p>Avg Words: <strong>{stats.get('avg_word_count', 0)}</strong></p>
1240
  </div>
1241
  """
1242
 
1243
  status = f"""
1244
  ✅ **Processing Complete!**
1245
 
1246
- **Processed items:** {processed_count}
1247
- **Average quality score:** {stats.get('avg_quality_score', 0)}
1248
- **Average word count:** {stats.get('avg_word_count', 0)}
1249
 
1250
- 👉 **Next Step:** Check the Data Preview tab to review your dataset
1251
  """
1252
 
1253
  return status, stats_html
1254
  else:
1255
- return "❌ No items passed processing filters", ""
1256
 
1257
  def refresh_preview_handler(project):
1258
- """Refresh data preview"""
1259
  if not project:
1260
  return None, {}
1261
 
1262
- preview_data = studio.get_data_preview()
1263
  stats = studio.get_data_statistics()
1264
 
1265
- if preview_data:
1266
- # Convert to DataFrame format
1267
  df_data = []
1268
- for item in preview_data:
1269
  df_data.append([
1270
  item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
1271
  item['content_preview'],
@@ -1278,26 +935,23 @@ def create_modern_interface():
1278
 
1279
  return None, {}
1280
 
1281
- def export_dataset_handler(export_format, export_template, project):
1282
- """Handle dataset export"""
1283
  if not project:
1284
- return "❌ Please create a project first", None
1285
 
1286
  if not studio.processed_items and not studio.scraped_items:
1287
- return "❌ No data to export. Please scrape and process data first.", None
1288
 
1289
  try:
1290
- # Export dataset
1291
- filename = studio.export_dataset(export_template, export_format)
1292
 
1293
  status = f"""
1294
  ✅ **Export Successful!**
1295
 
1296
- **Format:** {export_format}
1297
- **Template:** {export_template.replace('_', ' ').title()}
1298
  **File:** {filename}
1299
 
1300
- 📥 **Download your dataset using the link below**
1301
  """
1302
 
1303
  return status, filename
@@ -1305,7 +959,7 @@ def create_modern_interface():
1305
  except Exception as e:
1306
  return f"❌ Export failed: {str(e)}", None
1307
 
1308
- # Connect event handlers
1309
  create_project_btn.click(
1310
  fn=create_project,
1311
  inputs=[project_name, template_choice],
@@ -1314,43 +968,36 @@ def create_modern_interface():
1314
 
1315
  scrape_btn.click(
1316
  fn=scrape_urls_handler,
1317
- inputs=[urls_input, urls_file, project_state],
1318
  outputs=[scraping_status, collection_stats]
1319
  )
1320
 
1321
  process_btn.click(
1322
  fn=process_data_handler,
1323
  inputs=[clean_text, quality_filter, detect_language,
1324
- add_sentiment, extract_entities, deduplicate, project_state],
1325
  outputs=[processing_status, processing_stats]
1326
  )
1327
 
1328
- refresh_preview_btn.click(
1329
  fn=refresh_preview_handler,
1330
  inputs=[project_state],
1331
  outputs=[data_preview, dataset_stats]
1332
  )
1333
 
1334
  export_btn.click(
1335
- fn=export_dataset_handler,
1336
  inputs=[export_format, export_template, project_state],
1337
  outputs=[export_status, export_file]
1338
  )
1339
-
1340
- # Auto-refresh preview when processing completes
1341
- processing_status.change(
1342
- fn=refresh_preview_handler,
1343
- inputs=[project_state],
1344
- outputs=[data_preview, dataset_stats]
1345
- )
1346
 
1347
  return interface
1348
 
1349
- # Launch the application
1350
  if __name__ == "__main__":
1351
  logger.info("🚀 Starting AI Dataset Studio...")
1352
 
1353
- # Check available features
1354
  features = []
1355
  if HAS_TRANSFORMERS:
1356
  features.append("✅ AI Models")
@@ -1365,11 +1012,15 @@ if __name__ == "__main__":
1365
  if HAS_DATASETS:
1366
  features.append("✅ HuggingFace Integration")
1367
  else:
1368
- features.append("⚠️ Standard Export Only")
1369
 
1370
  logger.info(f"📊 Features: {' | '.join(features)}")
1371
 
1372
  try:
1373
  interface = create_modern_interface()
1374
  logger.info("✅ Interface created successfully")
1375
 
@@ -1377,10 +1028,10 @@ if __name__ == "__main__":
  server_name="0.0.0.0",
  server_port=7860,
  share=False,
- show_error=True,
- debug=False
  )

  except Exception as e:
- logger.error(f"❌ Failed to launch application: {e}")
  raise

  """
+ AI Dataset Studio - Complete Application
+ Fixed version with all classes properly defined
  """

  import gradio as gr

  import time
  from collections import defaultdict
  import io

  # Optional imports with fallbacks
  try:
  from transformers import pipeline, AutoTokenizer, AutoModel
  HAS_TRANSFORMERS = True
  except ImportError:
  HAS_TRANSFORMERS = False

  try:
  import nltk
  from nltk.tokenize import sent_tokenize, word_tokenize
  HAS_NLTK = True
  except ImportError:
  HAS_NLTK = False

  """Template for dataset creation"""
  name: str
  description: str
+ task_type: str
  required_fields: List[str]
  optional_fields: List[str]
  example_format: Dict[str, Any]
  instructions: str

+ class SecurityValidator:
+ """Security validation for URLs and content"""
+
+ ALLOWED_SCHEMES = {'http', 'https'}
+ BLOCKED_DOMAINS = {
+ 'localhost', '127.0.0.1', '0.0.0.0',
+ '192.168.', '10.', '172.16.', '172.17.',
+ '172.18.', '172.19.', '172.20.', '172.21.',
+ '172.22.', '172.23.', '172.24.', '172.25.',
+ '172.26.', '172.27.', '172.28.', '172.29.',
+ '172.30.', '172.31.'
+ }
+
+ @classmethod
+ def validate_url(cls, url: str) -> Tuple[bool, str]:
+ """Validate URL for security concerns"""
+ try:
+ parsed = urlparse(url)
+
+ if parsed.scheme not in cls.ALLOWED_SCHEMES:
+ return False, f"Invalid scheme: {parsed.scheme}"
+
+ hostname = parsed.hostname or ''
+ if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
+ return False, "Access to internal networks not allowed"
+
+ if not parsed.netloc:
+ return False, "Invalid URL format"
+
+ return True, "URL is valid"
+
+ except Exception as e:
+ return False, f"URL validation error: {str(e)}"
+
  class WebScraperEngine:
+ """Advanced web scraping engine"""

  def __init__(self):
  self.session = requests.Session()
  self.session.headers.update({
+ 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.5',
  'Connection': 'keep-alive',
  })
135
 
136
  def scrape_url(self, url: str) -> Optional[ScrapedItem]:
137
+ """Scrape a single URL"""
138
  try:
139
  # Validate URL
140
+ is_valid, validation_msg = SecurityValidator.validate_url(url)
141
+ if not is_valid:
142
+ raise ValueError(f"Security validation failed: {validation_msg}")
143
 
144
  # Fetch content
145
  response = self.session.get(url, timeout=15)
 
148
  # Parse HTML
149
  soup = BeautifulSoup(response.content, 'html.parser')
150
 
151
+ # Extract data
152
  title = self._extract_title(soup)
153
  content = self._extract_content(soup)
154
  metadata = self._extract_metadata(soup, response)
155
 
156
+ # Create item
157
  item = ScrapedItem(
158
  id=str(uuid.uuid4()),
159
  url=url,
 
172
  return None
173
 
174
  def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
175
+ """Scrape multiple URLs"""
176
  results = []
177
  total = len(urls)
178
 
 
184
  if item:
185
  results.append(item)
186
 
187
+ time.sleep(1) # Rate limiting
 
188
 
189
  return results
190
 
191
  def _extract_title(self, soup: BeautifulSoup) -> str:
192
  """Extract page title"""
193
+ title_tag = soup.find('title')
194
+ if title_tag:
195
+ return title_tag.get_text().strip()
 
196
 
197
+ h1_tag = soup.find('h1')
198
+ if h1_tag:
199
+ return h1_tag.get_text().strip()
 
200
 
201
  return "Untitled"
202
 
203
  def _extract_content(self, soup: BeautifulSoup) -> str:
204
+ """Extract main content"""
205
  # Remove unwanted elements
206
  for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
207
  element.decompose()
208
 
209
+ # Try content selectors
210
  content_selectors = [
211
+ 'article', 'main', '.content', '.post-content',
212
+ '.entry-content', '.article-body'
 
213
  ]
214
 
215
  for selector in content_selectors:
 
227
  return self._clean_text(soup.get_text(separator=' ', strip=True))
228
 
229
  def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
230
+ """Extract metadata"""
231
  metadata = {
232
  'domain': urlparse(response.url).netloc,
233
  'status_code': response.status_code,
 
234
  'extracted_at': datetime.now().isoformat()
235
  }
236
 
237
  # Extract meta tags
238
+ for tag in ['description', 'keywords', 'author']:
239
+ element = soup.find('meta', attrs={'name': tag})
 
240
  if element:
241
  metadata[tag] = element.get('content', '')
242
 
 
244
 
245
  def _clean_text(self, text: str) -> str:
246
  """Clean extracted text"""
 
247
  text = re.sub(r'\s+', ' ', text)
248
+ text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
249
+ text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
 
250
  return text.strip()
251
 
252
  def _assess_quality(self, content: str) -> float:
253
+ """Assess content quality"""
254
  if not content:
255
  return 0.0
256
 
257
  score = 0.0
 
 
258
  word_count = len(content.split())
259
+
260
  if word_count >= 50:
261
+ score += 0.4
262
  elif word_count >= 20:
263
+ score += 0.2
264
 
 
265
  sentence_count = len(re.split(r'[.!?]+', content))
266
  if sentence_count >= 3:
267
+ score += 0.3
 
268
 
269
+ if re.search(r'[A-Z][a-z]+', content):
270
+ score += 0.3
 
 
271
 
272
  return min(score, 1.0)
273
 
274
  class DataProcessor:
275
+ """Data processing pipeline"""
276
 
277
  def __init__(self):
 
278
  self.sentiment_analyzer = None
279
  self.ner_model = None
280
  self._load_models()
281
 
282
  def _load_models(self):
283
+ """Load NLP models"""
284
  if not HAS_TRANSFORMERS:
285
+ logger.warning("⚠️ Transformers not available")
286
  return
287
 
288
  try:
 
289
  self.sentiment_analyzer = pipeline(
290
  "sentiment-analysis",
291
  model="cardiffnlp/twitter-roberta-base-sentiment-latest"
292
  )
293
+ logger.info("✅ Sentiment model loaded")
294
  except Exception as e:
295
+ logger.warning(f"⚠️ Could not load sentiment model: {e}")
296
 
297
+ def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]:
298
+ """Process scraped items"""
299
+ processed = []
300
 
301
  for item in items:
302
+ try:
303
+ # Clean text
304
+ if options.get('clean_text', True):
305
+ item.content = self._clean_text_advanced(item.content)
306
+
307
+ # Quality filter
308
+ if options.get('quality_filter', True) and item.quality_score < 0.3:
309
+ continue
310
+
311
+ # Add sentiment
312
+ if options.get('add_sentiment', False) and self.sentiment_analyzer:
313
+ sentiment = self._analyze_sentiment(item.content)
314
+ item.metadata['sentiment'] = sentiment
315
+
316
+ # Language detection
317
+ if options.get('detect_language', True):
318
+ item.language = self._detect_language(item.content)
319
+
320
+ processed.append(item)
321
+
322
+ except Exception as e:
323
+ logger.error(f"Error processing item {item.id}: {e}")
324
+ continue
325
 
326
+ return processed
 
327
 
328
  def _clean_text_advanced(self, text: str) -> str:
329
  """Advanced text cleaning"""
 
330
  text = re.sub(r'http\S+|www\.\S+', '', text)
 
 
331
  text = re.sub(r'\S+@\S+', '', text)
 
 
 
 
 
 
332
  text = re.sub(r'\s+', ' ', text)
333
+ return text.strip()
 
334
 
335
  def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
336
+ """Analyze sentiment"""
337
  try:
 
338
  text_sample = text[:512]
339
  result = self.sentiment_analyzer(text_sample)[0]
340
  return {
 
344
  except:
345
  return {'label': 'UNKNOWN', 'score': 0.0}
346
 
 
347
  def _detect_language(self, text: str) -> str:
348
  """Simple language detection"""
 
349
  if re.search(r'[а-яё]', text.lower()):
350
  return 'ru'
351
  elif re.search(r'[ñáéíóúü]', text.lower()):
352
  return 'es'
353
+ return 'en'
 
 
 
354
 
355
  class AnnotationEngine:
356
+ """Annotation tools for dataset creation"""
357
 
358
  def __init__(self):
359
  self.templates = self._load_templates()
360
 
361
  def _load_templates(self) -> Dict[str, DatasetTemplate]:
362
+ """Load dataset templates"""
363
  templates = {
364
  'text_classification': DatasetTemplate(
365
  name="Text Classification",
366
+ description="Classify text into categories",
367
  task_type="classification",
368
  required_fields=["text", "label"],
369
  optional_fields=["confidence", "metadata"],
370
  example_format={"text": "Sample text", "label": "positive"},
371
+ instructions="Label each text with appropriate category"
372
  ),
373
  'sentiment_analysis': DatasetTemplate(
374
  name="Sentiment Analysis",
375
+ description="Analyze emotional tone",
376
  task_type="classification",
377
  required_fields=["text", "sentiment"],
378
  optional_fields=["confidence", "aspects"],
379
  example_format={"text": "I love this!", "sentiment": "positive"},
380
+ instructions="Classify sentiment as positive, negative, or neutral"
381
  ),
382
  'named_entity_recognition': DatasetTemplate(
383
  name="Named Entity Recognition",
384
+ description="Identify named entities",
385
  task_type="ner",
386
  required_fields=["text", "entities"],
387
  optional_fields=["metadata"],
388
  example_format={
389
+ "text": "John works at OpenAI",
390
+ "entities": [{"text": "John", "label": "PERSON"}]
 
 
 
391
  },
392
+ instructions="Mark all named entities"
393
  ),
394
  'question_answering': DatasetTemplate(
395
  name="Question Answering",
396
+ description="Create Q&A pairs",
397
  task_type="qa",
398
  required_fields=["context", "question", "answer"],
399
  optional_fields=["answer_start", "metadata"],
 
402
  "question": "What is the capital of France?",
403
  "answer": "Paris"
404
  },
405
+ instructions="Create meaningful questions and answers"
406
  ),
407
  'summarization': DatasetTemplate(
408
  name="Text Summarization",
409
+ description="Create summaries",
410
  task_type="summarization",
411
  required_fields=["text", "summary"],
412
  optional_fields=["summary_type", "length"],
413
  example_format={
414
  "text": "Long article text...",
415
+ "summary": "Brief summary"
416
  },
417
+ instructions="Write clear, concise summaries"
418
  )
419
  }
420
  return templates
 
 
421
 
422
  class DatasetExporter:
423
+ """Export datasets in various formats"""
424
 
425
  def __init__(self):
426
  self.supported_formats = [
427
+ 'json', 'csv', 'jsonl', 'huggingface_datasets'
 
428
  ]
429
 
430
  def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
431
  export_format: str, annotations: Dict[str, Any] = None) -> str:
432
+ """Export dataset"""
433
  try:
434
+ dataset_data = self._prepare_data(items, template, annotations)
 
435
 
436
+ if export_format == 'json':
 
 
 
437
  return self._export_json(dataset_data)
438
  elif export_format == 'csv':
439
  return self._export_csv(dataset_data)
440
  elif export_format == 'jsonl':
441
  return self._export_jsonl(dataset_data)
442
+ elif export_format == 'huggingface_datasets':
443
+ return self._export_huggingface(dataset_data, template)
444
  else:
445
  raise ValueError(f"Unsupported format: {export_format}")
446
 
 
448
  logger.error(f"Export failed: {e}")
449
  raise
450
 
451
+ def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate,
452
+ annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
453
+ """Prepare data according to template"""
454
  dataset_data = []
455
 
456
  for item in items:
 
457
  data_point = {
458
  'text': item.content,
459
  'title': item.title,
 
461
  'metadata': item.metadata
462
  }
463
 
 
464
  if annotations and item.id in annotations:
465
+ data_point.update(annotations[item.id])
 
466
 
467
+ formatted = self._format_for_template(data_point, template)
468
+ if formatted:
469
+ dataset_data.append(formatted)
 
470
 
471
  return dataset_data
472
 
473
  def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]:
474
+ """Format data according to template"""
475
  formatted = {}
476
 
 
477
  for field in template.required_fields:
478
  if field in data_point:
479
  formatted[field] = data_point[field]
480
  elif field == 'text' and 'content' in data_point:
481
  formatted[field] = data_point['content']
482
  else:
 
483
  return None
484
 
 
485
  for field in template.optional_fields:
486
  if field in data_point:
487
  formatted[field] = data_point[field]
488
 
489
  return formatted
490
 
491
+ def _export_json(self, data: List[Dict[str, Any]]) -> str:
492
+ """Export as JSON"""
 
493
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
494
  filename = f"dataset_{timestamp}.json"
495
 
496
  with open(filename, 'w', encoding='utf-8') as f:
497
+ json.dump(data, f, indent=2, ensure_ascii=False)
498
 
499
  return filename
500
 
501
+ def _export_csv(self, data: List[Dict[str, Any]]) -> str:
502
+ """Export as CSV"""
503
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
504
  filename = f"dataset_{timestamp}.csv"
505
 
506
+ df = pd.DataFrame(data)
507
  df.to_csv(filename, index=False)
508
 
509
  return filename
510
 
511
+ def _export_jsonl(self, data: List[Dict[str, Any]]) -> str:
512
+ """Export as JSONL"""
513
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
514
  filename = f"dataset_{timestamp}.jsonl"
515
 
516
  with open(filename, 'w', encoding='utf-8') as f:
517
+ for item in data:
518
  f.write(json.dumps(item, ensure_ascii=False) + '\n')
519
 
520
  return filename
 
521
 
522
+ def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
523
+ """Export as HuggingFace Dataset"""
524
+ if not HAS_DATASETS:
525
+ raise ImportError("datasets library not available")
526
+
527
+ dataset = Dataset.from_list(data)
528
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
529
+ dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
530
+
531
+ dataset.save_to_disk(dataset_name)
532
+ return dataset_name
533
+
534
+ class DatasetStudio:
535
+ """Main application orchestrator"""
 
536
 
537
+ def __init__(self):
538
+ self.scraper = WebScraperEngine()
539
+ self.processor = DataProcessor()
540
+ self.annotator = AnnotationEngine()
541
+ self.exporter = DatasetExporter()
542
+
543
+ # Application state
544
+ self.scraped_items = []
545
+ self.processed_items = []
546
+ self.current_project = None
547
+ self.annotation_state = {}
548
+
549
+ logger.info("✅ DatasetStudio initialized successfully")
550
 
551
+ def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]:
552
+ """Start new project"""
553
+ self.current_project = {
554
+ 'name': project_name,
555
+ 'template': template_type,
556
+ 'created_at': datetime.now().isoformat(),
557
+ 'id': str(uuid.uuid4())
558
+ }
559
+
560
+ self.scraped_items = []
561
+ self.processed_items = []
562
+ self.annotation_state = {}
563
+
564
+ logger.info(f"📋 New project: {project_name}")
565
+ return self.current_project
566
 
567
+ def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
568
+ """Scrape URLs"""
569
+ url_list = [url.strip() for url in urls if url.strip()]
570
+
571
+ if not url_list:
572
+ return 0, ["No valid URLs provided"]
573
+
574
+ logger.info(f"🕷️ Scraping {len(url_list)} URLs")
575
+ self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback)
576
+
577
+ success = len(self.scraped_items)
578
+ failed = len(url_list) - success
579
+
580
+ errors = []
581
+ if failed > 0:
582
+ errors.append(f"{failed} URLs failed")
583
+
584
+ logger.info(f"✅ Scraped {success}, failed {failed}")
585
+ return success, errors

+     def process_data(self, options: Dict[str, bool]) -> int:
+         """Process scraped data"""
+         if not self.scraped_items:
+             return 0
+
+         logger.info(f"⚙️ Processing {len(self.scraped_items)} items")
+         self.processed_items = self.processor.process_items(self.scraped_items, options)
+
+         logger.info(f"✅ Processed {len(self.processed_items)} items")
+         return len(self.processed_items)
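    # Illustrative sketch: the option keys consumed here match what the UI handler
    # (process_data_handler below) builds from the checkboxes.
    #
    #     options = {
    #         'clean_text': True,
    #         'quality_filter': True,
    #         'detect_language': True,
    #         'add_sentiment': False,
    #         'extract_entities': False
    #     }
    #     kept = studio.process_data(options)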

+     def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]:
+         """Get data preview"""
+         items = self.processed_items or self.scraped_items
+
+         preview = []
+         for item in items[:num_items]:
+             preview.append({
+                 'title': item.title,
+                 'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content,
+                 'word_count': item.word_count,
+                 'quality_score': round(item.quality_score, 2),
+                 'url': item.url
+             })
+
+         return preview

+     def get_data_statistics(self) -> Dict[str, Any]:
+         """Get dataset statistics"""
+         items = self.processed_items or self.scraped_items
+
+         if not items:
+             return {}
+
+         word_counts = [item.word_count for item in items]
+         quality_scores = [item.quality_score for item in items]
+
+         return {
+             'total_items': len(items),
+             'avg_word_count': round(np.mean(word_counts)),
+             'avg_quality_score': round(np.mean(quality_scores), 2),
+             'word_count_range': [min(word_counts), max(word_counts)],
+             'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)],
+             'languages': list(set(item.language for item in items)),
+             'domains': list(set(urlparse(item.url).netloc for item in items))
+         }
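    # Illustrative sketch: shape of the statistics dict (all numbers are made-up examples).
    #
    #     {
    #         'total_items': 42,
    #         'avg_word_count': 713,
    #         'avg_quality_score': 0.81,
    #         'word_count_range': [120, 2450],
    #         'quality_range': [0.42, 0.97],
    #         'languages': ['en'],
    #         'domains': ['example.com']
    #     }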

+     def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str:
+         """Export dataset"""
+         if not self.processed_items and not self.scraped_items:
+             raise ValueError("No data to export")
+
+         items = self.processed_items or self.scraped_items
+         template = self.annotator.templates.get(template_name)
+
+         if not template:
+             raise ValueError(f"Unknown template: {template_name}")
+
+         logger.info(f"📤 Exporting {len(items)} items")
+         return self.exporter.export_dataset(items, template, export_format, annotations)
+
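# Illustrative sketch: DatasetStudio can also be driven without the Gradio UI
# (the URL and template name below are assumed examples).
#
#     studio = DatasetStudio()
#     studio.start_new_project("Demo", "text_classification")
#     studio.scrape_urls(["https://example.com/article1"])
#     studio.process_data({'clean_text': True, 'quality_filter': True,
#                          'detect_language': True, 'add_sentiment': False,
#                          'extract_entities': False})
#     path = studio.export_dataset("text_classification", "jsonl")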
+ def create_modern_interface():
+     """Create the modern Gradio interface"""

+     # Initialize studio
+     studio = DatasetStudio()

+     # Custom CSS
+     css = """
+     .gradio-container { max-width: 1400px; margin: auto; }
+     .studio-header {
+         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+         color: white; padding: 2rem; border-radius: 15px;
+         margin-bottom: 2rem; text-align: center;
      }
+     .workflow-card {
+         background: #f8f9ff; border: 2px solid #e1e5ff;
+         border-radius: 12px; padding: 1.5rem; margin: 1rem 0;
      }
+     .step-header {
+         font-size: 1.2em; font-weight: 600; color: #4c51bf;
+         margin-bottom: 1rem;
      }
      """

+     with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
          project_state = gr.State({})

          # Header
          gr.HTML("""
          <div class="studio-header">
              <h1>🚀 AI Dataset Studio</h1>
+             <p>Create high-quality training datasets without coding</p>
          </div>
          """)

          with gr.Tabs() as main_tabs:

+             # Project Setup
+             with gr.Tab("🎯 Project Setup"):
+                 gr.HTML('<div class="step-header">Step 1: Create Your Project</div>')

                  with gr.Row():
                      with gr.Column(scale=2):
                          project_name = gr.Textbox(
                              label="Project Name",
+                             placeholder="My Dataset Project",
+                             value="News Analysis Dataset"
                          )

                          template_choice = gr.Radio(
                              choices=[
                                  ("📊 Text Classification", "text_classification"),
                                  # ...
                                  ("📝 Text Summarization", "summarization")
                              ],
                              label="Dataset Type",
+                             value="text_classification"
                          )

+                         create_project_btn = gr.Button("🚀 Create Project", variant="primary")
                          project_status = gr.Markdown("")

                      with gr.Column(scale=1):
                          gr.HTML("""
                          <div class="workflow-card">
                              <h3>💡 Template Guide</h3>
+                             <p><strong>Text Classification:</strong> Categorize content</p>
+                             <p><strong>Sentiment Analysis:</strong> Analyze emotions</p>
+                             <p><strong>Named Entity Recognition:</strong> Identify entities</p>
+                             <p><strong>Question Answering:</strong> Create Q&A pairs</p>
+                             <p><strong>Summarization:</strong> Generate summaries</p>
                          </div>
                          """)

+             # Data Collection
+             with gr.Tab("🕷️ Data Collection"):
+                 gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>')

                  with gr.Row():
                      with gr.Column(scale=2):
+                         urls_input = gr.Textbox(
+                             label="URLs to Scrape (one per line)",
+                             placeholder="https://example.com/article1\nhttps://example.com/article2",
+                             lines=8
+                         )

+                         scrape_btn = gr.Button("🚀 Start Scraping", variant="primary")
                          scraping_status = gr.Markdown("")

                      with gr.Column(scale=1):
                          collection_stats = gr.HTML("")

+             # Data Processing
+             with gr.Tab("⚙️ Data Processing"):
+                 gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>')

                  with gr.Row():
                      with gr.Column(scale=2):
                          with gr.Row():
                              with gr.Column():
+                                 clean_text = gr.Checkbox(label="🧹 Text Cleaning", value=True)
+                                 quality_filter = gr.Checkbox(label="🎯 Quality Filter", value=True)
                                  detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)

                              with gr.Column():
                                  add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
                                  extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)

+                         process_btn = gr.Button("⚙️ Process Data", variant="primary")
                          processing_status = gr.Markdown("")

                      with gr.Column(scale=1):
                          processing_stats = gr.HTML("")

+             # Data Preview
+             with gr.Tab("👀 Data Preview"):
+                 gr.HTML('<div class="step-header">Step 4: Review Dataset</div>')

                  with gr.Row():
                      with gr.Column(scale=2):
+                         refresh_btn = gr.Button("🔄 Refresh Preview", variant="secondary")

                          data_preview = gr.DataFrame(
+                             headers=["Title", "Content Preview", "Words", "Quality", "URL"],
+                             label="Dataset Preview"
                          )

                      with gr.Column(scale=1):
                          dataset_stats = gr.JSON(label="Statistics")

+             # Export
+             with gr.Tab("📤 Export Dataset"):
+                 gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>')

                  with gr.Row():
                      with gr.Column(scale=2):
                          export_format = gr.Radio(
                              choices=[
                                  ("📄 JSON", "json"),
                                  ("📊 CSV", "csv"),
                                  ("📋 JSONL", "jsonl"),
+                                 ("🤗 HuggingFace", "huggingface_datasets")
                              ],
                              label="Export Format",
                              value="json"
                          )

                          export_template = gr.Dropdown(
                              choices=[
                                  "text_classification",
                                  # ...
                                  "question_answering",
                                  "summarization"
                              ],
+                             label="Template",
                              value="text_classification"
                          )

+                         export_btn = gr.Button("📤 Export Dataset", variant="primary")
                          export_status = gr.Markdown("")
+                         export_file = gr.File(label="Download", visible=False)

                      with gr.Column(scale=1):
                          gr.HTML("""
                          <div class="workflow-card">
+                             <h3>📋 Export Info</h3>
+                             <p><strong>JSON:</strong> Universal format</p>
+                             <p><strong>CSV:</strong> Excel compatible</p>
+                             <p><strong>JSONL:</strong> Line-separated</p>
+                             <p><strong>HuggingFace:</strong> ML ready</p>
                          </div>
                          """)

          # Event handlers
          def create_project(name, template):
              if not name.strip():
                  return "❌ Please enter a project name", {}

              project = studio.start_new_project(name.strip(), template)
              status = f"""
+             ✅ **Project Created!**

+             **Name:** {project['name']}
              **Type:** {template.replace('_', ' ').title()}
+             **ID:** {project['id'][:8]}...

+             👉 Next: Go to Data Collection tab
              """
              return status, project

+         def scrape_urls_handler(urls_text, project, progress=gr.Progress()):
              if not project:
+                 return "❌ Create a project first", ""

+             urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
              if not urls:
                  return "❌ No URLs provided", ""

              def progress_callback(pct, msg):
                  progress(pct, desc=msg)

+             success, errors = studio.scrape_urls(urls, progress_callback)

+             if success > 0:
+                 stats = f"""
+                 <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
                      <h3>✅ Scraping Complete</h3>
+                     <p><strong>{success}</strong> items collected</p>
                  </div>
                  """

                  status = f"""
                  ✅ **Scraping Complete!**

+                 **Success:** {success} URLs
+                 **Failed:** {len(urls) - success} URLs

+                 👉 Next: Go to Data Processing tab
                  """

+                 return status, stats
              else:
                  return f"❌ Scraping failed: {', '.join(errors)}", ""

+         def process_data_handler(clean, quality, language, sentiment, entities, project):
              if not project:
+                 return "❌ Create a project first", ""

              if not studio.scraped_items:
+                 return "❌ No data to process. Scrape URLs first.", ""

              options = {
+                 'clean_text': clean,
+                 'quality_filter': quality,
+                 'detect_language': language,
+                 'add_sentiment': sentiment,
+                 'extract_entities': entities
              }

+             processed = studio.process_data(options)

+             if processed > 0:
                  stats = studio.get_data_statistics()
                  stats_html = f"""
+                 <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
                      <h3>⚙️ Processing Complete</h3>
+                     <p><strong>{processed}</strong> items processed</p>
+                     <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
                  </div>
                  """

                  status = f"""
                  ✅ **Processing Complete!**

+                 **Processed:** {processed} items
+                 **Avg Quality:** {stats.get('avg_quality_score', 0)}

+                 👉 Next: Check Data Preview tab
                  """

                  return status, stats_html
              else:
+                 return "❌ No items passed filters", ""

          def refresh_preview_handler(project):
              if not project:
                  return None, {}

+             preview = studio.get_data_preview()
              stats = studio.get_data_statistics()

+             if preview:
                  df_data = []
+                 for item in preview:
                      df_data.append([
                          item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
                          item['content_preview'],
                          item['word_count'],
                          item['quality_score'],
                          item['url']
                      ])

                  return df_data, stats

              return None, {}

+         def export_handler(format_type, template, project):
              if not project:
+                 return "❌ Create a project first", None

              if not studio.processed_items and not studio.scraped_items:
+                 return "❌ No data to export", None

              try:
+                 filename = studio.export_dataset(template, format_type)

                  status = f"""
                  ✅ **Export Successful!**

+                 **Format:** {format_type}
                  **File:** {filename}

+                 📥 Download link below
                  """

                  return status, filename

              except Exception as e:
                  return f"❌ Export failed: {str(e)}", None

+         # Connect events
          create_project_btn.click(
              fn=create_project,
              inputs=[project_name, template_choice],
              outputs=[project_status, project_state]
          )

          scrape_btn.click(
              fn=scrape_urls_handler,
+             inputs=[urls_input, project_state],
              outputs=[scraping_status, collection_stats]
          )

          process_btn.click(
              fn=process_data_handler,
              inputs=[clean_text, quality_filter, detect_language,
+                     add_sentiment, extract_entities, project_state],
              outputs=[processing_status, processing_stats]
          )

+         refresh_btn.click(
              fn=refresh_preview_handler,
              inputs=[project_state],
              outputs=[data_preview, dataset_stats]
          )

          export_btn.click(
+             fn=export_handler,
              inputs=[export_format, export_template, project_state],
              outputs=[export_status, export_file]
          )
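        # Illustrative note: project_state (a gr.State) carries the dict returned by
        # create_project between handlers; any handler that needs the active project
        # takes it as an input, e.g. (assumed pattern, mirroring the calls above):
        #
        #     some_btn.click(fn=handler, inputs=[..., project_state], outputs=[...])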

      return interface

+ # Launch application
  if __name__ == "__main__":
      logger.info("🚀 Starting AI Dataset Studio...")

+     # Check features
      features = []
      if HAS_TRANSFORMERS:
          features.append("✅ AI Models")
      # ...
      if HAS_DATASETS:
          features.append("✅ HuggingFace Integration")
      else:
+         features.append("⚠️ Standard Export")

      logger.info(f"📊 Features: {' | '.join(features)}")

      try:
+         # Test DatasetStudio
+         test_studio = DatasetStudio()
+         logger.info("✅ DatasetStudio test passed")
+
          interface = create_modern_interface()
          logger.info("✅ Interface created successfully")

          interface.launch(
              server_name="0.0.0.0",
              server_port=7860,
              share=False,
+             show_error=True
          )

      except Exception as e:
+         logger.error(f"❌ Failed to launch: {e}")
+         logger.error("💡 Try: python app_minimal.py")
          raise
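# Illustrative note: running `python app.py` starts the interface on 0.0.0.0:7860,
# the default Gradio port that Hugging Face Spaces expects.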