MagicMeWizard committed on
Commit 6918f0f · verified · 1 Parent(s): 6b9c591

Update app.py

Files changed (1)
  1. app.py +929 -871
app.py CHANGED
@@ -1,1037 +1,1095 @@
1
  """
2
- AI Dataset Studio - Complete Application
3
- Fixed version with all classes properly defined
4
  """
5
 
6
  import gradio as gr
7
  import pandas as pd
8
- import numpy as np
9
  import json
 
 
 
 
10
  import re
11
- import requests
12
- from bs4 import BeautifulSoup
13
  from urllib.parse import urlparse, urljoin
14
- from datetime import datetime, timedelta
15
- import logging
16
- from typing import Dict, List, Tuple, Optional, Any
17
  from dataclasses import dataclass, asdict
18
- from pathlib import Path
19
- import uuid
20
- import hashlib
21
- import time
22
- from collections import defaultdict
23
- import io
24
 
25
- # Optional imports with fallbacks
 
 
 
 
 
 
 
26
  try:
27
- from transformers import pipeline, AutoTokenizer, AutoModel
28
- HAS_TRANSFORMERS = True
29
- except ImportError:
30
- HAS_TRANSFORMERS = False
 
31
 
32
  try:
33
  import nltk
34
- from nltk.tokenize import sent_tokenize, word_tokenize
 
 
35
  HAS_NLTK = True
36
  except ImportError:
 
37
  HAS_NLTK = False
38
 
39
  try:
40
- from datasets import Dataset, DatasetDict
41
- HAS_DATASETS = True
 
 
42
  except ImportError:
43
- HAS_DATASETS = False
44
-
45
- # Configure logging
46
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
47
- logger = logging.getLogger(__name__)
48
-
49
- # Download NLTK data if available
50
- if HAS_NLTK:
51
- try:
52
- nltk.download('punkt', quiet=True)
53
- nltk.download('stopwords', quiet=True)
54
- nltk.download('averaged_perceptron_tagger', quiet=True)
55
- except:
56
- pass
57
-
58
- @dataclass
59
- class ScrapedItem:
60
- """Data class for scraped content"""
61
- id: str
62
- url: str
63
- title: str
64
- content: str
65
- metadata: Dict[str, Any]
66
- scraped_at: str
67
- word_count: int
68
- language: str = "en"
69
- quality_score: float = 0.0
70
- labels: List[str] = None
71
- annotations: Dict[str, Any] = None
72
 
73
- def __post_init__(self):
74
- if self.labels is None:
75
- self.labels = []
76
- if self.annotations is None:
77
- self.annotations = {}
 
 
 
78
 
79
- @dataclass
80
- class DatasetTemplate:
81
- """Template for dataset creation"""
82
- name: str
83
- description: str
84
- task_type: str
85
- required_fields: List[str]
86
- optional_fields: List[str]
87
- example_format: Dict[str, Any]
88
- instructions: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- class SecurityValidator:
91
- """Security validation for URLs and content"""
 
 
 
92
 
93
- ALLOWED_SCHEMES = {'http', 'https'}
94
- BLOCKED_DOMAINS = {
95
- 'localhost', '127.0.0.1', '0.0.0.0',
96
- '192.168.', '10.', '172.16.', '172.17.',
97
- '172.18.', '172.19.', '172.20.', '172.21.',
98
- '172.22.', '172.23.', '172.24.', '172.25.',
99
- '172.26.', '172.27.', '172.28.', '172.29.',
100
- '172.30.', '172.31.'
101
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- @classmethod
104
- def validate_url(cls, url: str) -> Tuple[bool, str]:
105
- """Validate URL for security concerns"""
 
 
 
106
  try:
107
- parsed = urlparse(url)
108
-
109
- if parsed.scheme not in cls.ALLOWED_SCHEMES:
110
- return False, f"Invalid scheme: {parsed.scheme}"
111
-
112
- hostname = parsed.hostname or ''
113
- if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
114
- return False, "Access to internal networks not allowed"
115
 
116
- if not parsed.netloc:
117
- return False, "Invalid URL format"
118
-
119
- return True, "URL is valid"
 
 
 
 
 
 
 
 
 
 
120
 
121
  except Exception as e:
122
- return False, f"URL validation error: {str(e)}"
123
-
124
- class WebScraperEngine:
125
- """Advanced web scraping engine"""
126
-
127
- def __init__(self):
128
- self.session = requests.Session()
129
- self.session.headers.update({
130
- 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)',
131
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
132
- 'Accept-Language': 'en-US,en;q=0.5',
133
- 'Connection': 'keep-alive',
134
- })
135
-
136
- def scrape_url(self, url: str) -> Optional[ScrapedItem]:
137
- """Scrape a single URL"""
138
  try:
139
- # Validate URL
140
- is_valid, validation_msg = SecurityValidator.validate_url(url)
141
- if not is_valid:
142
- raise ValueError(f"Security validation failed: {validation_msg}")
 
 
 
 
143
 
144
- # Fetch content
145
- response = self.session.get(url, timeout=15)
146
- response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- # Parse HTML
149
- soup = BeautifulSoup(response.content, 'html.parser')
 
 
 
 
 
 
150
 
151
- # Extract data
152
- title = self._extract_title(soup)
153
- content = self._extract_content(soup)
154
- metadata = self._extract_metadata(soup, response)
155
 
156
- # Create item
157
- item = ScrapedItem(
158
- id=str(uuid.uuid4()),
159
- url=url,
160
- title=title,
161
- content=content,
162
- metadata=metadata,
163
- scraped_at=datetime.now().isoformat(),
164
- word_count=len(content.split()),
165
- quality_score=self._assess_quality(content)
166
  )
167
 
168
- return item
 
169
 
170
- except Exception as e:
171
- logger.error(f"Failed to scrape {url}: {e}")
172
- return None
173
-
174
- def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
175
- """Scrape multiple URLs"""
176
- results = []
177
- total = len(urls)
178
-
179
- for i, url in enumerate(urls):
180
- if progress_callback:
181
- progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")
182
 
183
- item = self.scrape_url(url)
184
- if item:
185
- results.append(item)
186
 
187
- time.sleep(1) # Rate limiting
188
-
189
- return results
 
 
190
 
191
- def _extract_title(self, soup: BeautifulSoup) -> str:
192
- """Extract page title"""
193
- title_tag = soup.find('title')
194
- if title_tag:
195
- return title_tag.get_text().strip()
196
-
197
- h1_tag = soup.find('h1')
198
- if h1_tag:
199
- return h1_tag.get_text().strip()
200
-
201
- return "Untitled"
202
 
203
- def _extract_content(self, soup: BeautifulSoup) -> str:
204
- """Extract main content"""
205
- # Remove unwanted elements
206
- for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
207
- element.decompose()
208
-
209
- # Try content selectors
210
- content_selectors = [
211
- 'article', 'main', '.content', '.post-content',
212
- '.entry-content', '.article-body'
213
- ]
 
 
 
 
214
 
215
- for selector in content_selectors:
216
- element = soup.select_one(selector)
217
- if element:
218
- text = element.get_text(separator=' ', strip=True)
219
- if len(text) > 200:
220
- return self._clean_text(text)
221
 
222
- # Fallback to body
223
- body = soup.find('body')
224
- if body:
225
- return self._clean_text(body.get_text(separator=' ', strip=True))
 
226
 
227
- return self._clean_text(soup.get_text(separator=' ', strip=True))
228
 
229
- def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
230
- """Extract metadata"""
231
- metadata = {
232
- 'domain': urlparse(response.url).netloc,
233
- 'status_code': response.status_code,
234
- 'extracted_at': datetime.now().isoformat()
235
- }
236
 
237
- # Extract meta tags
238
- for tag in ['description', 'keywords', 'author']:
239
- element = soup.find('meta', attrs={'name': tag})
240
- if element:
241
- metadata[tag] = element.get('content', '')
 
242
 
243
- return metadata
244
-
245
- def _clean_text(self, text: str) -> str:
246
- """Clean extracted text"""
247
- text = re.sub(r'\s+', ' ', text)
248
- text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
249
- text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
250
- return text.strip()
251
-
252
- def _assess_quality(self, content: str) -> float:
253
- """Assess content quality"""
254
- if not content:
255
- return 0.0
256
-
257
- score = 0.0
258
- word_count = len(content.split())
259
 
260
- if word_count >= 50:
261
- score += 0.4
262
- elif word_count >= 20:
263
- score += 0.2
264
 
265
- sentence_count = len(re.split(r'[.!?]+', content))
266
- if sentence_count >= 3:
267
- score += 0.3
268
 
269
- if re.search(r'[A-Z][a-z]+', content):
270
- score += 0.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- return min(score, 1.0)
273
-
274
- class DataProcessor:
275
- """Data processing pipeline"""
276
-
277
- def __init__(self):
278
- self.sentiment_analyzer = None
279
- self.ner_model = None
280
- self._load_models()
281
 
282
- def _load_models(self):
283
- """Load NLP models"""
284
- if not HAS_TRANSFORMERS:
285
- logger.warning("⚠️ Transformers not available")
286
- return
287
 
288
- try:
289
- self.sentiment_analyzer = pipeline(
290
- "sentiment-analysis",
291
- model="cardiffnlp/twitter-roberta-base-sentiment-latest"
292
- )
293
- logger.info("✅ Sentiment model loaded")
294
- except Exception as e:
295
- logger.warning(f"⚠️ Could not load sentiment model: {e}")
296
-
297
- def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]:
298
- """Process scraped items"""
299
- processed = []
300
 
301
- for item in items:
 
 
 
 
302
  try:
303
- # Clean text
304
- if options.get('clean_text', True):
305
- item.content = self._clean_text_advanced(item.content)
306
 
307
- # Quality filter
308
- if options.get('quality_filter', True) and item.quality_score < 0.3:
309
- continue
310
 
311
- # Add sentiment
312
- if options.get('add_sentiment', False) and self.sentiment_analyzer:
313
- sentiment = self._analyze_sentiment(item.content)
314
- item.metadata['sentiment'] = sentiment
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- # Language detection
317
- if options.get('detect_language', True):
318
- item.language = self._detect_language(item.content)
319
-
320
- processed.append(item)
321
 
322
  except Exception as e:
323
- logger.error(f"Error processing item {item.id}: {e}")
324
  continue
325
 
326
- return processed
327
-
328
- def _clean_text_advanced(self, text: str) -> str:
329
- """Advanced text cleaning"""
330
- text = re.sub(r'http\S+|www\.\S+', '', text)
331
- text = re.sub(r'\S+@\S+', '', text)
332
- text = re.sub(r'\s+', ' ', text)
333
- return text.strip()
334
-
335
- def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
336
- """Analyze sentiment"""
337
- try:
338
- text_sample = text[:512]
339
- result = self.sentiment_analyzer(text_sample)[0]
340
- return {
341
- 'label': result['label'],
342
- 'score': result['score']
343
- }
344
- except:
345
- return {'label': 'UNKNOWN', 'score': 0.0}
346
-
347
- def _detect_language(self, text: str) -> str:
348
- """Simple language detection"""
349
- if re.search(r'[а-яё]', text.lower()):
350
- return 'ru'
351
- elif re.search(r'[ñÑéíóúü]', text.lower()):
352
- return 'es'
353
- return 'en'
354
-
355
- class AnnotationEngine:
356
- """Annotation tools for dataset creation"""
357
-
358
- def __init__(self):
359
- self.templates = self._load_templates()
360
-
361
- def _load_templates(self) -> Dict[str, DatasetTemplate]:
362
- """Load dataset templates"""
363
- templates = {
364
- 'text_classification': DatasetTemplate(
365
- name="Text Classification",
366
- description="Classify text into categories",
367
- task_type="classification",
368
- required_fields=["text", "label"],
369
- optional_fields=["confidence", "metadata"],
370
- example_format={"text": "Sample text", "label": "positive"},
371
- instructions="Label each text with appropriate category"
372
- ),
373
- 'sentiment_analysis': DatasetTemplate(
374
- name="Sentiment Analysis",
375
- description="Analyze emotional tone",
376
- task_type="classification",
377
- required_fields=["text", "sentiment"],
378
- optional_fields=["confidence", "aspects"],
379
- example_format={"text": "I love this!", "sentiment": "positive"},
380
- instructions="Classify sentiment as positive, negative, or neutral"
381
- ),
382
- 'named_entity_recognition': DatasetTemplate(
383
- name="Named Entity Recognition",
384
- description="Identify named entities",
385
- task_type="ner",
386
- required_fields=["text", "entities"],
387
- optional_fields=["metadata"],
388
- example_format={
389
- "text": "John works at OpenAI",
390
- "entities": [{"text": "John", "label": "PERSON"}]
391
- },
392
- instructions="Mark all named entities"
393
- ),
394
- 'question_answering': DatasetTemplate(
395
- name="Question Answering",
396
- description="Create Q&A pairs",
397
- task_type="qa",
398
- required_fields=["context", "question", "answer"],
399
- optional_fields=["answer_start", "metadata"],
400
- example_format={
401
- "context": "The capital of France is Paris.",
402
- "question": "What is the capital of France?",
403
- "answer": "Paris"
404
- },
405
- instructions="Create meaningful questions and answers"
406
- ),
407
- 'summarization': DatasetTemplate(
408
- name="Text Summarization",
409
- description="Create summaries",
410
- task_type="summarization",
411
- required_fields=["text", "summary"],
412
- optional_fields=["summary_type", "length"],
413
- example_format={
414
- "text": "Long article text...",
415
- "summary": "Brief summary"
416
- },
417
- instructions="Write clear, concise summaries"
418
- )
419
- }
420
- return templates
421
-
422
- class DatasetExporter:
423
- """Export datasets in various formats"""
424
-
425
- def __init__(self):
426
- self.supported_formats = [
427
- 'json', 'csv', 'jsonl', 'huggingface_datasets'
428
- ]
429
 
430
- def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
431
- export_format: str, annotations: Dict[str, Any] = None) -> str:
432
- """Export dataset"""
433
- try:
434
- dataset_data = self._prepare_data(items, template, annotations)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
 
436
- if export_format == 'json':
437
- return self._export_json(dataset_data)
438
- elif export_format == 'csv':
439
- return self._export_csv(dataset_data)
440
- elif export_format == 'jsonl':
441
- return self._export_jsonl(dataset_data)
442
- elif export_format == 'huggingface_datasets':
443
- return self._export_huggingface(dataset_data, template)
 
 
 
 
 
 
 
 
 
 
 
 
444
  else:
445
- raise ValueError(f"Unsupported format: {export_format}")
446
-
447
- except Exception as e:
448
- logger.error(f"Export failed: {e}")
449
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
- def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate,
452
- annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
453
- """Prepare data according to template"""
454
- dataset_data = []
455
-
456
- for item in items:
457
- data_point = {
458
- 'text': item.content,
459
- 'title': item.title,
460
- 'url': item.url,
461
- 'metadata': item.metadata
462
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
- if annotations and item.id in annotations:
465
- data_point.update(annotations[item.id])
 
466
 
467
- formatted = self._format_for_template(data_point, template)
468
- if formatted:
469
- dataset_data.append(formatted)
 
 
 
470
 
471
- return dataset_data
472
 
473
- def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]:
474
- """Format data according to template"""
475
- formatted = {}
476
-
477
- for field in template.required_fields:
478
- if field in data_point:
479
- formatted[field] = data_point[field]
480
- elif field == 'text' and 'content' in data_point:
481
- formatted[field] = data_point['content']
482
- else:
483
- return None
484
 
485
- for field in template.optional_fields:
486
- if field in data_point:
487
- formatted[field] = data_point[field]
488
 
489
- return formatted
490
-
491
- def _export_json(self, data: List[Dict[str, Any]]) -> str:
492
- """Export as JSON"""
493
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
494
- filename = f"dataset_{timestamp}.json"
495
 
496
- with open(filename, 'w', encoding='utf-8') as f:
497
- json.dump(data, f, indent=2, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
- return filename
500
 
501
- def _export_csv(self, data: List[Dict[str, Any]]) -> str:
502
- """Export as CSV"""
503
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
504
- filename = f"dataset_{timestamp}.csv"
505
 
506
- df = pd.DataFrame(data)
507
- df.to_csv(filename, index=False)
 
508
 
509
- return filename
510
-
511
- def _export_jsonl(self, data: List[Dict[str, Any]]) -> str:
512
- """Export as JSONL"""
513
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
514
- filename = f"dataset_{timestamp}.jsonl"
515
 
516
- with open(filename, 'w', encoding='utf-8') as f:
517
- for item in data:
518
- f.write(json.dumps(item, ensure_ascii=False) + '\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
 
520
- return filename
 
 
 
 
 
 
521
 
522
- def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
523
- """Export as HuggingFace Dataset"""
524
- if not HAS_DATASETS:
525
- raise ImportError("datasets library not available")
526
 
527
- dataset = Dataset.from_list(data)
528
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
529
- dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
530
 
531
- dataset.save_to_disk(dataset_name)
532
- return dataset_name
533
-
534
- class DatasetStudio:
535
- """Main application orchestrator"""
 
 
 
536
 
537
- def __init__(self):
538
- self.scraper = WebScraperEngine()
539
- self.processor = DataProcessor()
540
- self.annotator = AnnotationEngine()
541
- self.exporter = DatasetExporter()
542
-
543
- # Application state
544
- self.scraped_items = []
545
- self.processed_items = []
546
- self.current_project = None
547
- self.annotation_state = {}
548
 
549
- logger.info("✅ DatasetStudio initialized successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
 
551
- def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]:
552
- """Start new project"""
553
- self.current_project = {
554
- 'name': project_name,
555
- 'template': template_type,
556
- 'created_at': datetime.now().isoformat(),
557
- 'id': str(uuid.uuid4())
558
- }
 
 
 
 
 
 
559
 
560
- self.scraped_items = []
561
- self.processed_items = []
562
- self.annotation_state = {}
563
 
564
- logger.info(f"📋 New project: {project_name}")
565
- return self.current_project
566
 
567
- def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
568
- """Scrape URLs"""
569
- url_list = [url.strip() for url in urls if url.strip()]
570
-
571
- if not url_list:
572
- return 0, ["No valid URLs provided"]
573
 
574
- logger.info(f"🕷️ Scraping {len(url_list)} URLs")
575
- self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback)
576
 
577
- success = len(self.scraped_items)
578
- failed = len(url_list) - success
 
 
579
 
580
- errors = []
581
- if failed > 0:
582
- errors.append(f"{failed} URLs failed")
 
583
 
584
- logger.info(f"✅ Scraped {success}, failed {failed}")
585
- return success, errors
586
 
587
- def process_data(self, options: Dict[str, bool]) -> int:
588
- """Process scraped data"""
589
- if not self.scraped_items:
590
- return 0
591
-
592
- logger.info(f"⚙️ Processing {len(self.scraped_items)} items")
593
- self.processed_items = self.processor.process_items(self.scraped_items, options)
594
-
595
- logger.info(f"✅ Processed {len(self.processed_items)} items")
596
- return len(self.processed_items)
 
 
 
 
 
 
597
 
598
- def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]:
599
- """Get data preview"""
600
- items = self.processed_items or self.scraped_items
601
-
602
- preview = []
603
- for item in items[:num_items]:
604
- preview.append({
605
- 'title': item.title,
606
- 'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content,
607
- 'word_count': item.word_count,
608
- 'quality_score': round(item.quality_score, 2),
609
- 'url': item.url
610
- })
611
-
612
- return preview
613
 
614
- def get_data_statistics(self) -> Dict[str, Any]:
615
- """Get dataset statistics"""
616
- items = self.processed_items or self.scraped_items
617
-
618
- if not items:
619
- return {}
620
-
621
- word_counts = [item.word_count for item in items]
622
- quality_scores = [item.quality_score for item in items]
623
-
624
- return {
625
- 'total_items': len(items),
626
- 'avg_word_count': round(np.mean(word_counts)),
627
- 'avg_quality_score': round(np.mean(quality_scores), 2),
628
- 'word_count_range': [min(word_counts), max(word_counts)],
629
- 'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)],
630
- 'languages': list(set(item.language for item in items)),
631
- 'domains': list(set(urlparse(item.url).netloc for item in items))
632
- }
633
 
634
- def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str:
635
- """Export dataset"""
636
- if not self.processed_items and not self.scraped_items:
637
- raise ValueError("No data to export")
638
 
639
- items = self.processed_items or self.scraped_items
640
- template = self.annotator.templates.get(template_name)
641
 
642
- if not template:
643
- raise ValueError(f"Unknown template: {template_name}")
644
 
645
- logger.info(f"📤 Exporting {len(items)} items")
646
- return self.exporter.export_dataset(items, template, export_format, annotations)
647
 
648
  def create_modern_interface():
649
  """Create the modern Gradio interface"""
 
650
 
651
- # Initialize studio
652
  studio = DatasetStudio()
653
 
654
- # Custom CSS
655
- css = """
656
- .gradio-container { max-width: 1400px; margin: auto; }
657
- .studio-header {
658
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
659
- color: white; padding: 2rem; border-radius: 15px;
660
- margin-bottom: 2rem; text-align: center;
661
  }
662
- .workflow-card {
663
- background: #f8f9ff; border: 2px solid #e1e5ff;
664
- border-radius: 12px; padding: 1.5rem; margin: 1rem 0;
 
 
 
 
 
665
  }
 
666
  .step-header {
667
- font-size: 1.2em; font-weight: 600; color: #4c51bf;
668
- margin-bottom: 1rem;
 
 
 
 
669
  }
670
- """
671
 
672
- project_state = gr.State({})
 
 
 
 
 
 
673
 
674
- with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
675
-
676
- # Header
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
  gr.HTML("""
678
- <div class="studio-header">
679
  <h1>🚀 AI Dataset Studio</h1>
680
- <p>Create high-quality training datasets without coding</p>
 
681
  </div>
682
  """)
683
 
684
- with gr.Tabs() as main_tabs:
685
-
686
- # Project Setup
687
- with gr.Tab("🎯 Project Setup"):
688
- gr.HTML('<div class="step-header">Step 1: Create Your Project</div>')
689
 
690
  with gr.Row():
691
  with gr.Column(scale=2):
692
  project_name = gr.Textbox(
693
- label="Project Name",
694
- placeholder="My Dataset Project",
695
- value="News Analysis Dataset"
696
  )
697
 
698
- template_choice = gr.Radio(
699
- choices=[
700
- ("📊 Text Classification", "text_classification"),
701
- ("😊 Sentiment Analysis", "sentiment_analysis"),
702
- ("👥 Named Entity Recognition", "named_entity_recognition"),
703
- ("❓ Question Answering", "question_answering"),
704
- ("📝 Text Summarization", "summarization")
705
- ],
706
- label="Dataset Type",
707
- value="text_classification"
708
  )
709
-
710
- create_project_btn = gr.Button("🚀 Create Project", variant="primary")
711
- project_status = gr.Markdown("")
712
 
713
  with gr.Column(scale=1):
714
- gr.HTML("""
715
- <div class="workflow-card">
716
- <h3>💡 Template Guide</h3>
717
- <p><strong>Text Classification:</strong> Categorize content</p>
718
- <p><strong>Sentiment Analysis:</strong> Analyze emotions</p>
719
- <p><strong>Named Entity Recognition:</strong> Identify entities</p>
720
- <p><strong>Question Answering:</strong> Create Q&A pairs</p>
721
- <p><strong>Summarization:</strong> Generate summaries</p>
722
- </div>
723
- """)
724
-
725
- # Data Collection
726
- with gr.Tab("🕷️ Data Collection"):
727
- gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>')
728
-
729
- with gr.Row():
730
- with gr.Column(scale=2):
731
- urls_input = gr.Textbox(
732
- label="URLs to Scrape (one per line)",
733
- placeholder="https://example.com/article1\nhttps://example.com/article2",
734
- lines=8
735
- )
736
-
737
- scrape_btn = gr.Button("🚀 Start Scraping", variant="primary")
738
- scraping_status = gr.Markdown("")
739
-
740
- with gr.Column(scale=1):
741
- collection_stats = gr.HTML("")
742
-
743
- # Data Processing
744
- with gr.Tab("⚙️ Data Processing"):
745
- gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>')
746
-
747
- with gr.Row():
748
- with gr.Column(scale=2):
749
- with gr.Row():
750
- with gr.Column():
751
- clean_text = gr.Checkbox(label="🧹 Text Cleaning", value=True)
752
- quality_filter = gr.Checkbox(label="🎯 Quality Filter", value=True)
753
- detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)
754
-
755
- with gr.Column():
756
- add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
757
- extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
758
-
759
- process_btn = gr.Button("⚙️ Process Data", variant="primary")
760
- processing_status = gr.Markdown("")
761
-
762
- with gr.Column(scale=1):
763
- processing_stats = gr.HTML("")
764
-
765
- # Data Preview
766
- with gr.Tab("👀 Data Preview"):
767
- gr.HTML('<div class="step-header">Step 4: Review Dataset</div>')
768
-
769
- with gr.Row():
770
- with gr.Column(scale=2):
771
- refresh_btn = gr.Button("🔄 Refresh Preview", variant="secondary")
772
-
773
- data_preview = gr.DataFrame(
774
- headers=["Title", "Content Preview", "Words", "Quality", "URL"],
775
- label="Dataset Preview"
776
- )
777
-
778
- with gr.Column(scale=1):
779
- dataset_stats = gr.JSON(label="Statistics")
780
-
781
- # Export
782
- with gr.Tab("📤 Export Dataset"):
783
- gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>')
784
-
785
- with gr.Row():
786
- with gr.Column(scale=2):
787
- export_format = gr.Radio(
788
- choices=[
789
- ("📄 JSON", "json"),
790
- ("📊 CSV", "csv"),
791
- ("📋 JSONL", "jsonl"),
792
- ("🤗 HuggingFace", "huggingface_datasets")
793
- ],
794
- label="Export Format",
795
- value="json"
796
- )
797
 
798
- export_template = gr.Dropdown(
799
- choices=[
800
- "text_classification",
801
- "sentiment_analysis",
802
- "named_entity_recognition",
803
- "question_answering",
804
- "summarization"
805
- ],
806
- label="Template",
807
- value="text_classification"
808
  )
809
 
810
- export_btn = gr.Button("📤 Export Dataset", variant="primary")
811
- export_status = gr.Markdown("")
812
- export_file = gr.File(label="Download", visible=False)
813
-
814
- with gr.Column(scale=1):
815
- gr.HTML("""
816
- <div class="workflow-card">
817
- <h3>📋 Export Info</h3>
818
- <p><strong>JSON:</strong> Universal format</p>
819
- <p><strong>CSV:</strong> Excel compatible</p>
820
- <p><strong>JSONL:</strong> Line-separated</p>
821
- <p><strong>HuggingFace:</strong> ML ready</p>
822
- </div>
823
- """)
824
-
825
- # Event handlers
826
- def create_project(name, template):
827
- if not name.strip():
828
- return "❌ Please enter a project name", {}
829
-
830
- project = studio.start_new_project(name.strip(), template)
831
- status = f"""
832
- ✅ **Project Created!**
833
-
834
- **Name:** {project['name']}
835
- **Type:** {template.replace('_', ' ').title()}
836
- **ID:** {project['id'][:8]}...
837
-
838
- 👉 Next: Go to Data Collection tab
839
- """
840
- return status, project
841
-
842
- def scrape_urls_handler(urls_text, project, progress=gr.Progress()):
843
- if not project:
844
- return "❌ Create a project first", ""
845
-
846
- urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
847
- if not urls:
848
- return "❌ No URLs provided", ""
849
-
850
- def progress_callback(pct, msg):
851
- progress(pct, desc=msg)
852
-
853
- success, errors = studio.scrape_urls(urls, progress_callback)
854
-
855
- if success > 0:
856
- stats = f"""
857
- <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
858
- <h3>✅ Scraping Complete</h3>
859
- <p><strong>{success}</strong> items collected</p>
860
- </div>
861
- """
862
 
863
- status = f"""
864
- ✅ **Scraping Complete!**
865
 
866
- **Success:** {success} URLs
867
- **Failed:** {len(urls) - success} URLs
 
 
 
 
 
 
 
 
 
868
 
869
- 👉 Next: Go to Data Processing tab
870
- """
871
-
872
- return status, stats
873
- else:
874
- return f"❌ Scraping failed: {', '.join(errors)}", ""
875
-
876
- def process_data_handler(clean, quality, language, sentiment, entities, project):
877
- if not project:
878
- return "❌ Create a project first", ""
879
-
880
- if not studio.scraped_items:
881
- return "❌ No data to process. Scrape URLs first.", ""
882
-
883
- options = {
884
- 'clean_text': clean,
885
- 'quality_filter': quality,
886
- 'detect_language': language,
887
- 'add_sentiment': sentiment,
888
- 'extract_entities': entities
889
- }
890
 
891
- processed = studio.process_data(options)
892
-
893
- if processed > 0:
894
- stats = studio.get_data_statistics()
895
- stats_html = f"""
896
- <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
897
- <h3>⚙️ Processing Complete</h3>
898
- <p><strong>{processed}</strong> items processed</p>
899
- <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
900
- </div>
901
- """
902
-
903
- status = f"""
904
- ✅ **Processing Complete!**
905
 
906
- **Processed:** {processed} items
907
- **Avg Quality:** {stats.get('avg_quality_score', 0)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
908
 
909
- 👉 Next: Check Data Preview tab
910
- """
911
 
912
- return status, stats_html
913
- else:
914
- return "❌ No items passed filters", ""
915
-
916
- def refresh_preview_handler(project):
917
- if not project:
918
- return None, {}
919
-
920
- preview = studio.get_data_preview()
921
- stats = studio.get_data_statistics()
922
-
923
- if preview:
924
- df_data = []
925
- for item in preview:
926
- df_data.append([
927
- item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
928
- item['content_preview'],
929
- item['word_count'],
930
- item['quality_score'],
931
- item['url'][:50] + "..." if len(item['url']) > 50 else item['url']
932
- ])
933
 
934
- return df_data, stats
935
-
936
- return None, {}
937
-
938
- def export_handler(format_type, template, project):
939
- if not project:
940
- return "❌ Create a project first", None
941
-
942
- if not studio.processed_items and not studio.scraped_items:
943
- return "❌ No data to export", None
944
 
945
- try:
946
- filename = studio.export_dataset(template, format_type)
 
947
 
948
- status = f"""
949
- ✅ **Export Successful!**
 
 
 
 
950
 
951
- **Format:** {format_type}
952
- **File:** {filename}
953
-
954
- 📥 Download link below
955
- """
 
 
956
 
957
- return status, filename
 
 
 
 
 
958
 
959
- except Exception as e:
960
- return f"❌ Export failed: {str(e)}", None
 
961
 
962
- # Connect events
963
  create_project_btn.click(
964
- fn=create_project,
965
- inputs=[project_name, template_choice],
966
- outputs=[project_status, project_state]
967
  )
968
 
 
 
 
 
 
 
 
 
 
 
 
 
 
969
  scrape_btn.click(
970
- fn=scrape_urls_handler,
971
- inputs=[urls_input, project_state],
972
- outputs=[scraping_status, collection_stats]
973
  )
974
 
975
  process_btn.click(
976
- fn=process_data_handler,
977
- inputs=[clean_text, quality_filter, detect_language,
978
- add_sentiment, extract_entities, project_state],
979
- outputs=[processing_status, processing_stats]
980
- )
981
-
982
- refresh_btn.click(
983
- fn=refresh_preview_handler,
984
- inputs=[project_state],
985
- outputs=[data_preview, dataset_stats]
986
  )
987
 
988
  export_btn.click(
989
- fn=export_handler,
990
- inputs=[export_format, export_template, project_state],
991
- outputs=[export_status, export_file]
992
  )
993
 
 
994
  return interface
995
 
996
- # Launch application
997
- if __name__ == "__main__":
998
  logger.info("🚀 Starting AI Dataset Studio...")
 
999
 
1000
- # Check features
1001
- features = []
1002
- if HAS_TRANSFORMERS:
1003
- features.append("✅ AI Models")
1004
- else:
1005
- features.append("⚠️ Basic Processing")
1006
-
1007
- if HAS_NLTK:
1008
- features.append("✅ Advanced NLP")
1009
- else:
1010
- features.append("⚠️ Basic NLP")
1011
 
1012
- if HAS_DATASETS:
1013
- features.append("✅ HuggingFace Integration")
1014
- else:
1015
- features.append("⚠️ Standard Export")
1016
 
1017
- logger.info(f"📊 Features: {' | '.join(features)}")
1018
-
1019
- try:
1020
- # Test DatasetStudio
1021
- test_studio = DatasetStudio()
1022
- logger.info("✅ DatasetStudio test passed")
1023
-
1024
- interface = create_modern_interface()
1025
- logger.info("✅ Interface created successfully")
1026
-
1027
  interface.launch(
1028
  server_name="0.0.0.0",
1029
  server_port=7860,
1030
  share=False,
1031
  show_error=True
1032
  )
1033
-
1034
- except Exception as e:
1035
- logger.error(f"❌ Failed to launch: {e}")
1036
- logger.error("💡 Try: python app_minimal.py")
1037
- raise
 
1
  """
2
+ 🚀 AI Dataset Studio with Perplexity AI Integration
3
+ A comprehensive platform for creating high-quality training datasets using AI-powered source discovery
4
  """
5
 
6
  import gradio as gr
7
  import pandas as pd
8
+ import requests
9
  import json
10
+ import logging
11
+ import os
12
+ import sys
13
+ import time
14
  import re
15
+ from datetime import datetime
16
+ from typing import List, Dict, Optional, Tuple, Any
17
  from urllib.parse import urlparse, urljoin
 
 
 
18
  from dataclasses import dataclass, asdict
19
+ import traceback
 
 
 
 
 
20
 
21
+ # Configure logging
22
+ logging.basicConfig(
23
+ level=logging.INFO,
24
+ format='%(asctime)s - %(levelname)s - %(message)s'
25
+ )
26
+ logger = logging.getLogger(__name__)
27
+
28
+ # Try to import required packages with fallbacks
29
  try:
30
+ from bs4 import BeautifulSoup
31
+ logger.info("✅ BeautifulSoup imported successfully")
32
+ except ImportError as e:
33
+ logger.error("❌ Failed to import BeautifulSoup: %s", e)
34
+ sys.exit(1)
35
 
36
  try:
37
  import nltk
38
+ from nltk.corpus import stopwords
39
+ from nltk.tokenize import word_tokenize, sent_tokenize
40
+ logger.info("✅ NLTK imported successfully")
41
  HAS_NLTK = True
42
  except ImportError:
43
+ logger.warning("⚠️ NLTK not available - using basic text processing")
44
  HAS_NLTK = False
45
 
46
  try:
47
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
48
+ import torch
49
+ logger.info("✅ Transformers imported successfully")
50
+ HAS_TRANSFORMERS = True
51
  except ImportError:
52
+ logger.warning("⚠️ Transformers not available - using extractive summaries")
53
+ HAS_TRANSFORMERS = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ # Import Perplexity client
56
+ try:
57
+ from perplexity_client import PerplexityClient, SearchType, SourceResult, SearchResults
58
+ logger.info("✅ Perplexity client imported successfully")
59
+ HAS_PERPLEXITY = True
60
+ except ImportError:
61
+ logger.warning("⚠️ Perplexity client not available - manual source entry only")
62
+ HAS_PERPLEXITY = False
63
 
64
+ # Dataset templates
65
+ DATASET_TEMPLATES = {
66
+ "sentiment_analysis": {
67
+ "name": "📊 Sentiment Analysis",
68
+ "description": "Classify text as positive, negative, or neutral",
69
+ "fields": ["text", "sentiment"],
70
+ "example": {"text": "This product is amazing!", "sentiment": "positive"},
71
+ "search_queries": ["product reviews", "customer feedback", "social media posts", "movie reviews"]
72
+ },
73
+ "text_classification": {
74
+ "name": "📂 Text Classification",
75
+ "description": "Categorize text into predefined classes",
76
+ "fields": ["text", "category"],
77
+ "example": {"text": "Breaking: Stock market reaches new high", "category": "finance"},
78
+ "search_queries": ["news articles", "blog posts", "academic papers", "forum discussions"]
79
+ },
80
+ "named_entity_recognition": {
81
+ "name": "🏷️ Named Entity Recognition",
82
+ "description": "Identify people, places, organizations in text",
83
+ "fields": ["text", "entities"],
84
+ "example": {"text": "Apple Inc. was founded by Steve Jobs in California",
85
+ "entities": [{"text": "Apple Inc.", "label": "ORG"}, {"text": "Steve Jobs", "label": "PERSON"}]},
86
+ "search_queries": ["news articles", "biographies", "company reports", "wikipedia articles"]
87
+ },
88
+ "question_answering": {
89
+ "name": "❓ Question Answering",
90
+ "description": "Extract answers from context passages",
91
+ "fields": ["context", "question", "answer"],
92
+ "example": {"context": "The capital of France is Paris", "question": "What is the capital of France?", "answer": "Paris"},
93
+ "search_queries": ["FAQ pages", "educational content", "interview transcripts", "knowledge bases"]
94
+ },
95
+ "text_summarization": {
96
+ "name": "📝 Text Summarization",
97
+ "description": "Generate concise summaries of longer texts",
98
+ "fields": ["text", "summary"],
99
+ "example": {"text": "Long article content...", "summary": "Brief summary of key points"},
100
+ "search_queries": ["news articles", "research papers", "blog posts", "reports"]
101
+ },
102
+ "translation": {
103
+ "name": "🌐 Translation",
104
+ "description": "Translate text between languages",
105
+ "fields": ["source_text", "target_text", "source_lang", "target_lang"],
106
+ "example": {"source_text": "Hello world", "target_text": "Hola mundo", "source_lang": "en", "target_lang": "es"},
107
+ "search_queries": ["multilingual websites", "international news", "translation datasets", "parallel corpora"]
108
+ }
109
+ }
110
 
111
+ class DatasetStudio:
112
+ """
113
+ 🎯 Main Dataset Studio Class
114
+ Handles all core functionality for dataset creation
115
+ """
116
 
117
+ def __init__(self):
118
+ """Initialize the Dataset Studio"""
119
+ logger.info("🚀 Initializing AI Dataset Studio...")
120
+
121
+ # Initialize components
122
+ self.projects = {}
123
+ self.current_project = None
124
+ self.scraped_data = []
125
+ self.processed_data = []
126
+
127
+ # Initialize AI models if available
128
+ self.sentiment_analyzer = None
129
+ self.summarizer = None
130
+ self.ner_model = None
131
+
132
+ # Initialize Perplexity client
133
+ self.perplexity_client = None
134
+ if HAS_PERPLEXITY:
135
+ try:
136
+ api_key = os.getenv('PERPLEXITY_API_KEY')
137
+ if api_key:
138
+ self.perplexity_client = PerplexityClient(api_key)
139
+ logger.info("✅ Perplexity AI client initialized")
140
+ else:
141
+ logger.warning("⚠️ PERPLEXITY_API_KEY not found - manual source entry only")
142
+ except Exception as e:
143
+ logger.error(f"❌ Failed to initialize Perplexity client: {e}")
144
+
145
+ self._load_models()
146
+ logger.info("✅ Dataset Studio initialized successfully")
147
 
148
+ def _load_models(self):
149
+ """Load AI models for processing"""
150
+ if not HAS_TRANSFORMERS:
151
+ logger.info("⚠️ Skipping model loading - transformers not available")
152
+ return
153
+
154
  try:
155
+ # Load sentiment analysis model
156
+ logger.info("📦 Loading sentiment analysis model...")
157
+ self.sentiment_analyzer = pipeline(
158
+ "sentiment-analysis",
159
+ model="cardiffnlp/twitter-roberta-base-sentiment-latest",
160
+ return_all_scores=True
161
+ )
162
+ logger.info("✅ Sentiment analyzer loaded")
163
 
164
+ except Exception as e:
165
+ logger.warning(f"⚠️ Could not load sentiment analyzer: {e}")
166
+
167
+ try:
168
+ # Load summarization model
169
+ logger.info("📦 Loading summarization model...")
170
+ self.summarizer = pipeline(
171
+ "summarization",
172
+ model="facebook/bart-large-cnn",
173
+ max_length=150,
174
+ min_length=30,
175
+ do_sample=False
176
+ )
177
+ logger.info("✅ Summarizer loaded")
178
 
179
  except Exception as e:
180
+ logger.warning(f"⚠️ Could not load summarizer: {e}")
181
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  try:
183
+ # Load NER model
184
+ logger.info("📦 Loading NER model...")
185
+ self.ner_model = pipeline(
186
+ "ner",
187
+ model="dbmdz/bert-large-cased-finetuned-conll03-english",
188
+ aggregation_strategy="simple"
189
+ )
190
+ logger.info("✅ NER model loaded")
191
 
192
+ except Exception as e:
193
+ logger.warning(f"⚠️ Could not load NER model: {e}")
194
+
195
+ def discover_sources_with_ai(
196
+ self,
197
+ project_description: str,
198
+ max_sources: int = 20,
199
+ search_type: str = "general",
200
+ include_academic: bool = True,
201
+ include_news: bool = True
202
+ ) -> Tuple[str, str]:
203
+ """
204
+ 🧠 Discover sources using Perplexity AI
205
+
206
+ Args:
207
+ project_description: Description of the dataset project
208
+ max_sources: Maximum number of sources to find
209
+ search_type: Type of search (general, academic, news, etc.)
210
+ include_academic: Include academic sources
211
+ include_news: Include news sources
212
 
213
+ Returns:
214
+ Tuple of (status_message, sources_json)
215
+ """
216
+ if not self.perplexity_client:
217
+ return "❌ Perplexity AI not available. Please set PERPLEXITY_API_KEY environment variable.", "[]"
218
+
219
+ try:
220
+ logger.info(f"🔍 Discovering sources for: {project_description}")
221
 
222
+ # Map string to enum
223
+ search_type_enum = getattr(SearchType, search_type.upper(), SearchType.GENERAL)
 
 
224
 
225
+ # Discover sources
226
+ results = self.perplexity_client.discover_sources(
227
+ project_description=project_description,
228
+ search_type=search_type_enum,
229
+ max_sources=max_sources,
230
+ include_academic=include_academic,
231
+ include_news=include_news
 
 
 
232
  )
233
 
234
+ if not results.sources:
235
+ return "⚠️ No sources found. Try adjusting your search terms.", "[]"
236
 
237
+ # Format results for display
238
+ sources_data = []
239
+ for source in results.sources:
240
+ sources_data.append({
241
+ "URL": source.url,
242
+ "Title": source.title,
243
+ "Description": source.description,
244
+ "Type": source.source_type,
245
+ "Domain": source.domain,
246
+ "Quality Score": f"{source.relevance_score:.1f}/10"
247
+ })
 
248
 
249
+ status = f"✅ Found {len(results.sources)} sources in {results.search_time:.1f}s"
250
+ if results.suggestions:
251
+ status += f"\n💡 Suggestions: {', '.join(results.suggestions[:3])}"
252
 
253
+ return status, json.dumps(sources_data, indent=2)
254
+
255
+ except Exception as e:
256
+ logger.error(f"❌ Error discovering sources: {e}")
257
+ return f"❌ Error: {str(e)}", "[]"
258
 
259
+ def extract_urls_from_sources(self, sources_json: str) -> List[str]:
260
+ """Extract URLs from discovered sources JSON"""
261
+ try:
262
+ sources = json.loads(sources_json)
263
+ if isinstance(sources, list):
264
+ return [source.get("URL", "") for source in sources if source.get("URL")]
265
+ return []
266
+ except:
267
+ return []
 
 
268
 
269
+ def create_project(self, name: str, template: str, description: str) -> str:
270
+ """Create a new dataset project"""
271
+ if not name.strip():
272
+ return "❌ Please provide a project name"
273
+
274
+ project_id = f"project_{int(time.time())}"
275
+ self.projects[project_id] = {
276
+ "name": name,
277
+ "template": template,
278
+ "description": description,
279
+ "created_at": datetime.now().isoformat(),
280
+ "urls": [],
281
+ "data": [],
282
+ "processed_data": []
283
+ }
284
 
285
+ self.current_project = project_id
 
 
 
 
 
286
 
287
+ template_info = DATASET_TEMPLATES.get(template, {})
288
+ status = f"✅ Project '{name}' created successfully!\n"
289
+ status += f"📋 Template: {template_info.get('name', template)}\n"
290
+ status += f"📝 Description: {description}\n"
291
+ status += f"🆔 Project ID: {project_id}"
292
 
293
+ return status
294
 
295
+ def scrape_urls(self, urls_text: str, progress=gr.Progress()) -> Tuple[str, str]:
296
+ """Scrape content from provided URLs"""
297
+ if not self.current_project:
298
+ return "❌ Please create a project first", ""
 
 
 
299
 
300
+ # Parse URLs
301
+ urls = []
302
+ for line in urls_text.strip().split('\n'):
303
+ url = line.strip()
304
+ if url and self._is_valid_url(url):
305
+ urls.append(url)
306
 
307
+ if not urls:
308
+ return "❌ No valid URLs found", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
+ scraped_data = []
311
+ failed_urls = []
 
 
312
 
313
+ progress(0, desc="Starting scraping...")
 
 
314
 
315
+ for i, url in enumerate(urls):
316
+ try:
317
+ progress((i + 1) / len(urls), desc=f"Scraping {i + 1}/{len(urls)}")
318
+
319
+ logger.info(f"🔍 Scraping: {url}")
320
+
321
+ # Make request
322
+ headers = {
323
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
324
+ }
325
+
326
+ response = requests.get(url, headers=headers, timeout=10)
327
+ response.raise_for_status()
328
+
329
+ # Parse content
330
+ soup = BeautifulSoup(response.content, 'html.parser')
331
+
332
+ # Extract text content
333
+ title = self._extract_title(soup)
334
+ content = self._extract_content(soup)
335
+
336
+ if content:
337
+ scraped_data.append({
338
+ 'url': url,
339
+ 'title': title,
340
+ 'content': content,
341
+ 'length': len(content),
342
+ 'scraped_at': datetime.now().isoformat()
343
+ })
344
+ logger.info(f"✅ Scraped {len(content)} characters from {url}")
345
+ else:
346
+ failed_urls.append(url)
347
+ logger.warning(f"⚠️ No content extracted from {url}")
348
+
349
+ # Rate limiting
350
+ time.sleep(0.5)
351
+
352
+ except Exception as e:
353
+ failed_urls.append(url)
354
+ logger.error(f"❌ Failed to scrape {url}: {e}")
355
+
356
+ # Store results
357
+ self.projects[self.current_project]['urls'] = urls
358
+ self.projects[self.current_project]['data'] = scraped_data
359
+ self.scraped_data = scraped_data
360
+
361
+ # Create status message
362
+ status = f"✅ Scraping completed!\n"
363
+ status += f"📊 Successfully scraped: {len(scraped_data)} URLs\n"
364
+ status += f"❌ Failed: {len(failed_urls)} URLs\n"
365
+ status += f"📝 Total content: {sum(item['length'] for item in scraped_data):,} characters"
366
+
367
+ if failed_urls:
368
+ status += f"\n\nFailed URLs:\n" + "\n".join(f"• {url}" for url in failed_urls[:5])
369
+ if len(failed_urls) > 5:
370
+ status += f"\n... and {len(failed_urls) - 5} more"
371
+
372
+ # Create preview data
373
+ preview_data = []
374
+ for item in scraped_data[:10]: # Show first 10
375
+ preview_data.append({
376
+ "Title": item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
377
+ "URL": item['url'],
378
+ "Length": f"{item['length']:,} chars",
379
+ "Preview": item['content'][:100] + "..." if len(item['content']) > 100 else item['content']
380
+ })
381
 
382
+ return status, json.dumps(preview_data, indent=2)
 
 
 
 
 
 
 
 
383
 
384
+ def process_data(self, template: str, progress=gr.Progress()) -> Tuple[str, str]:
385
+ """Process scraped data according to template"""
386
+ if not self.scraped_data:
387
+ return "❌ No scraped data available. Please scrape URLs first.", ""
 
388
 
389
+ template_config = DATASET_TEMPLATES.get(template, {})
390
+ if not template_config:
391
+ return f"❌ Unknown template: {template}", ""
 
 
 
 
 
 
 
 
 
392
 
393
+ processed_data = []
394
+
395
+ progress(0, desc="Starting data processing...")
396
+
397
+ for i, item in enumerate(self.scraped_data):
398
  try:
399
+ progress((i + 1) / len(self.scraped_data), desc=f"Processing {i + 1}/{len(self.scraped_data)}")
 
 
400
 
401
+ content = item['content']
 
 
402
 
403
+ # Process based on template
404
+ if template == "sentiment_analysis":
405
+ processed_item = self._process_sentiment_analysis(item)
406
+ elif template == "text_classification":
407
+ processed_item = self._process_text_classification(item)
408
+ elif template == "named_entity_recognition":
409
+ processed_item = self._process_ner(item)
410
+ elif template == "question_answering":
411
+ processed_item = self._process_qa(item)
412
+ elif template == "text_summarization":
413
+ processed_item = self._process_summarization(item)
414
+ elif template == "translation":
415
+ processed_item = self._process_translation(item)
416
+ else:
417
+ processed_item = self._process_generic(item)
418
 
419
+ if processed_item:
420
+ processed_data.extend(processed_item)
 
 
 
421
 
422
  except Exception as e:
423
+ logger.error(f"❌ Error processing item {i}: {e}")
424
  continue
425
 
426
+ # Store processed data
427
+ self.processed_data = processed_data
428
+ if self.current_project:
429
+ self.projects[self.current_project]['processed_data'] = processed_data
430
+
431
+ # Create status
432
+ status = f"✅ Processing completed!\n"
433
+ status += f"📊 Generated {len(processed_data)} training examples\n"
434
+ status += f"📋 Template: {template_config['name']}\n"
435
+ status += f"🏷️ Fields: {', '.join(template_config['fields'])}"
436
+
437
+ # Create preview
438
+ preview_data = processed_data[:10] if processed_data else []
439
+
440
+ return status, json.dumps(preview_data, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
 
442
+ def _process_sentiment_analysis(self, item: Dict) -> List[Dict]:
443
+ """Process item for sentiment analysis"""
444
+ content = item['content']
445
+
446
+ # Split into sentences for more training examples
447
+ if HAS_NLTK:
448
+ try:
449
+ sentences = sent_tokenize(content)
450
+ except:
451
+ sentences = content.split('. ')
452
+ else:
453
+ sentences = content.split('. ')
454
+
455
+ results = []
456
+
457
+ for sentence in sentences:
458
+ sentence = sentence.strip()
459
+ if len(sentence) < 10 or len(sentence) > 500: # Filter by length
460
+ continue
461
 
462
+ # Use AI model if available
463
+ if self.sentiment_analyzer:
464
+ try:
465
+ prediction = self.sentiment_analyzer(sentence)[0]
466
+ # Map labels
467
+ label_map = {'POSITIVE': 'positive', 'NEGATIVE': 'negative', 'NEUTRAL': 'neutral'}
468
+ sentiment = label_map.get(prediction[0]['label'], 'neutral')
469
+ confidence = prediction[0]['score']
470
+
471
+ # Only include high-confidence predictions
472
+ if confidence > 0.7:
473
+ results.append({
474
+ 'text': sentence,
475
+ 'sentiment': sentiment,
476
+ 'confidence': confidence,
477
+ 'source_url': item['url']
478
+ })
479
+ except Exception as e:
480
+ logger.debug(f"Sentiment analysis failed: {e}")
481
+ continue
482
  else:
483
+ # Fallback: keyword-based sentiment
484
+ sentiment = self._keyword_sentiment(sentence)
485
+ results.append({
486
+ 'text': sentence,
487
+ 'sentiment': sentiment,
488
+ 'source_url': item['url']
489
+ })
490
+
491
+ return results[:20] # Limit per document
492
+
493
+ def _process_text_classification(self, item: Dict) -> List[Dict]:
494
+ """Process item for text classification"""
495
+ content = item['content']
496
+
497
+ # Extract domain-based category
498
+ url = item['url']
499
+ category = self._extract_category_from_url(url)
500
+
501
+ # Split into paragraphs
502
+ paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 50]
503
+
504
+ results = []
505
+ for paragraph in paragraphs[:10]: # Limit per document
506
+ results.append({
507
+ 'text': paragraph,
508
+ 'category': category,
509
+ 'source_url': url
510
+ })
511
+
512
+ return results
513
 
514
+ def _process_ner(self, item: Dict) -> List[Dict]:
515
+ """Process item for Named Entity Recognition"""
516
+ content = item['content']
517
+
518
+ if HAS_NLTK:
519
+ try:
520
+ sentences = sent_tokenize(content)
521
+ except:
522
+ sentences = content.split('. ')
523
+ else:
524
+ sentences = content.split('. ')
525
+
526
+ results = []
527
+
528
+ for sentence in sentences[:20]: # Limit per document
529
+ sentence = sentence.strip()
530
+ if len(sentence) < 20:
531
+ continue
532
+
533
+ entities = []
534
+
535
+ if self.ner_model:
536
+ try:
537
+ ner_results = self.ner_model(sentence)
538
+ for entity in ner_results:
539
+ entities.append({
540
+ 'text': entity['word'],
541
+ 'label': entity['entity_group'],
542
+ 'confidence': entity['score']
543
+ })
544
+ except Exception as e:
545
+ logger.debug(f"NER failed: {e}")
546
 
547
+ # Fallback: simple pattern matching
548
+ if not entities:
549
+ entities = self._simple_ner(sentence)
550
 
551
+ if entities:
552
+ results.append({
553
+ 'text': sentence,
554
+ 'entities': entities,
555
+ 'source_url': item['url']
556
+ })
557
 
558
+ return results
559
 
560
+ def _process_qa(self, item: Dict) -> List[Dict]:
561
+ """Process item for Question Answering"""
562
+ content = item['content']
 
 
 
 
 
 
 
 
563
 
564
+ # Generate simple Q&A pairs based on content
565
+ results = []
 
566
 
567
+ # Look for FAQ-style patterns
568
+ qa_patterns = [
569
+ (r'Q:\s*(.+?)\s*A:\s*(.+?)(?=Q:|$)', 'qa'),
570
+ (r'Question:\s*(.+?)\s*Answer:\s*(.+?)(?=Question:|$)', 'qa'),
571
+ (r'(.+\?)\s*(.+?)(?=.+\?|$)', 'simple')
572
+ ]
573
 
574
+ for pattern, style in qa_patterns:
575
+ matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
576
+
577
+ for match in matches[:10]: # Limit per document
578
+ if len(match) == 2:
579
+ question = match[0].strip()
580
+ answer = match[1].strip()
581
+
582
+ if len(question) > 10 and len(answer) > 10:
583
+ results.append({
584
+ 'context': content[:500], # First 500 chars as context
585
+ 'question': question,
586
+ 'answer': answer,
587
+ 'source_url': item['url']
588
+ })
589
 
590
+ return results
591
 
592
+ def _process_summarization(self, item: Dict) -> List[Dict]:
593
+ """Process item for summarization"""
594
+ content = item['content']
 
595
 
596
+ # Split into chunks for summarization
597
+ chunk_size = 1000
598
+ chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
599
 
600
+ results = []
 
 
 
 
 
601
 
602
+ for chunk in chunks[:5]: # Limit per document
603
+ if len(chunk) < 100:
604
+ continue
605
+
606
+ summary = ""
607
+
608
+ if self.summarizer and len(chunk) > 100:
609
+ try:
610
+ summary_result = self.summarizer(chunk, max_length=100, min_length=30)
611
+ summary = summary_result[0]['summary_text']
612
+ except Exception as e:
613
+ logger.debug(f"Summarization failed: {e}")
614
+
615
+ # Fallback: extractive summary
616
+ if not summary:
617
+ summary = self._extractive_summary(chunk)
618
+
619
+ if summary:
620
+ results.append({
621
+ 'text': chunk,
622
+ 'summary': summary,
623
+ 'source_url': item['url']
624
+ })
625
 
626
+ return results
627
+
628
+ def _process_translation(self, item: Dict) -> List[Dict]:
629
+ """Process item for translation (placeholder)"""
630
+ # This would require actual translation models
631
+ # For now, return empty to avoid errors
632
+ return []
633
 
634
+ def _process_generic(self, item: Dict) -> List[Dict]:
635
+ """Generic processing for unknown templates"""
636
+ content = item['content']
 
637
 
638
+ # Split into paragraphs
639
+ paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 50]
 
640
 
641
+ results = []
642
+ for paragraph in paragraphs[:10]:
643
+ results.append({
644
+ 'text': paragraph,
645
+ 'source_url': item['url']
646
+ })
647
+
648
+ return results
649
 
650
+ def export_dataset(self, format_type: str) -> Tuple[str, str]:
651
+ """Export processed dataset"""
652
+ if not self.processed_data:
653
+ return "❌ No processed data available", ""
 
 
 
 
 
 
 
654
 
655
+ try:
656
+ if format_type == "JSON":
657
+ data = json.dumps(self.processed_data, indent=2)
658
+ filename = f"dataset_{int(time.time())}.json"
659
+
660
+ elif format_type == "CSV":
661
+ df = pd.DataFrame(self.processed_data)
662
+ data = df.to_csv(index=False)
663
+ filename = f"dataset_{int(time.time())}.csv"
664
+
665
+ elif format_type == "HuggingFace Dataset":
666
+ # Format for HuggingFace datasets
667
+ hf_data = {
668
+ "data": self.processed_data,
669
+ "info": {
670
+ "description": "AI Dataset Studio generated dataset",
671
+ "created_at": datetime.now().isoformat(),
672
+ "size": len(self.processed_data)
673
+ }
674
+ }
675
+ data = json.dumps(hf_data, indent=2)
676
+ filename = f"hf_dataset_{int(time.time())}.json"
677
+
678
+ elif format_type == "JSONL":
679
+ lines = [json.dumps(item) for item in self.processed_data]
680
+ data = '\n'.join(lines)
681
+ filename = f"dataset_{int(time.time())}.jsonl"
682
+
683
+ else:
684
+ return "❌ Unsupported format", ""
685
+
686
+ # Save to temporary file for download
687
+ temp_path = f"/tmp/{filename}"
688
+ with open(temp_path, 'w', encoding='utf-8') as f:
689
+ f.write(data)
690
+
691
+ status = f"βœ… Dataset exported successfully!\n"
692
+ status += f"πŸ“Š Records: {len(self.processed_data)}\n"
693
+ status += f"πŸ“ Format: {format_type}\n"
694
+ status += f"πŸ“„ Size: {len(data):,} characters"
695
+
696
+ return status, temp_path
697
+
698
+ except Exception as e:
699
+ logger.error(f"Export failed: {e}")
700
+ return f"❌ Export failed: {str(e)}", ""
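# Illustrative sketch (not from app.py): reading an exported file back in for training.
# The file paths below are placeholders standing in for whatever export_dataset returns.
import pandas as pd
from datasets import load_dataset  # optional dependency
df = pd.read_csv("/tmp/dataset_1700000000.csv")  # CSV export
ds = load_dataset("json", data_files="/tmp/dataset_1700000000.jsonl", split="train")  # JSONL export
print(len(df), len(ds))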
701
 
702
+ # Helper methods
703
+ def _is_valid_url(self, url: str) -> bool:
704
+ """Validate URL format"""
705
+ try:
706
+ result = urlparse(url)
707
+ return all([result.scheme, result.netloc])
708
+ except:
709
+ return False
710
+
711
+ def _extract_title(self, soup: BeautifulSoup) -> str:
712
+ """Extract title from HTML"""
713
+ title_tag = soup.find('title')
714
+ if title_tag:
715
+ return title_tag.get_text().strip()
716
 
717
+ h1_tag = soup.find('h1')
718
+ if h1_tag:
719
+ return h1_tag.get_text().strip()
720
 
721
+ return "Untitled"
 
722
 
723
+ def _extract_content(self, soup: BeautifulSoup) -> str:
724
+ """Extract main content from HTML"""
725
+ # Remove script and style elements
726
+ for script in soup(["script", "style", "nav", "footer", "header"]):
727
+ script.decompose()
 
728
 
729
+ # Try to find main content
730
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article'))
731
 
732
+ if main_content:
733
+ text = main_content.get_text()
734
+ else:
735
+ text = soup.get_text()
736
 
737
+ # Clean text
738
+ lines = (line.strip() for line in text.splitlines())
739
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
740
+ text = ' '.join(chunk for chunk in chunks if chunk)
741
 
742
+ return text
 
743
 
744
+ def _keyword_sentiment(self, text: str) -> str:
745
+ """Simple keyword-based sentiment analysis"""
746
+ positive_words = ['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like']
747
+ negative_words = ['bad', 'terrible', 'awful', 'hate', 'dislike', 'horrible', 'worst']
748
+
749
+ text_lower = text.lower()
750
+
751
+ pos_count = sum(1 for word in positive_words if word in text_lower)
752
+ neg_count = sum(1 for word in negative_words if word in text_lower)
753
+
754
+ if pos_count > neg_count:
755
+ return 'positive'
756
+ elif neg_count > pos_count:
757
+ return 'negative'
758
+ else:
759
+ return 'neutral'
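# Illustrative behaviour of the keyword heuristic above; "checker" is hypothetical and
# stands for whichever class hosts _keyword_sentiment in app.py.
checker = DatasetStudio()
print(checker._keyword_sentiment("I love this, it is great"))       # -> 'positive'
print(checker._keyword_sentiment("Terrible quality, the worst"))    # -> 'negative'
print(checker._keyword_sentiment("The parcel arrived on Tuesday"))  # -> 'neutral'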
760
 
761
+ def _extract_category_from_url(self, url: str) -> str:
762
+ """Extract category based on URL domain/path"""
763
+ domain = urlparse(url).netloc.lower()
764
+
765
+ if any(news in domain for news in ['cnn', 'bbc', 'reuters', 'news']):
766
+ return 'news'
767
+ elif any(tech in domain for tech in ['techcrunch', 'wired', 'tech']):
768
+ return 'technology'
769
+ elif any(biz in domain for biz in ['bloomberg', 'forbes', 'business']):
770
+ return 'business'
771
+ elif any(sport in domain for sport in ['espn', 'sport']):
772
+ return 'sports'
773
+ else:
774
+ return 'general'
 
775
 
776
+ def _simple_ner(self, text: str) -> List[Dict]:
777
+ """Simple pattern-based NER"""
778
+ entities = []
779
+
780
+ # Capitalized words (potential names/places)
781
+ cap_words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
782
+
783
+ for word in cap_words:
784
+ if len(word) > 2:
785
+ entities.append({
786
+ 'text': word,
787
+ 'label': 'MISC',
788
+ 'confidence': 0.5
789
+ })
790
+
791
+ return entities[:5] # Limit results
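# For comparison, an illustrative model-based alternative to the regex fallback above
# (assumes the optional transformers dependency; not the app's own NER setup):
from transformers import pipeline
ner = pipeline("ner", aggregation_strategy="simple")
for ent in ner("Barack Obama visited Paris in 2015."):
    print(ent["word"], ent["entity_group"], round(float(ent["score"]), 2))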
792
 
793
+ def _extractive_summary(self, text: str) -> str:
794
+ """Simple extractive summarization"""
795
+ sentences = text.split('. ')
 
796
 
797
+ if len(sentences) <= 2:
798
+ return text
799
 
800
+ # Take first and last sentences
801
+ summary = f"{sentences[0]}. {sentences[-1]}"
802
 
803
+ return summary
 
804
 
805
  def create_modern_interface():
806
  """Create the modern Gradio interface"""
807
+ logger.info("🎨 Creating modern interface...")
808
 
809
+ # Initialize the studio
810
  studio = DatasetStudio()
811
 
812
+ # Custom CSS for modern look
813
+ custom_css = """
814
+ .gradio-container {
815
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
816
  }
817
+
818
+ .main-header {
819
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
820
+ color: white;
821
+ padding: 2rem;
822
+ border-radius: 10px;
823
+ margin-bottom: 2rem;
824
+ text-align: center;
825
  }
826
+
827
  .step-header {
828
+ background: linear-gradient(90deg, #4facfe 0%, #00f2fe 100%);
829
+ color: white;
830
+ padding: 1rem;
831
+ border-radius: 8px;
832
+ margin: 1rem 0;
833
+ font-weight: bold;
834
  }
 
835
 
836
+ .template-card {
837
+ border: 2px solid #e1e5e9;
838
+ border-radius: 10px;
839
+ padding: 1rem;
840
+ margin: 0.5rem;
841
+ transition: all 0.3s ease;
842
+ }
843
 
844
+ .template-card:hover {
845
+ border-color: #4facfe;
846
+ box-shadow: 0 4px 12px rgba(79, 172, 254, 0.3);
847
+ }
848
+
849
+ .status-success {
850
+ background-color: #d4edda;
851
+ border-color: #c3e6cb;
852
+ color: #155724;
853
+ padding: 1rem;
854
+ border-radius: 5px;
855
+ border-left: 4px solid #28a745;
856
+ }
857
+
858
+ .status-error {
859
+ background-color: #f8d7da;
860
+ border-color: #f5c6cb;
861
+ color: #721c24;
862
+ padding: 1rem;
863
+ border-radius: 5px;
864
+ border-left: 4px solid #dc3545;
865
+ }
866
+ """
867
+
868
+ with gr.Blocks(css=custom_css, title="πŸš€ AI Dataset Studio", theme=gr.themes.Soft()) as interface:
869
+ # Main header
870
  gr.HTML("""
871
+ <div class="main-header">
872
  <h1>πŸš€ AI Dataset Studio</h1>
873
+ <p>Create high-quality training datasets with AI-powered source discovery</p>
874
+ <p><strong>🧠 Powered by Perplexity AI β€’ πŸ€– Advanced NLP β€’ πŸ“Š Professional Export</strong></p>
875
  </div>
876
  """)
877
 
878
+ with gr.Tabs() as tabs:
879
+ # Tab 1: Project Setup
880
+ with gr.TabItem("1️⃣ Project Setup", id=0):
881
+ gr.HTML('<div class="step-header">πŸ“‹ Step 1: Create Your Dataset Project</div>')
 
882
 
883
  with gr.Row():
884
  with gr.Column(scale=2):
885
  project_name = gr.Textbox(
886
+ label="🏷️ Project Name",
887
+ placeholder="e.g., Customer Review Sentiment Analysis",
888
+ info="Give your dataset project a descriptive name"
889
  )
890
 
891
+ project_description = gr.Textbox(
892
+ label="πŸ“ Project Description",
893
+ lines=3,
894
+ placeholder="Describe what kind of dataset you want to create...",
895
+ info="This will be used by AI to discover relevant sources"
896
  )
897
 
898
  with gr.Column(scale=1):
899
+ # Template selection
900
+ template_choices = list(DATASET_TEMPLATES.keys())
901
+ template_labels = [DATASET_TEMPLATES[t]["name"] for t in template_choices]
902
 
903
+ template_selector = gr.Dropdown(
904
+ choices=list(zip(template_labels, template_choices)),
905
+ label="πŸ“Š Dataset Template",
906
+ value=(template_labels[0], template_choices[0]),
907
+ info="Choose the type of ML task"
908
  )
909
 
910
+ # Template info
911
+ template_info = gr.Markdown("Select a template to see details")
912
 
913
+ create_project_btn = gr.Button("🎯 Create Project", variant="primary", size="lg")
914
+ project_status = gr.Textbox(label="πŸ“Š Project Status", interactive=False)
915
 
916
+ # Update template info when selection changes
917
+ def update_template_info(template_choice):
918
+ # The Dropdown may pass back either the value string or a (label, value) tuple,
+ # depending on the Gradio version, so handle both forms here.
+ template_key = template_choice[1] if isinstance(template_choice, (list, tuple)) else template_choice
919
+ template = DATASET_TEMPLATES.get(template_key, {})
920
+ if template:
921
+ info = f"**{template.get('name', '')}**\n\n"
922
+ info += f"πŸ“– {template.get('description', '')}\n\n"
923
+ info += f"🏷️ **Fields:** {', '.join(template.get('fields', []))}\n\n"
924
+ info += f"πŸ’‘ **Example:** `{template.get('example', {})}`"
925
+ return info
926
+ return "Select a template to see details"
927
 
928
+ template_selector.change(
929
+ fn=update_template_info,
930
+ inputs=[template_selector],
931
+ outputs=[template_info]
932
+ )
933
 
934
+ # Tab 2: AI Source Discovery
935
+ with gr.TabItem("2️⃣ AI Source Discovery", id=1):
936
+ gr.HTML('<div class="step-header">🧠 Step 2: Discover Sources with Perplexity AI</div>')
937
 
938
+ if HAS_PERPLEXITY:
939
+ gr.Markdown("""
940
+ ✨ **AI-Powered Source Discovery** - Let Perplexity AI find the best sources for your dataset!
941
+
942
+ Just describe your project and AI will discover relevant, high-quality sources automatically.
943
+ """)
944
+
945
+ with gr.Row():
946
+ with gr.Column():
947
+ ai_search_description = gr.Textbox(
948
+ label="🎯 Project Description for AI Search",
949
+ lines=3,
950
+ placeholder="e.g., I need product reviews for sentiment analysis training data...",
951
+ info="Describe what sources you need - be specific!"
952
+ )
953
+
954
+ with gr.Row():
955
+ search_type = gr.Dropdown(
956
+ choices=["general", "academic", "news", "technical"],
957
+ value="general",
958
+ label="πŸ” Search Type"
959
+ )
960
+
961
+ max_sources = gr.Slider(
962
+ minimum=5,
963
+ maximum=50,
964
+ value=20,
965
+ step=5,
966
+ label="πŸ“Š Max Sources"
967
+ )
968
+
969
+ with gr.Row():
970
+ include_academic = gr.Checkbox(label="πŸ“š Include Academic Sources", value=True)
971
+ include_news = gr.Checkbox(label="πŸ“° Include News Sources", value=True)
972
+
973
+ discover_btn = gr.Button("🧠 Discover Sources with AI", variant="primary", size="lg")
974
+
975
+ ai_search_status = gr.Textbox(label="πŸ” Discovery Status", interactive=False)
976
+ discovered_sources = gr.Code(label="πŸ“‹ Discovered Sources", language="json", interactive=False)
977
+
978
+ # Use discovered sources button
979
+ use_ai_sources_btn = gr.Button("βœ… Use These Sources", variant="secondary")
980
+
981
+ else:
982
+ gr.Markdown("""
983
+ ⚠️ **Perplexity AI Not Available**
984
+
985
+ To enable AI-powered source discovery, set your `PERPLEXITY_API_KEY` environment variable.
986
+ For now, you can manually enter URLs below.
987
+ """)
988
+
989
+ discovered_sources = gr.Code(value="[]", visible=False)
990
 
991
+ gr.HTML('<div class="step-header">πŸ“ Manual URL Entry</div>')
 
992
 
993
+ urls_input = gr.Textbox(
994
+ label="πŸ”— URLs to Scrape",
995
+ lines=10,
996
+ placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
997
+ info="Enter one URL per line"
998
+ )
999
 
1000
+ scrape_btn = gr.Button("πŸ•·οΈ Start Scraping", variant="primary", size="lg")
1001
+ scrape_status = gr.Textbox(label="πŸ“Š Scraping Status", interactive=False)
1002
+ scraped_preview = gr.Code(label="πŸ‘€ Scraped Data Preview", language="json", interactive=False)
1003
 
1004
+ # Tab 3: Data Processing
1005
+ with gr.TabItem("3️⃣ Data Processing", id=2):
1006
+ gr.HTML('<div class="step-header">βš™οΈ Step 3: Process Data with AI</div>')
1007
 
1008
+ processing_template = gr.Dropdown(
1009
+ choices=list(zip(template_labels, template_choices)),
1010
+ label="πŸ“Š Processing Template",
1011
+ value=(template_labels[0], template_choices[0]),
1012
+ info="How should the data be processed?"
1013
+ )
1014
 
1015
+ process_btn = gr.Button("βš™οΈ Process Data", variant="primary", size="lg")
1016
+ process_status = gr.Textbox(label="πŸ“Š Processing Status", interactive=False)
1017
+ processed_preview = gr.Code(label="🎯 Processed Data Preview", language="json", interactive=False)
1018
+
1019
+ # Tab 4: Export Dataset
1020
+ with gr.TabItem("4️⃣ Export Dataset", id=3):
1021
+ gr.HTML('<div class="step-header">πŸ“¦ Step 4: Export Your Dataset</div>')
1022
 
1023
+ export_format = gr.Dropdown(
1024
+ choices=["JSON", "CSV", "HuggingFace Dataset", "JSONL"],
1025
+ value="JSON",
1026
+ label="πŸ“„ Export Format",
1027
+ info="Choose format for your dataset"
1028
+ )
1029
 
1030
+ export_btn = gr.Button("πŸ“¦ Export Dataset", variant="primary", size="lg")
1031
+ export_status = gr.Textbox(label="πŸ“Š Export Status", interactive=False)
1032
+ download_file = gr.File(label="πŸ’Ύ Download Dataset", interactive=False)
1033
 
1034
+ # Event handlers
1035
  create_project_btn.click(
1036
+ fn=lambda name, desc, template: studio.create_project(name, template[1] if template else "", desc),
1037
+ inputs=[project_name, project_description, template_selector],
1038
+ outputs=[project_status]
1039
  )
1040
 
1041
+ if HAS_PERPLEXITY:
1042
+ discover_btn.click(
1043
+ fn=studio.discover_sources_with_ai,
1044
+ inputs=[ai_search_description, max_sources, search_type, include_academic, include_news],
1045
+ outputs=[ai_search_status, discovered_sources]
1046
+ )
1047
+
1048
+ use_ai_sources_btn.click(
1049
+ fn=lambda sources_json: '\n'.join(studio.extract_urls_from_sources(sources_json)),
1050
+ inputs=[discovered_sources],
1051
+ outputs=[urls_input]
1052
+ )
1053
+
1054
  scrape_btn.click(
1055
+ fn=studio.scrape_urls,
1056
+ inputs=[urls_input],
1057
+ outputs=[scrape_status, scraped_preview]
1058
  )
1059
 
1060
  process_btn.click(
1061
+ fn=lambda template: studio.process_data(template[1] if isinstance(template, (list, tuple)) else (template or "")),
1062
+ inputs=[processing_template],
1063
+ outputs=[process_status, processed_preview]
1064
  )
1065
 
1066
  export_btn.click(
1067
+ fn=studio.export_dataset,
1068
+ inputs=[export_format],
1069
+ outputs=[export_status, download_file]
1070
  )
1071
 
1072
+ logger.info("βœ… Interface created successfully")
1073
  return interface
1074
 
1075
+ # Application startup
1076
+ try:
1077
  logger.info("πŸš€ Starting AI Dataset Studio...")
1078
+ logger.info("πŸ“Š Features: βœ… AI Models | βœ… Advanced NLP | βœ… HuggingFace Integration")
1079
 
1080
+ interface = create_modern_interface()
1081
 
1082
+ logger.info("βœ… Application startup successful")
1083
 
1084
+ if __name__ == "__main__":
1085
  interface.launch(
1086
  server_name="0.0.0.0",
1087
  server_port=7860,
1088
  share=False,
1089
  show_error=True
1090
  )
1091
+
1092
+ except Exception as e:
1093
+ import sys, traceback  # imported here so the handler works even if they are not module-level imports
+ logger.error(f"❌ Failed to launch application: {e}")
1094
+ logger.error(f"Traceback: {traceback.format_exc()}")
1095
+ sys.exit(1)
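# Illustrative smoke test of the same pipeline without the Gradio UI, assuming the
# method signatures implied by the event bindings above; the URLs and the template
# key are placeholders.
studio = DatasetStudio()
print(studio.create_project("Demo project", "text_classification", "News sentiment demo"))
scrape_status, scrape_preview = studio.scrape_urls("https://example.com/article1\nhttps://example.com/article2")
print(scrape_status)
process_status, process_preview = studio.process_data("text_classification")
print(process_status)
export_status, export_path = studio.export_dataset("JSONL")
print(export_status, export_path)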