"""
AI Dataset Studio - Modern Web Scraping & Dataset Creation Platform

A mini Scale AI for non-coders and vibe coders.

Features:
- Intelligent web scraping with content extraction
- Automated data cleaning and preprocessing
- Interactive annotation tools
- Template-based workflows for common ML tasks
- High-quality dataset generation
- Export to HuggingFace Hub and popular ML formats
- Visual data quality metrics
- No-code dataset creation workflows
"""

import gradio as gr
import pandas as pd
import numpy as np
import json
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from datetime import datetime, timedelta
import logging
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from pathlib import Path
import uuid
import hashlib
import time
from collections import defaultdict
import io
import zipfile

# Optional: transformer models for quality scoring and NLP enrichment
try:
    from transformers import pipeline, AutoTokenizer, AutoModel
    from sentence_transformers import SentenceTransformer
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

# Optional: NLTK for tokenization and stopword lists
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    HAS_NLTK = True
except ImportError:
    HAS_NLTK = False

# Optional: HuggingFace datasets for native Dataset export
try:
    from datasets import Dataset, DatasetDict
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download required NLTK data quietly at startup
if HAS_NLTK:
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
    except Exception:
        pass


@dataclass
class ScrapedItem:
    """Data class for scraped content"""
    id: str
    url: str
    title: str
    content: str
    metadata: Dict[str, Any]
    scraped_at: str
    word_count: int
    language: str = "en"
    quality_score: float = 0.0
    labels: Optional[List[str]] = None
    annotations: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        # dataclasses cannot use mutable defaults, so fill them in here
        if self.labels is None:
            self.labels = []
        if self.annotations is None:
            self.annotations = {}


@dataclass
class DatasetTemplate:
    """Template for dataset creation"""
    name: str
    description: str
    task_type: str
    required_fields: List[str]
    optional_fields: List[str]
    example_format: Dict[str, Any]
    instructions: str


class WebScraperEngine:
    """Advanced web scraping engine with smart content extraction"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0; Research)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })

        self.content_classifier = None
        self.quality_scorer = None
        self._load_models()

    def _load_models(self):
        """Load AI models for content analysis"""
        if not HAS_TRANSFORMERS:
            logger.warning("⚠️ Transformers not available, using rule-based methods")
            return

        try:
            self.quality_scorer = pipeline(
                "text-classification",
                model="martin-ha/toxic-comment-model",
                return_all_scores=True
            )
            logger.info("✅ Quality assessment model loaded")
        except Exception as e:
            logger.warning(f"⚠️ Could not load quality model: {e}")

    def scrape_url(self, url: str) -> Optional[ScrapedItem]:
        """Scrape a single URL and return structured data"""
        try:
            if not self._is_valid_url(url):
                raise ValueError("Invalid URL provided")

            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            title = self._extract_title(soup)
            content = self._extract_content(soup)
            metadata = self._extract_metadata(soup, response)

            item = ScrapedItem(
                id=str(uuid.uuid4()),
                url=url,
                title=title,
                content=content,
                metadata=metadata,
                scraped_at=datetime.now().isoformat(),
                word_count=len(content.split()),
                quality_score=self._assess_quality(content)
            )

            return item

        except Exception as e:
            logger.error(f"Failed to scrape {url}: {e}")
            return None

    def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
        """Scrape multiple URLs with progress tracking"""
        results = []
        total = len(urls)

        for i, url in enumerate(urls):
            if progress_callback:
                progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")

            item = self.scrape_url(url)
            if item:
                results.append(item)

            # Simple rate limiting between requests
            time.sleep(1)

        return results

    def _is_valid_url(self, url: str) -> bool:
        """Validate URL format and safety"""
        try:
            parsed = urlparse(url)
            return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
        except Exception:
            return False

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title"""
        # Try social metadata first, then fall back to visible headings
        selectors = [
            'meta[property="og:title"]',
            'meta[name="twitter:title"]',
            'title',
            'h1'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                if element.name == 'meta':
                    return element.get('content', '').strip()
                else:
                    return element.get_text().strip()

        return "Untitled"

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract main content using multiple strategies"""
        # Strip non-content elements first
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        # Try common article containers, most specific first
        content_selectors = [
            'article',
            'main',
            '.content',
            '.post-content',
            '.entry-content',
            '.article-body',
            '[role="main"]'
        ]

        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(separator=' ', strip=True)
                if len(text) > 200:
                    return self._clean_text(text)

        # Fall back to the whole body, then the whole document
        body = soup.find('body')
        if body:
            return self._clean_text(body.get_text(separator=' ', strip=True))

        return self._clean_text(soup.get_text(separator=' ', strip=True))

    def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
        """Extract metadata from page"""
        metadata = {
            'domain': urlparse(response.url).netloc,
            'status_code': response.status_code,
            'content_type': response.headers.get('content-type', ''),
            'extracted_at': datetime.now().isoformat()
        }

        # Pull common meta tags (name="..." or property="article:...")
        meta_tags = ['description', 'keywords', 'author', 'published_time']
        for tag in meta_tags:
            element = soup.find('meta', attrs={'name': tag}) or soup.find('meta', attrs={'property': f'article:{tag}'})
            if element:
                metadata[tag] = element.get('content', '')

        return metadata

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Collapse whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common boilerplate phrases
        patterns = [
            r'Subscribe.*?newsletter',
            r'Click here.*?more',
            r'Advertisement',
            r'Share this.*?social',
            r'Follow us on.*?media'
        ]

        for pattern in patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)

        return text.strip()

    def _assess_quality(self, content: str) -> float:
        """Assess content quality (0-1 score)"""
        if not content:
            return 0.0

        score = 0.0

        # Length check
        word_count = len(content.split())
        if word_count >= 50:
            score += 0.3
        elif word_count >= 20:
            score += 0.1

        # Sentence structure
        sentence_count = len(re.split(r'[.!?]+', content))
        if sentence_count >= 3:
            score += 0.2

        # Proper capitalization suggests edited prose
        if re.search(r'[A-Z][a-z]+', content):
            score += 0.2

        # No punctuation noise in the opening suggests clean body text
        if not re.search(r'[^\w\s]', content[:100]):
            score += 0.1

        # Reasonable average word length
        avg_word_length = np.mean([len(word) for word in content.split()])
        if 3 <= avg_word_length <= 8:
            score += 0.2

        return min(score, 1.0)


class DataProcessor:
    """Advanced data processing and cleaning pipeline"""

    def __init__(self):
        self.language_detector = None
        self.sentiment_analyzer = None
        self.ner_model = None
        self._load_models()

    def _load_models(self):
        """Load NLP models for processing"""
        if not HAS_TRANSFORMERS:
            return

        try:
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest"
            )

            self.ner_model = pipeline(
                "ner",
                model="dbmdz/bert-large-cased-finetuned-conll03-english",
                aggregation_strategy="simple"
            )

            logger.info("✅ NLP models loaded successfully")
        except Exception as e:
            logger.warning(f"⚠️ Could not load NLP models: {e}")

    def process_items(self, items: List[ScrapedItem], processing_options: Dict[str, bool]) -> List[ScrapedItem]:
        """Process scraped items with various enhancement options"""
        processed_items = []

        for item in items:
            processed_item = self._process_single_item(item, processing_options)
            if processed_item:
                processed_items.append(processed_item)

        return processed_items

    def _process_single_item(self, item: ScrapedItem, options: Dict[str, bool]) -> Optional[ScrapedItem]:
        """Process a single item"""
        try:
            if options.get('clean_text', True):
                item.content = self._clean_text_advanced(item.content)

            # Drop low-quality items when filtering is enabled
            if options.get('quality_filter', True) and item.quality_score < 0.3:
                return None

            if options.get('add_sentiment', False) and self.sentiment_analyzer:
                sentiment = self._analyze_sentiment(item.content)
                item.metadata['sentiment'] = sentiment

            if options.get('extract_entities', False) and self.ner_model:
                entities = self._extract_entities(item.content)
                item.metadata['entities'] = entities

            if options.get('detect_language', True):
                item.language = self._detect_language(item.content)

            return item

        except Exception as e:
            logger.error(f"Error processing item {item.id}: {e}")
            return None

    def _clean_text_advanced(self, text: str) -> str:
        """Advanced text cleaning"""
        # Remove URLs and email addresses
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)

        # Normalize repeated punctuation
        text = re.sub(r'[!?]{2,}', '!', text)
        text = re.sub(r'\.{3,}', '...', text)

        # Collapse runs of spaces and tabs, but keep newlines so the
        # paragraph filter below still has boundaries to work with
        text = re.sub(r'[ \t]+', ' ', text)

        # Drop very short paragraphs (menus, bylines, link stubs)
        paragraphs = text.split('\n')
        paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 20]

        return '\n'.join(paragraphs).strip()

    def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
        """Analyze sentiment of text"""
        try:
            # Truncate to the model's context window
            text_sample = text[:512]
            result = self.sentiment_analyzer(text_sample)[0]
            return {
                'label': result['label'],
                'score': result['score']
            }
        except Exception:
            return {'label': 'UNKNOWN', 'score': 0.0}

    def _extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract named entities"""
        try:
            text_sample = text[:512]
            entities = self.ner_model(text_sample)
            return [
                {
                    'text': ent['word'],
                    'label': ent['entity_group'],
                    'confidence': ent['score']
                }
                for ent in entities
            ]
        except Exception:
            return []

    def _detect_language(self, text: str) -> str:
        """Simple heuristic language detection based on characteristic characters"""
        if re.search(r'[а-яё]', text.lower()):
            return 'ru'
        elif re.search(r'[ñáéíóúü]', text.lower()):
            return 'es'
        elif re.search(r'[àâäçéèêëïîôöùûüÿ]', text.lower()):
            return 'fr'
        else:
            return 'en'


class AnnotationEngine:
    """Interactive annotation tools for dataset creation"""

    def __init__(self):
        self.templates = self._load_templates()

    def _load_templates(self) -> Dict[str, DatasetTemplate]:
        """Load predefined dataset templates"""
        templates = {
            'text_classification': DatasetTemplate(
                name="Text Classification",
                description="Classify text into predefined categories",
                task_type="classification",
                required_fields=["text", "label"],
                optional_fields=["confidence", "metadata"],
                example_format={"text": "Sample text", "label": "positive"},
                instructions="Label each text with the appropriate category"
            ),
            'sentiment_analysis': DatasetTemplate(
                name="Sentiment Analysis",
                description="Analyze emotional tone of text",
                task_type="classification",
                required_fields=["text", "sentiment"],
                optional_fields=["confidence", "aspects"],
                example_format={"text": "I love this!", "sentiment": "positive"},
                instructions="Classify the sentiment as positive, negative, or neutral"
            ),
            'named_entity_recognition': DatasetTemplate(
                name="Named Entity Recognition",
                description="Identify and classify named entities in text",
                task_type="ner",
                required_fields=["text", "entities"],
                optional_fields=["metadata"],
                example_format={
                    "text": "John works at OpenAI in San Francisco",
                    "entities": [
                        {"text": "John", "label": "PERSON", "start": 0, "end": 4},
                        {"text": "OpenAI", "label": "ORG", "start": 14, "end": 20}
                    ]
                },
                instructions="Mark all named entities (people, organizations, locations, etc.)"
            ),
            'question_answering': DatasetTemplate(
                name="Question Answering",
                description="Create question-answer pairs from text",
                task_type="qa",
                required_fields=["context", "question", "answer"],
                optional_fields=["answer_start", "metadata"],
                example_format={
                    "context": "The capital of France is Paris.",
                    "question": "What is the capital of France?",
                    "answer": "Paris"
                },
                instructions="Create meaningful questions and provide accurate answers"
            ),
            'summarization': DatasetTemplate(
                name="Text Summarization",
                description="Create concise summaries of longer texts",
                task_type="summarization",
                required_fields=["text", "summary"],
                optional_fields=["summary_type", "length"],
                example_format={
                    "text": "Long article text...",
                    "summary": "Brief summary of the main points"
                },
                instructions="Write clear, concise summaries capturing key information"
            )
        }
        return templates

    def create_annotation_interface(self, template_name: str, items: List[ScrapedItem]) -> Dict[str, Any]:
        """Create annotation interface for specific template"""
        template = self.templates.get(template_name)
        if not template:
            raise ValueError(f"Unknown template: {template_name}")

        annotation_data = []
        for item in items:
            annotation_data.append({
                'id': item.id,
                'text': item.content[:1000],
                'title': item.title,
                'url': item.url,
                'annotations': {}
            })

        return {
            'template': template,
            'data': annotation_data,
            'progress': 0,
            'completed': 0
        }


class DatasetExporter:
    """Export datasets in various formats for ML frameworks"""

    def __init__(self):
        # 'pytorch' and 'tensorflow' are planned targets; export_dataset
        # currently implements the first five formats
        self.supported_formats = [
            'huggingface_datasets',
            'json',
            'csv',
            'parquet',
            'jsonl',
            'pytorch',
            'tensorflow'
        ]

    def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
                       export_format: str, annotations: Dict[str, Any] = None) -> str:
        """Export annotated dataset in specified format"""
        try:
            dataset_data = self._prepare_dataset_data(items, template, annotations)

            if export_format == 'huggingface_datasets':
                return self._export_huggingface(dataset_data, template)
            elif export_format == 'json':
                return self._export_json(dataset_data)
            elif export_format == 'csv':
                return self._export_csv(dataset_data)
            elif export_format == 'jsonl':
                return self._export_jsonl(dataset_data)
            elif export_format == 'parquet':
                return self._export_parquet(dataset_data)
            else:
                raise ValueError(f"Unsupported format: {export_format}")

        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise

    def _prepare_dataset_data(self, items: List[ScrapedItem], template: DatasetTemplate,
                              annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
        """Prepare data according to template format"""
        dataset_data = []

        for item in items:
            data_point = {
                'text': item.content,
                'title': item.title,
                'url': item.url,
                'metadata': item.metadata
            }

            # Merge in any annotations recorded for this item
            if annotations and item.id in annotations:
                item_annotations = annotations[item.id]
                data_point.update(item_annotations)

            formatted_point = self._format_for_template(data_point, template)
            if formatted_point:
                dataset_data.append(formatted_point)

        return dataset_data

    def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Optional[Dict[str, Any]]:
        """Format data point according to template requirements"""
        formatted = {}

        for field in template.required_fields:
            if field in data_point:
                formatted[field] = data_point[field]
            elif field == 'text' and 'content' in data_point:
                formatted[field] = data_point['content']
            else:
                # Skip data points missing a required field
                return None

        for field in template.optional_fields:
            if field in data_point:
                formatted[field] = data_point[field]

        return formatted

    def _export_huggingface(self, dataset_data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
        """Export as HuggingFace Dataset"""
        if not HAS_DATASETS:
            raise ImportError("datasets library not available")

        try:
            dataset = Dataset.from_list(dataset_data)

            card_content = f"""
# {template.name} Dataset

## Description
{template.description}

## Task Type
{template.task_type}

## Format
{template.example_format}

## Instructions
{template.instructions}

## Statistics
- Total samples: {len(dataset_data)}
- Created: {datetime.now().isoformat()}

## Usage
```python
from datasets import load_dataset
dataset = load_dataset('path/to/dataset')
```
"""

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"

            dataset.save_to_disk(dataset_name)

            with open(f"{dataset_name}/README.md", "w") as f:
                f.write(card_content)

            return dataset_name

        except Exception as e:
            logger.error(f"HuggingFace export failed: {e}")
            raise

    def _export_json(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Export as JSON file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(dataset_data, f, indent=2, ensure_ascii=False)

        return filename

    def _export_csv(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Export as CSV file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.csv"

        df = pd.DataFrame(dataset_data)
        df.to_csv(filename, index=False)

        return filename

    def _export_jsonl(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Export as JSONL file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.jsonl"

        with open(filename, 'w', encoding='utf-8') as f:
            for item in dataset_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        return filename
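
    # The UI offers Parquet export, but no handler existed for it. This is a
    # minimal sketch assuming a pandas Parquet engine (pyarrow or fastparquet)
    # is installed.
    def _export_parquet(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Export as Parquet file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.parquet"

        df = pd.DataFrame(dataset_data)
        df.to_parquet(filename, index=False)

        return filename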
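
# NOTE: create_modern_interface() below instantiates a DatasetStudio class
# that is not defined elsewhere in this file. The following is a minimal,
# assumed orchestrator wired to the engines above; its method names and
# signatures are inferred from how the UI handlers call it.
class DatasetStudio:
    """Orchestrates scraping, processing, preview, and export for the UI"""

    def __init__(self):
        self.scraper = WebScraperEngine()
        self.processor = DataProcessor()
        self.annotator = AnnotationEngine()
        self.exporter = DatasetExporter()
        self.project: Optional[Dict[str, Any]] = None
        self.scraped_items: List[ScrapedItem] = []
        self.processed_items: List[ScrapedItem] = []

    def start_new_project(self, name: str, template: str) -> Dict[str, Any]:
        """Create a new project and reset any previous state"""
        self.project = {
            'id': str(uuid.uuid4()),
            'name': name,
            'template': template,
            'created_at': datetime.now().isoformat()
        }
        self.scraped_items = []
        self.processed_items = []
        return self.project

    def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
        """Scrape URLs; returns (success_count, error_messages)"""
        self.scraped_items = self.scraper.batch_scrape(urls, progress_callback)
        errors = [] if self.scraped_items else ["No URLs could be scraped"]
        return len(self.scraped_items), errors

    def process_data(self, options: Dict[str, bool]) -> int:
        """Run the processing pipeline, optionally dropping duplicate content"""
        items = self.processor.process_items(self.scraped_items, options)

        if options.get('deduplicate', True):
            # Hash-based duplicate detection on the cleaned content
            seen = set()
            unique_items = []
            for item in items:
                digest = hashlib.md5(item.content.encode('utf-8')).hexdigest()
                if digest not in seen:
                    seen.add(digest)
                    unique_items.append(item)
            items = unique_items

        self.processed_items = items
        return len(items)

    def get_data_preview(self, limit: int = 50) -> List[Dict[str, Any]]:
        """Rows for the preview table (limit is an assumed default)"""
        items = self.processed_items or self.scraped_items
        return [
            {
                'title': item.title,
                'content_preview': item.content[:200],
                'word_count': item.word_count,
                'quality_score': round(item.quality_score, 2),
                'url': item.url
            }
            for item in items[:limit]
        ]

    def get_data_statistics(self) -> Dict[str, Any]:
        """Summary statistics over the current items"""
        items = self.processed_items or self.scraped_items
        if not items:
            return {}

        language_counts = defaultdict(int)
        for item in items:
            language_counts[item.language] += 1

        return {
            'total_items': len(items),
            'avg_quality_score': round(sum(i.quality_score for i in items) / len(items), 2),
            'avg_word_count': int(sum(i.word_count for i in items) / len(items)),
            'languages': dict(language_counts)
        }

    def export_dataset(self, template_name: str, export_format: str) -> str:
        """Export the current items with the chosen template; returns a file path"""
        items = self.processed_items or self.scraped_items
        template = self.annotator.templates[template_name]
        return self.exporter.export_dataset(items, template, export_format)

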
def create_modern_interface():
    """Create modern, intuitive interface for AI Dataset Studio"""

    studio = DatasetStudio()

    custom_css = """
    .gradio-container {
        max-width: 1400px;
        margin: auto;
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }

    .studio-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 2rem;
        border-radius: 15px;
        margin-bottom: 2rem;
        text-align: center;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    }

    .workflow-card {
        background: #f8f9ff;
        border: 2px solid #e1e5ff;
        border-radius: 12px;
        padding: 1.5rem;
        margin: 1rem 0;
        transition: all 0.3s ease;
    }

    .workflow-card:hover {
        border-color: #667eea;
        box-shadow: 0 4px 20px rgba(102, 126, 234, 0.1);
    }

    .step-header {
        display: flex;
        align-items: center;
        margin-bottom: 1rem;
        font-size: 1.2em;
        font-weight: 600;
        color: #4c51bf;
    }

    .step-number {
        background: #667eea;
        color: white;
        border-radius: 50%;
        width: 30px;
        height: 30px;
        display: flex;
        align-items: center;
        justify-content: center;
        margin-right: 1rem;
        font-weight: bold;
    }

    .feature-grid {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
        gap: 1rem;
        margin: 1rem 0;
    }

    .feature-item {
        background: white;
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 1rem;
        text-align: center;
    }

    .stat-card {
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        color: white;
        padding: 1rem;
        border-radius: 10px;
        text-align: center;
        margin: 0.5rem;
    }

    .progress-bar {
        background: #e2e8f0;
        border-radius: 10px;
        height: 8px;
        overflow: hidden;
    }

    .progress-fill {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        height: 100%;
        transition: width 0.3s ease;
    }

    .template-card {
        border: 2px solid #e2e8f0;
        border-radius: 10px;
        padding: 1rem;
        margin: 0.5rem;
        cursor: pointer;
        transition: all 0.3s ease;
    }

    .template-card:hover {
        border-color: #667eea;
        transform: translateY(-2px);
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
    }

    .template-selected {
        border-color: #667eea;
        background: #f7fafc;
    }

    .export-option {
        background: #f7fafc;
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 1rem;
        margin: 0.5rem 0;
        cursor: pointer;
    }

    .export-option:hover {
        background: #edf2f7;
        border-color: #cbd5e0;
    }

    .success-message {
        background: #f0fff4;
        border: 1px solid #9ae6b4;
        color: #276749;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }

    .error-message {
        background: #fed7d7;
        border: 1px solid #feb2b2;
        color: #c53030;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
    """

    project_state = gr.State({})

    with gr.Blocks(css=custom_css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:

        gr.HTML("""
        <div class="studio-header">
            <h1>🚀 AI Dataset Studio</h1>
            <p>Create high-quality training datasets without coding - Your personal Scale AI</p>
            <p style="opacity: 0.9; font-size: 0.9em;">Web Scraping → Data Processing → Annotation → ML-Ready Datasets</p>
        </div>
        """)

        with gr.Tabs() as main_tabs:

            with gr.Tab("🎯 Project Setup", id="setup"):
                gr.HTML('<div class="step-header"><div class="step-number">1</div>Start Your Dataset Project</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📋 Project Configuration</h3>
                            <p>Define your dataset project and choose the type of AI task you're building for.</p>
                        </div>
                        """)

                        project_name = gr.Textbox(
                            label="Project Name",
                            placeholder="e.g., 'News Sentiment Analysis' or 'Product Review Classification'",
                            value="My Dataset Project"
                        )

                        gr.HTML("<h4>🎨 Choose Your Dataset Template</h4>")

                        template_choice = gr.Radio(
                            choices=[
                                ("📊 Text Classification", "text_classification"),
                                ("😊 Sentiment Analysis", "sentiment_analysis"),
                                ("👥 Named Entity Recognition", "named_entity_recognition"),
                                ("❓ Question Answering", "question_answering"),
                                ("📝 Text Summarization", "summarization")
                            ],
                            label="Dataset Type",
                            value="text_classification",
                            interactive=True
                        )

                        create_project_btn = gr.Button(
                            "🚀 Create Project",
                            variant="primary",
                            size="lg"
                        )

                        project_status = gr.Markdown("")

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>💡 Template Guide</h3>
                            <div class="feature-grid">
                                <div class="feature-item">
                                    <h4>📊 Text Classification</h4>
                                    <p>Categorize text into predefined labels</p>
                                    <small>Great for: Spam detection, topic classification</small>
                                </div>
                                <div class="feature-item">
                                    <h4>😊 Sentiment Analysis</h4>
                                    <p>Analyze emotional tone and opinions</p>
                                    <small>Great for: Review analysis, social media monitoring</small>
                                </div>
                                <div class="feature-item">
                                    <h4>👥 Named Entity Recognition</h4>
                                    <p>Identify people, places, organizations</p>
                                    <small>Great for: Information extraction, content tagging</small>
                                </div>
                            </div>
                        </div>
                        """)

            with gr.Tab("🕷️ Data Collection", id="collection"):
                gr.HTML('<div class="step-header"><div class="step-number">2</div>Collect Your Data</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>🌐 Web Scraping</h3>
                            <p>Provide URLs to scrape content automatically. Our AI will extract clean, structured text.</p>
                        </div>
                        """)

                        with gr.Tabs():
                            with gr.Tab("📝 Manual Input"):
                                urls_input = gr.Textbox(
                                    label="URLs to Scrape",
                                    placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
                                    lines=8,
                                    info="Enter one URL per line"
                                )

                            with gr.Tab("📎 File Upload"):
                                urls_file = gr.File(
                                    label="Upload URL List (text file with one URL per line, or CSV with a 'url' column)",
                                    file_types=[".txt", ".csv"]
                                )

                        scrape_btn = gr.Button("🚀 Start Scraping", variant="primary", size="lg")

                        scraping_status = gr.Markdown("")

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>⚡ Features</h3>
                            <ul style="list-style: none; padding: 0;">
                                <li>✅ Smart content extraction</li>
                                <li>✅ Quality scoring</li>
                                <li>✅ Duplicate detection</li>
                                <li>✅ Security validation</li>
                                <li>✅ Metadata extraction</li>
                                <li>✅ Rate limiting</li>
                            </ul>
                        </div>
                        """)

                collection_stats = gr.HTML("")

            with gr.Tab("⚙️ Data Processing", id="processing"):
                gr.HTML('<div class="step-header"><div class="step-number">3</div>Clean & Enhance Your Data</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>🔧 Processing Options</h3>
                            <p>Configure how to clean and enhance your scraped data with AI-powered analysis.</p>
                        </div>
                        """)

                        with gr.Row():
                            with gr.Column():
                                clean_text = gr.Checkbox(label="🧹 Advanced Text Cleaning", value=True)
                                quality_filter = gr.Checkbox(label="🎯 Quality Filtering", value=True)
                                detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)

                            with gr.Column():
                                add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
                                extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
                                deduplicate = gr.Checkbox(label="🔄 Remove Duplicates", value=True)

                        process_btn = gr.Button("⚙️ Process Data", variant="primary", size="lg")
                        processing_status = gr.Markdown("")

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📊 Processing Stats</h3>
                            <div id="processing-stats"></div>
                        </div>
                        """)

                        processing_stats = gr.HTML("")

            with gr.Tab("👀 Data Preview", id="preview"):
                gr.HTML('<div class="step-header"><div class="step-number">4</div>Review Your Dataset</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📋 Dataset Preview</h3>
                            <p>Review your processed data before annotation or export.</p>
                        </div>
                        """)

                        refresh_preview_btn = gr.Button("🔄 Refresh Preview", variant="secondary")

                        data_preview = gr.DataFrame(
                            headers=["Title", "Content Preview", "Word Count", "Quality Score", "URL"],
                            label="Dataset Preview",
                            interactive=False
                        )

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📈 Dataset Statistics</h3>
                        </div>
                        """)

                        dataset_stats = gr.JSON(label="Statistics")

            with gr.Tab("📤 Export Dataset", id="export"):
                gr.HTML('<div class="step-header"><div class="step-number">5</div>Export Your Dataset</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>💾 Export Options</h3>
                            <p>Export your dataset in various formats for different ML frameworks and platforms.</p>
                        </div>
                        """)

                        export_format = gr.Radio(
                            choices=[
                                ("🤗 HuggingFace Datasets", "huggingface_datasets"),
                                ("📄 JSON", "json"),
                                ("📊 CSV", "csv"),
                                ("📋 JSONL", "jsonl"),
                                ("⚡ Parquet", "parquet")
                            ],
                            label="Export Format",
                            value="json"
                        )

                        export_template = gr.Dropdown(
                            choices=[
                                "text_classification",
                                "sentiment_analysis",
                                "named_entity_recognition",
                                "question_answering",
                                "summarization"
                            ],
                            label="Dataset Template",
                            value="text_classification"
                        )

                        export_btn = gr.Button("📤 Export Dataset", variant="primary", size="lg")

                        export_status = gr.Markdown("")
                        export_file = gr.File(label="Download Dataset", visible=False)

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📋 Export Formats</h3>
                            <div class="feature-item">
                                <h4>🤗 HuggingFace</h4>
                                <p>Ready for transformers library</p>
                            </div>
                            <div class="feature-item">
                                <h4>📄 JSON/JSONL</h4>
                                <p>Universal format for any framework</p>
                            </div>
                            <div class="feature-item">
                                <h4>📊 CSV</h4>
                                <p>Easy analysis in Excel/Pandas</p>
                            </div>
                        </div>
                        """)

        def create_project(name, template):
            """Create new project"""
            if not name.strip():
                return "❌ Please enter a project name", {}

            project = studio.start_new_project(name.strip(), template)
            status = f"""
            ✅ **Project Created Successfully!**

            **Project:** {project['name']}
            **Type:** {template.replace('_', ' ').title()}
            **ID:** {project['id'][:8]}...
            **Created:** {project['created_at'][:19]}

            👉 **Next Step:** Go to the Data Collection tab to start scraping URLs
            """
            return status, project

        def scrape_urls_handler(urls_text, urls_file, project, progress=gr.Progress()):
            """Handle URL scraping"""
            if not project:
                return "❌ Please create a project first", ""

            # Collect URLs from the textbox or the uploaded file
            urls = []
            if urls_text:
                urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
            elif urls_file is not None:
                try:
                    # Gradio uploads arrive as temp-file objects; read via the path
                    file_path = urls_file.name if hasattr(urls_file, 'name') else str(urls_file)
                    if file_path.lower().endswith('.csv'):
                        df = pd.read_csv(file_path)
                        if 'url' not in df.columns:
                            return "❌ CSV file must contain a 'url' column", ""
                        urls = [str(url).strip() for url in df['url'].dropna()]
                    else:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            urls = [line.strip() for line in f if line.strip()]
                except Exception:
                    return "❌ Error reading uploaded file", ""

            if not urls:
                return "❌ No URLs provided", ""

            def progress_callback(pct, msg):
                progress(pct, desc=msg)

            success_count, errors = studio.scrape_urls(urls, progress_callback)

            if success_count > 0:
                stats_html = f"""
                <div class="stat-card">
                    <h3>✅ Scraping Complete</h3>
                    <p><strong>{success_count}</strong> items collected</p>
                    <p><strong>{len(urls) - success_count}</strong> failed</p>
                </div>
                """

                status = f"""
                ✅ **Scraping Complete!**

                **Successfully scraped:** {success_count} URLs
                **Failed:** {len(urls) - success_count} URLs

                👉 **Next Step:** Go to Data Processing tab to clean and enhance your data
                """

                return status, stats_html
            else:
                return f"❌ Scraping failed: {', '.join(errors)}", ""

        def process_data_handler(clean_text, quality_filter, detect_language,
                                 add_sentiment, extract_entities, deduplicate, project):
            """Handle data processing"""
            if not project:
                return "❌ Please create a project first", ""

            if not studio.scraped_items:
                return "❌ No scraped data to process. Please scrape URLs first.", ""

            options = {
                'clean_text': clean_text,
                'quality_filter': quality_filter,
                'detect_language': detect_language,
                'add_sentiment': add_sentiment,
                'extract_entities': extract_entities,
                'deduplicate': deduplicate
            }

            processed_count = studio.process_data(options)

            if processed_count > 0:
                stats = studio.get_data_statistics()
                stats_html = f"""
                <div class="stat-card">
                    <h3>⚙️ Processing Complete</h3>
                    <p><strong>{processed_count}</strong> items processed</p>
                    <p>Avg Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
                    <p>Avg Words: <strong>{stats.get('avg_word_count', 0)}</strong></p>
                </div>
                """

                status = f"""
                ✅ **Processing Complete!**

                **Processed items:** {processed_count}
                **Average quality score:** {stats.get('avg_quality_score', 0)}
                **Average word count:** {stats.get('avg_word_count', 0)}

                👉 **Next Step:** Check the Data Preview tab to review your dataset
                """

                return status, stats_html
            else:
                return "❌ No items passed processing filters", ""

        def refresh_preview_handler(project):
            """Refresh data preview"""
            if not project:
                return None, {}

            preview_data = studio.get_data_preview()
            stats = studio.get_data_statistics()

            if preview_data:
                df_data = []
                for item in preview_data:
                    df_data.append([
                        item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
                        item['content_preview'],
                        item['word_count'],
                        item['quality_score'],
                        item['url'][:50] + "..." if len(item['url']) > 50 else item['url']
                    ])

                return df_data, stats

            return None, {}

        def export_dataset_handler(export_format, export_template, project):
            """Handle dataset export"""
            if not project:
                return "❌ Please create a project first", gr.update(visible=False)

            if not studio.processed_items and not studio.scraped_items:
                return "❌ No data to export. Please scrape and process data first.", gr.update(visible=False)

            try:
                filename = studio.export_dataset(export_template, export_format)

                status = f"""
                ✅ **Export Successful!**

                **Format:** {export_format}
                **Template:** {export_template.replace('_', ' ').title()}
                **File:** {filename}

                📥 **Download your dataset using the link below**
                """

                # Reveal the hidden download component with the new file
                return status, gr.update(value=filename, visible=True)

            except Exception as e:
                return f"❌ Export failed: {str(e)}", gr.update(visible=False)

        # Wire UI events to handlers
        create_project_btn.click(
            fn=create_project,
            inputs=[project_name, template_choice],
            outputs=[project_status, project_state]
        )

        scrape_btn.click(
            fn=scrape_urls_handler,
            inputs=[urls_input, urls_file, project_state],
            outputs=[scraping_status, collection_stats]
        )

        process_btn.click(
            fn=process_data_handler,
            inputs=[clean_text, quality_filter, detect_language,
                    add_sentiment, extract_entities, deduplicate, project_state],
            outputs=[processing_status, processing_stats]
        )

        refresh_preview_btn.click(
            fn=refresh_preview_handler,
            inputs=[project_state],
            outputs=[data_preview, dataset_stats]
        )

        export_btn.click(
            fn=export_dataset_handler,
            inputs=[export_format, export_template, project_state],
            outputs=[export_status, export_file]
        )

        # Refresh the preview whenever the processing status message changes
        processing_status.change(
            fn=refresh_preview_handler,
            inputs=[project_state],
            outputs=[data_preview, dataset_stats]
        )

    return interface


if __name__ == "__main__":
    logger.info("🚀 Starting AI Dataset Studio...")

    features = []
    if HAS_TRANSFORMERS:
        features.append("✅ AI Models")
    else:
        features.append("⚠️ Basic Processing")

    if HAS_NLTK:
        features.append("✅ Advanced NLP")
    else:
        features.append("⚠️ Basic NLP")

    if HAS_DATASETS:
        features.append("✅ HuggingFace Integration")
    else:
        features.append("⚠️ Standard Export Only")

    logger.info(f"📊 Features: {' | '.join(features)}")

    try:
        interface = create_modern_interface()
        logger.info("✅ Interface created successfully")

        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True,
            debug=False
        )

    except Exception as e:
        logger.error(f"❌ Failed to launch application: {e}")
        raise