""" |
|
AI Dataset Studio - Complete Application |
|
Fixed version with all classes properly defined |
|
""" |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
import json |
|
import re |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urlparse, urljoin |
|
from datetime import datetime, timedelta |
|
import logging |
|
from typing import Dict, List, Tuple, Optional, Any |
|
from dataclasses import dataclass, asdict |
|
from pathlib import Path |
|
import uuid |
|
import hashlib |
|
import time |
|
from collections import defaultdict |
|
import io |
|
|
|
|
|
try:
    from transformers import pipeline, AutoTokenizer, AutoModel
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    HAS_NLTK = True
except ImportError:
    HAS_NLTK = False

try:
    from datasets import Dataset, DatasetDict
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

if HAS_NLTK:
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
    except Exception:
        pass

@dataclass
class ScrapedItem:
    """Data class for scraped content"""
    id: str
    url: str
    title: str
    content: str
    metadata: Dict[str, Any]
    scraped_at: str
    word_count: int
    language: str = "en"
    quality_score: float = 0.0
    labels: List[str] = None
    annotations: Dict[str, Any] = None

    def __post_init__(self):
        if self.labels is None:
            self.labels = []
        if self.annotations is None:
            self.annotations = {}

@dataclass
class DatasetTemplate:
    """Template for dataset creation"""
    name: str
    description: str
    task_type: str
    required_fields: List[str]
    optional_fields: List[str]
    example_format: Dict[str, Any]
    instructions: str

class SecurityValidator:
    """Security validation for URLs and content"""

    ALLOWED_SCHEMES = {'http', 'https'}
    BLOCKED_DOMAINS = {
        'localhost', '127.0.0.1', '0.0.0.0',
        '192.168.', '10.', '172.16.', '172.17.',
        '172.18.', '172.19.', '172.20.', '172.21.',
        '172.22.', '172.23.', '172.24.', '172.25.',
        '172.26.', '172.27.', '172.28.', '172.29.',
        '172.30.', '172.31.'
    }

    @classmethod
    def validate_url(cls, url: str) -> Tuple[bool, str]:
        """Validate URL for security concerns"""
        try:
            parsed = urlparse(url)

            if parsed.scheme not in cls.ALLOWED_SCHEMES:
                return False, f"Invalid scheme: {parsed.scheme}"

            hostname = parsed.hostname or ''
            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
                return False, "Access to internal networks not allowed"

            if not parsed.netloc:
                return False, "Invalid URL format"

            return True, "URL is valid"

        except Exception as e:
            return False, f"URL validation error: {str(e)}"

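# Illustrative checks (comments only, derived from the rules above):
#   SecurityValidator.validate_url("https://example.com/post")
#       -> (True, "URL is valid")
#   SecurityValidator.validate_url("http://127.0.0.1:8080/admin")
#       -> (False, "Access to internal networks not allowed")
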
class WebScraperEngine:
    """Advanced web scraping engine"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        })

    def scrape_url(self, url: str) -> Optional[ScrapedItem]:
        """Scrape a single URL"""
        try:
            is_valid, validation_msg = SecurityValidator.validate_url(url)
            if not is_valid:
                raise ValueError(f"Security validation failed: {validation_msg}")

            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            title = self._extract_title(soup)
            content = self._extract_content(soup)
            metadata = self._extract_metadata(soup, response)

            item = ScrapedItem(
                id=str(uuid.uuid4()),
                url=url,
                title=title,
                content=content,
                metadata=metadata,
                scraped_at=datetime.now().isoformat(),
                word_count=len(content.split()),
                quality_score=self._assess_quality(content)
            )

            return item

        except Exception as e:
            logger.error(f"Failed to scrape {url}: {e}")
            return None

    def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
        """Scrape multiple URLs"""
        results = []
        total = len(urls)

        for i, url in enumerate(urls):
            if progress_callback:
                progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")

            item = self.scrape_url(url)
            if item:
                results.append(item)

            time.sleep(1)

        return results

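    # Illustrative usage (comments only): batch_scrape waits one second between
    # requests and silently skips URLs that fail validation or scraping.
    #
    #   scraper = WebScraperEngine()
    #   items = scraper.batch_scrape(
    #       ["https://example.com/a", "https://example.com/b"],
    #       progress_callback=lambda pct, msg: print(f"{pct:.0%} {msg}"),
    #   )
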
    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title"""
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text().strip()

        return "Untitled"

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract main content"""
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        content_selectors = [
            'article', 'main', '.content', '.post-content',
            '.entry-content', '.article-body'
        ]

        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(separator=' ', strip=True)
                if len(text) > 200:
                    return self._clean_text(text)

        body = soup.find('body')
        if body:
            return self._clean_text(body.get_text(separator=' ', strip=True))

        return self._clean_text(soup.get_text(separator=' ', strip=True))

    def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
        """Extract metadata"""
        metadata = {
            'domain': urlparse(response.url).netloc,
            'status_code': response.status_code,
            'extracted_at': datetime.now().isoformat()
        }

        for tag in ['description', 'keywords', 'author']:
            element = soup.find('meta', attrs={'name': tag})
            if element:
                metadata[tag] = element.get('content', '')

        return metadata

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
        return text.strip()

    def _assess_quality(self, content: str) -> float:
        """Assess content quality"""
        if not content:
            return 0.0

        score = 0.0
        word_count = len(content.split())

        if word_count >= 50:
            score += 0.4
        elif word_count >= 20:
            score += 0.2

        sentence_count = len(re.split(r'[.!?]+', content))
        if sentence_count >= 3:
            score += 0.3

        if re.search(r'[A-Z][a-z]+', content):
            score += 0.3

        return min(score, 1.0)

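    # Scoring sketch for _assess_quality (derived from the heuristics above):
    #   +0.4 for >= 50 words (or +0.2 for >= 20), +0.3 for >= 3 sentences,
    #   +0.3 if the text contains capitalized words, capped at 1.0.
    # A multi-sentence, properly capitalized article of 50+ words scores 1.0,
    # while a short all-lowercase fragment scores 0.2 or less.
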
class DataProcessor:
    """Data processing pipeline"""

    def __init__(self):
        self.sentiment_analyzer = None
        self.ner_model = None
        self._load_models()

    def _load_models(self):
        """Load NLP models"""
        if not HAS_TRANSFORMERS:
            logger.warning("⚠️ Transformers not available")
            return

        try:
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest"
            )
            logger.info("✅ Sentiment model loaded")
        except Exception as e:
            logger.warning(f"⚠️ Could not load sentiment model: {e}")

    def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]:
        """Process scraped items"""
        processed = []

        for item in items:
            try:
                if options.get('clean_text', True):
                    item.content = self._clean_text_advanced(item.content)

                if options.get('quality_filter', True) and item.quality_score < 0.3:
                    continue

                if options.get('add_sentiment', False) and self.sentiment_analyzer:
                    sentiment = self._analyze_sentiment(item.content)
                    item.metadata['sentiment'] = sentiment

                if options.get('detect_language', True):
                    item.language = self._detect_language(item.content)

                processed.append(item)

            except Exception as e:
                logger.error(f"Error processing item {item.id}: {e}")
                continue

        return processed

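    # Illustrative usage (comments only): process_items takes a plain options dict;
    # missing keys fall back to the defaults used above.
    #
    #   processor = DataProcessor()
    #   cleaned = processor.process_items(items, {
    #       "clean_text": True,       # strip URLs/emails, collapse whitespace
    #       "quality_filter": True,   # drop items with quality_score < 0.3
    #       "add_sentiment": False,   # needs the transformers pipeline
    #       "detect_language": True,  # naive character-range heuristic
    #   })
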
    def _clean_text_advanced(self, text: str) -> str:
        """Advanced text cleaning"""
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
        """Analyze sentiment"""
        try:
            text_sample = text[:512]
            result = self.sentiment_analyzer(text_sample)[0]
            return {
                'label': result['label'],
                'score': result['score']
            }
        except Exception:
            return {'label': 'UNKNOWN', 'score': 0.0}

    def _detect_language(self, text: str) -> str:
        """Simple language detection"""
        if re.search(r'[а-яё]', text.lower()):
            return 'ru'
        elif re.search(r'[ñáéíóúü]', text.lower()):
            return 'es'
        return 'en'

class AnnotationEngine:
    """Annotation tools for dataset creation"""

    def __init__(self):
        self.templates = self._load_templates()

    def _load_templates(self) -> Dict[str, DatasetTemplate]:
        """Load dataset templates"""
        templates = {
            'text_classification': DatasetTemplate(
                name="Text Classification",
                description="Classify text into categories",
                task_type="classification",
                required_fields=["text", "label"],
                optional_fields=["confidence", "metadata"],
                example_format={"text": "Sample text", "label": "positive"},
                instructions="Label each text with appropriate category"
            ),
            'sentiment_analysis': DatasetTemplate(
                name="Sentiment Analysis",
                description="Analyze emotional tone",
                task_type="classification",
                required_fields=["text", "sentiment"],
                optional_fields=["confidence", "aspects"],
                example_format={"text": "I love this!", "sentiment": "positive"},
                instructions="Classify sentiment as positive, negative, or neutral"
            ),
            'named_entity_recognition': DatasetTemplate(
                name="Named Entity Recognition",
                description="Identify named entities",
                task_type="ner",
                required_fields=["text", "entities"],
                optional_fields=["metadata"],
                example_format={
                    "text": "John works at OpenAI",
                    "entities": [{"text": "John", "label": "PERSON"}]
                },
                instructions="Mark all named entities"
            ),
            'question_answering': DatasetTemplate(
                name="Question Answering",
                description="Create Q&A pairs",
                task_type="qa",
                required_fields=["context", "question", "answer"],
                optional_fields=["answer_start", "metadata"],
                example_format={
                    "context": "The capital of France is Paris.",
                    "question": "What is the capital of France?",
                    "answer": "Paris"
                },
                instructions="Create meaningful questions and answers"
            ),
            'summarization': DatasetTemplate(
                name="Text Summarization",
                description="Create summaries",
                task_type="summarization",
                required_fields=["text", "summary"],
                optional_fields=["summary_type", "length"],
                example_format={
                    "text": "Long article text...",
                    "summary": "Brief summary"
                },
                instructions="Write clear, concise summaries"
            )
        }
        return templates

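# Note (added for clarity): a record is exported only if it carries every field in
# the chosen template's required_fields. Scraped items supply "text" automatically;
# fields such as "label" or "sentiment" come from the annotations dict passed to
# DatasetExporter.export_dataset, and records missing a required field are skipped.
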
class DatasetExporter:
    """Export datasets in various formats"""

    def __init__(self):
        self.supported_formats = [
            'json', 'csv', 'jsonl', 'huggingface_datasets'
        ]

    def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
                       export_format: str, annotations: Dict[str, Any] = None) -> str:
        """Export dataset"""
        try:
            dataset_data = self._prepare_data(items, template, annotations)

            if export_format == 'json':
                return self._export_json(dataset_data)
            elif export_format == 'csv':
                return self._export_csv(dataset_data)
            elif export_format == 'jsonl':
                return self._export_jsonl(dataset_data)
            elif export_format == 'huggingface_datasets':
                return self._export_huggingface(dataset_data, template)
            else:
                raise ValueError(f"Unsupported format: {export_format}")

        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise

    def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate,
                      annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
        """Prepare data according to template"""
        dataset_data = []

        for item in items:
            data_point = {
                'text': item.content,
                'title': item.title,
                'url': item.url,
                'metadata': item.metadata
            }

            if annotations and item.id in annotations:
                data_point.update(annotations[item.id])

            formatted = self._format_for_template(data_point, template)
            if formatted:
                dataset_data.append(formatted)

        return dataset_data

    def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Optional[Dict[str, Any]]:
        """Format data according to template; returns None if a required field is missing"""
        formatted = {}

        for field in template.required_fields:
            if field in data_point:
                formatted[field] = data_point[field]
            elif field == 'text' and 'content' in data_point:
                formatted[field] = data_point['content']
            else:
                return None

        for field in template.optional_fields:
            if field in data_point:
                formatted[field] = data_point[field]

        return formatted

    def _export_json(self, data: List[Dict[str, Any]]) -> str:
        """Export as JSON"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        return filename

    def _export_csv(self, data: List[Dict[str, Any]]) -> str:
        """Export as CSV"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.csv"

        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)

        return filename

    def _export_jsonl(self, data: List[Dict[str, Any]]) -> str:
        """Export as JSONL"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.jsonl"

        with open(filename, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        return filename

    def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
        """Export as HuggingFace Dataset"""
        if not HAS_DATASETS:
            raise ImportError("datasets library not available")

        dataset = Dataset.from_list(data)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"

        dataset.save_to_disk(dataset_name)
        return dataset_name

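# Illustrative usage (comments only; annotations are keyed by ScrapedItem.id):
#
#   exporter = DatasetExporter()
#   path = exporter.export_dataset(
#       items,
#       AnnotationEngine().templates["sentiment_analysis"],
#       "jsonl",
#       annotations={items[0].id: {"sentiment": "positive"}},
#   )
#   # writes dataset_<timestamp>.jsonl to the working directory and returns its name
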
class DatasetStudio:
    """Main application orchestrator"""

    def __init__(self):
        self.scraper = WebScraperEngine()
        self.processor = DataProcessor()
        self.annotator = AnnotationEngine()
        self.exporter = DatasetExporter()

        self.scraped_items = []
        self.processed_items = []
        self.current_project = None
        self.annotation_state = {}

        logger.info("✅ DatasetStudio initialized successfully")

    def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]:
        """Start new project"""
        self.current_project = {
            'name': project_name,
            'template': template_type,
            'created_at': datetime.now().isoformat(),
            'id': str(uuid.uuid4())
        }

        self.scraped_items = []
        self.processed_items = []
        self.annotation_state = {}

        logger.info(f"📋 New project: {project_name}")
        return self.current_project

    def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
        """Scrape URLs"""
        url_list = [url.strip() for url in urls if url.strip()]

        if not url_list:
            return 0, ["No valid URLs provided"]

        logger.info(f"🕷️ Scraping {len(url_list)} URLs")
        self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback)

        success = len(self.scraped_items)
        failed = len(url_list) - success

        errors = []
        if failed > 0:
            errors.append(f"{failed} URLs failed")

        logger.info(f"✅ Scraped {success}, failed {failed}")
        return success, errors

    def process_data(self, options: Dict[str, bool]) -> int:
        """Process scraped data"""
        if not self.scraped_items:
            return 0

        logger.info(f"⚙️ Processing {len(self.scraped_items)} items")
        self.processed_items = self.processor.process_items(self.scraped_items, options)

        logger.info(f"✅ Processed {len(self.processed_items)} items")
        return len(self.processed_items)

    def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]:
        """Get data preview"""
        items = self.processed_items or self.scraped_items

        preview = []
        for item in items[:num_items]:
            preview.append({
                'title': item.title,
                'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content,
                'word_count': item.word_count,
                'quality_score': round(item.quality_score, 2),
                'url': item.url
            })

        return preview

    def get_data_statistics(self) -> Dict[str, Any]:
        """Get dataset statistics"""
        items = self.processed_items or self.scraped_items

        if not items:
            return {}

        word_counts = [item.word_count for item in items]
        quality_scores = [item.quality_score for item in items]

        return {
            'total_items': len(items),
            'avg_word_count': round(float(np.mean(word_counts))),
            'avg_quality_score': round(float(np.mean(quality_scores)), 2),
            'word_count_range': [min(word_counts), max(word_counts)],
            'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)],
            'languages': list(set(item.language for item in items)),
            'domains': list(set(urlparse(item.url).netloc for item in items))
        }

    def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str:
        """Export dataset"""
        if not self.processed_items and not self.scraped_items:
            raise ValueError("No data to export")

        items = self.processed_items or self.scraped_items
        template = self.annotator.templates.get(template_name)

        if not template:
            raise ValueError(f"Unknown template: {template_name}")

        logger.info(f"📤 Exporting {len(items)} items")
        return self.exporter.export_dataset(items, template, export_format, annotations)

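# Illustrative end-to-end usage without the UI (comments only; the URL is a placeholder):
#
#   studio = DatasetStudio()
#   studio.start_new_project("Demo", "text_classification")
#   studio.scrape_urls(["https://example.com/article"])
#   studio.process_data({"clean_text": True, "quality_filter": True})
#   print(studio.get_data_statistics())
#   # export_dataset() also accepts an annotations dict keyed by item id; without it,
#   # templates that require fields like "label" will export an empty dataset.
#   studio.export_dataset("text_classification", "json", annotations=my_labels)
#   # my_labels: {item_id: {"label": ...}} produced by your own labeling step
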
def create_modern_interface():
    """Create the modern Gradio interface"""

    studio = DatasetStudio()

    css = """
    .gradio-container { max-width: 1400px; margin: auto; }
    .studio-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white; padding: 2rem; border-radius: 15px;
        margin-bottom: 2rem; text-align: center;
    }
    .workflow-card {
        background: #f8f9ff; border: 2px solid #e1e5ff;
        border-radius: 12px; padding: 1.5rem; margin: 1rem 0;
    }
    .step-header {
        font-size: 1.2em; font-weight: 600; color: #4c51bf;
        margin-bottom: 1rem;
    }
    """

    project_state = gr.State({})

    with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:

        gr.HTML("""
        <div class="studio-header">
            <h1>🚀 AI Dataset Studio</h1>
            <p>Create high-quality training datasets without coding</p>
        </div>
        """)

        with gr.Tabs() as main_tabs:

            with gr.Tab("🎯 Project Setup"):
                gr.HTML('<div class="step-header">Step 1: Create Your Project</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        project_name = gr.Textbox(
                            label="Project Name",
                            placeholder="My Dataset Project",
                            value="News Analysis Dataset"
                        )

                        template_choice = gr.Radio(
                            choices=[
                                ("📝 Text Classification", "text_classification"),
                                ("😊 Sentiment Analysis", "sentiment_analysis"),
                                ("👥 Named Entity Recognition", "named_entity_recognition"),
                                ("❓ Question Answering", "question_answering"),
                                ("📄 Text Summarization", "summarization")
                            ],
                            label="Dataset Type",
                            value="text_classification"
                        )

                        create_project_btn = gr.Button("🚀 Create Project", variant="primary")
                        project_status = gr.Markdown("")

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>💡 Template Guide</h3>
                            <p><strong>Text Classification:</strong> Categorize content</p>
                            <p><strong>Sentiment Analysis:</strong> Analyze emotions</p>
                            <p><strong>Named Entity Recognition:</strong> Identify entities</p>
                            <p><strong>Question Answering:</strong> Create Q&A pairs</p>
                            <p><strong>Summarization:</strong> Generate summaries</p>
                        </div>
                        """)

            with gr.Tab("🕷️ Data Collection"):
                gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        urls_input = gr.Textbox(
                            label="URLs to Scrape (one per line)",
                            placeholder="https://example.com/article1\nhttps://example.com/article2",
                            lines=8
                        )

                        scrape_btn = gr.Button("🚀 Start Scraping", variant="primary")
                        scraping_status = gr.Markdown("")

                    with gr.Column(scale=1):
                        collection_stats = gr.HTML("")

            with gr.Tab("⚙️ Data Processing"):
                gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column():
                                clean_text = gr.Checkbox(label="🧹 Text Cleaning", value=True)
                                quality_filter = gr.Checkbox(label="🎯 Quality Filter", value=True)
                                detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)

                            with gr.Column():
                                add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
                                extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)

                        process_btn = gr.Button("⚙️ Process Data", variant="primary")
                        processing_status = gr.Markdown("")

                    with gr.Column(scale=1):
                        processing_stats = gr.HTML("")

            with gr.Tab("📊 Data Preview"):
                gr.HTML('<div class="step-header">Step 4: Review Dataset</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        refresh_btn = gr.Button("🔄 Refresh Preview", variant="secondary")

                        data_preview = gr.DataFrame(
                            headers=["Title", "Content Preview", "Words", "Quality", "URL"],
                            label="Dataset Preview"
                        )

                    with gr.Column(scale=1):
                        dataset_stats = gr.JSON(label="Statistics")

            with gr.Tab("📤 Export Dataset"):
                gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        export_format = gr.Radio(
                            choices=[
                                ("📄 JSON", "json"),
                                ("📊 CSV", "csv"),
                                ("📋 JSONL", "jsonl"),
                                ("🤗 HuggingFace", "huggingface_datasets")
                            ],
                            label="Export Format",
                            value="json"
                        )

                        export_template = gr.Dropdown(
                            choices=[
                                "text_classification",
                                "sentiment_analysis",
                                "named_entity_recognition",
                                "question_answering",
                                "summarization"
                            ],
                            label="Template",
                            value="text_classification"
                        )

                        export_btn = gr.Button("📤 Export Dataset", variant="primary")
                        export_status = gr.Markdown("")
                        export_file = gr.File(label="Download", visible=False)

                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📋 Export Info</h3>
                            <p><strong>JSON:</strong> Universal format</p>
                            <p><strong>CSV:</strong> Excel compatible</p>
                            <p><strong>JSONL:</strong> Line-separated</p>
                            <p><strong>HuggingFace:</strong> ML ready</p>
                        </div>
                        """)

        def create_project(name, template):
            if not name.strip():
                return "❌ Please enter a project name", {}

            project = studio.start_new_project(name.strip(), template)
            status = f"""
✅ **Project Created!**

**Name:** {project['name']}
**Type:** {template.replace('_', ' ').title()}
**ID:** {project['id'][:8]}...

👉 Next: Go to Data Collection tab
"""
            return status, project

        def scrape_urls_handler(urls_text, project, progress=gr.Progress()):
            if not project:
                return "❌ Create a project first", ""

            urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
            if not urls:
                return "❌ No URLs provided", ""

            def progress_callback(pct, msg):
                progress(pct, desc=msg)

            success, errors = studio.scrape_urls(urls, progress_callback)

            if success > 0:
                stats = f"""
                <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
                    <h3>✅ Scraping Complete</h3>
                    <p><strong>{success}</strong> items collected</p>
                </div>
                """

                status = f"""
✅ **Scraping Complete!**

**Success:** {success} URLs
**Failed:** {len(urls) - success} URLs

👉 Next: Go to Data Processing tab
"""

                return status, stats
            else:
                return f"❌ Scraping failed: {', '.join(errors)}", ""

        def process_data_handler(clean, quality, language, sentiment, entities, project):
            if not project:
                return "❌ Create a project first", ""

            if not studio.scraped_items:
                return "❌ No data to process. Scrape URLs first.", ""

            options = {
                'clean_text': clean,
                'quality_filter': quality,
                'detect_language': language,
                'add_sentiment': sentiment,
                'extract_entities': entities
            }

            processed = studio.process_data(options)

            if processed > 0:
                stats = studio.get_data_statistics()
                stats_html = f"""
                <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
                    <h3>⚙️ Processing Complete</h3>
                    <p><strong>{processed}</strong> items processed</p>
                    <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
                </div>
                """

                status = f"""
✅ **Processing Complete!**

**Processed:** {processed} items
**Avg Quality:** {stats.get('avg_quality_score', 0)}

👉 Next: Check Data Preview tab
"""

                return status, stats_html
            else:
                return "❌ No items passed filters", ""

        def refresh_preview_handler(project):
            if not project:
                return None, {}

            preview = studio.get_data_preview()
            stats = studio.get_data_statistics()

            if preview:
                df_data = []
                for item in preview:
                    df_data.append([
                        item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
                        item['content_preview'],
                        item['word_count'],
                        item['quality_score'],
                        item['url'][:50] + "..." if len(item['url']) > 50 else item['url']
                    ])

                return df_data, stats

            return None, {}

        def export_handler(format_type, template, project):
            if not project:
                return "❌ Create a project first", None

            if not studio.processed_items and not studio.scraped_items:
                return "❌ No data to export", None

            try:
                filename = studio.export_dataset(template, format_type)

                status = f"""
✅ **Export Successful!**

**Format:** {format_type}
**File:** {filename}

📥 Download link below
"""

                return status, filename

            except Exception as e:
                return f"❌ Export failed: {str(e)}", None

        create_project_btn.click(
            fn=create_project,
            inputs=[project_name, template_choice],
            outputs=[project_status, project_state]
        )

        scrape_btn.click(
            fn=scrape_urls_handler,
            inputs=[urls_input, project_state],
            outputs=[scraping_status, collection_stats]
        )

        process_btn.click(
            fn=process_data_handler,
            inputs=[clean_text, quality_filter, detect_language,
                    add_sentiment, extract_entities, project_state],
            outputs=[processing_status, processing_stats]
        )

        refresh_btn.click(
            fn=refresh_preview_handler,
            inputs=[project_state],
            outputs=[data_preview, dataset_stats]
        )

        export_btn.click(
            fn=export_handler,
            inputs=[export_format, export_template, project_state],
            outputs=[export_status, export_file]
        )

    return interface

if __name__ == "__main__":
    logger.info("🚀 Starting AI Dataset Studio...")

    features = []
    if HAS_TRANSFORMERS:
        features.append("✅ AI Models")
    else:
        features.append("⚠️ Basic Processing")

    if HAS_NLTK:
        features.append("✅ Advanced NLP")
    else:
        features.append("⚠️ Basic NLP")

    if HAS_DATASETS:
        features.append("✅ HuggingFace Integration")
    else:
        features.append("⚠️ Standard Export")

    logger.info(f"📊 Features: {' | '.join(features)}")

    try:
        test_studio = DatasetStudio()
        logger.info("✅ DatasetStudio test passed")

        interface = create_modern_interface()
        logger.info("✅ Interface created successfully")

        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True
        )

    except Exception as e:
        logger.error(f"❌ Failed to launch: {e}")
        logger.error("💡 Try: python app_minimal.py")
        raise