"""
AI Dataset Studio - Complete Application
Fixed version with all classes properly defined
"""
import gradio as gr
import pandas as pd
import numpy as np
import json
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from datetime import datetime, timedelta
import logging
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from pathlib import Path
import uuid
import hashlib
import time
from collections import defaultdict
import io
# Optional imports with fallbacks
try:
    from transformers import pipeline, AutoTokenizer, AutoModel
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    HAS_NLTK = True
except ImportError:
    HAS_NLTK = False

try:
    from datasets import Dataset, DatasetDict
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Download NLTK data if available
if HAS_NLTK:
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
    except Exception:
        pass

@dataclass
class ScrapedItem:
    """Data class for scraped content"""
    id: str
    url: str
    title: str
    content: str
    metadata: Dict[str, Any]
    scraped_at: str
    word_count: int
    language: str = "en"
    quality_score: float = 0.0
    labels: List[str] = None
    annotations: Dict[str, Any] = None

    def __post_init__(self):
        if self.labels is None:
            self.labels = []
        if self.annotations is None:
            self.annotations = {}

@dataclass
class DatasetTemplate:
    """Template for dataset creation"""
    name: str
    description: str
    task_type: str
    required_fields: List[str]
    optional_fields: List[str]
    example_format: Dict[str, Any]
    instructions: str

class SecurityValidator:
    """Security validation for URLs and content"""
    ALLOWED_SCHEMES = {'http', 'https'}
    BLOCKED_DOMAINS = {
        'localhost', '127.0.0.1', '0.0.0.0',
        '192.168.', '10.', '172.16.', '172.17.',
        '172.18.', '172.19.', '172.20.', '172.21.',
        '172.22.', '172.23.', '172.24.', '172.25.',
        '172.26.', '172.27.', '172.28.', '172.29.',
        '172.30.', '172.31.'
    }

    @classmethod
    def validate_url(cls, url: str) -> Tuple[bool, str]:
        """Validate URL for security concerns"""
        try:
            parsed = urlparse(url)
            if parsed.scheme not in cls.ALLOWED_SCHEMES:
                return False, f"Invalid scheme: {parsed.scheme}"
            hostname = parsed.hostname or ''
            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
                return False, "Access to internal networks not allowed"
            if not parsed.netloc:
                return False, "Invalid URL format"
            return True, "URL is valid"
        except Exception as e:
            return False, f"URL validation error: {str(e)}"

class WebScraperEngine:
    """Advanced web scraping engine"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        })

    def scrape_url(self, url: str) -> Optional[ScrapedItem]:
        """Scrape a single URL"""
        try:
            # Validate URL
            is_valid, validation_msg = SecurityValidator.validate_url(url)
            if not is_valid:
                raise ValueError(f"Security validation failed: {validation_msg}")
            # Fetch content
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            # Parse HTML
            soup = BeautifulSoup(response.content, 'html.parser')
            # Extract data
            title = self._extract_title(soup)
            content = self._extract_content(soup)
            metadata = self._extract_metadata(soup, response)
            # Create item
            item = ScrapedItem(
                id=str(uuid.uuid4()),
                url=url,
                title=title,
                content=content,
                metadata=metadata,
                scraped_at=datetime.now().isoformat(),
                word_count=len(content.split()),
                quality_score=self._assess_quality(content)
            )
            return item
        except Exception as e:
            logger.error(f"Failed to scrape {url}: {e}")
            return None

    def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
        """Scrape multiple URLs"""
        results = []
        total = len(urls)
        for i, url in enumerate(urls):
            if progress_callback:
                progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")
            item = self.scrape_url(url)
            if item:
                results.append(item)
            time.sleep(1)  # Rate limiting
        return results

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title"""
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()
        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text().strip()
        return "Untitled"

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract main content"""
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()
        # Try content selectors
        content_selectors = [
            'article', 'main', '.content', '.post-content',
            '.entry-content', '.article-body'
        ]
        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(separator=' ', strip=True)
                if len(text) > 200:
                    return self._clean_text(text)
        # Fallback to body
        body = soup.find('body')
        if body:
            return self._clean_text(body.get_text(separator=' ', strip=True))
        return self._clean_text(soup.get_text(separator=' ', strip=True))

    def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
        """Extract metadata"""
        metadata = {
            'domain': urlparse(response.url).netloc,
            'status_code': response.status_code,
            'extracted_at': datetime.now().isoformat()
        }
        # Extract meta tags
        for tag in ['description', 'keywords', 'author']:
            element = soup.find('meta', attrs={'name': tag})
            if element:
                metadata[tag] = element.get('content', '')
        return metadata

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
        return text.strip()

    def _assess_quality(self, content: str) -> float:
        """Assess content quality"""
        if not content:
            return 0.0
        score = 0.0
        word_count = len(content.split())
        if word_count >= 50:
            score += 0.4
        elif word_count >= 20:
            score += 0.2
        sentence_count = len(re.split(r'[.!?]+', content))
        if sentence_count >= 3:
            score += 0.3
        if re.search(r'[A-Z][a-z]+', content):
            score += 0.3
        return min(score, 1.0)

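# Usage sketch (illustrative; assumes network access and a reachable page, URL is a placeholder):
#   scraper = WebScraperEngine()
#   item = scraper.scrape_url("https://example.com/article")   # returns a ScrapedItem, or None on failure
#   items = scraper.batch_scrape(["https://example.com/a", "https://example.com/b"])  # sleeps 1s between requests
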
class DataProcessor:
    """Data processing pipeline"""

    def __init__(self):
        self.sentiment_analyzer = None
        self.ner_model = None
        self._load_models()

    def _load_models(self):
        """Load NLP models"""
        if not HAS_TRANSFORMERS:
            logger.warning("⚠️ Transformers not available")
            return
        try:
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest"
            )
            logger.info("✅ Sentiment model loaded")
        except Exception as e:
            logger.warning(f"⚠️ Could not load sentiment model: {e}")

    def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]:
        """Process scraped items"""
        processed = []
        for item in items:
            try:
                # Clean text
                if options.get('clean_text', True):
                    item.content = self._clean_text_advanced(item.content)
                # Quality filter
                if options.get('quality_filter', True) and item.quality_score < 0.3:
                    continue
                # Add sentiment
                if options.get('add_sentiment', False) and self.sentiment_analyzer:
                    sentiment = self._analyze_sentiment(item.content)
                    item.metadata['sentiment'] = sentiment
                # Language detection
                if options.get('detect_language', True):
                    item.language = self._detect_language(item.content)
                processed.append(item)
            except Exception as e:
                logger.error(f"Error processing item {item.id}: {e}")
                continue
        return processed

    def _clean_text_advanced(self, text: str) -> str:
        """Advanced text cleaning"""
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
        """Analyze sentiment"""
        try:
            text_sample = text[:512]
            result = self.sentiment_analyzer(text_sample)[0]
            return {
                'label': result['label'],
                'score': result['score']
            }
        except Exception:
            return {'label': 'UNKNOWN', 'score': 0.0}

    def _detect_language(self, text: str) -> str:
        """Simple language detection"""
        if re.search(r'[а-яё]', text.lower()):
            return 'ru'
        elif re.search(r'[ñÑéíóúü]', text.lower()):
            return 'es'
        return 'en'

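# Usage sketch (illustrative; the option keys mirror those read in process_items):
#   processor = DataProcessor()
#   options = {'clean_text': True, 'quality_filter': True, 'add_sentiment': False, 'detect_language': True}
#   kept = processor.process_items(items, options)   # items with quality_score below 0.3 are dropped
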
class AnnotationEngine:
    """Annotation tools for dataset creation"""

    def __init__(self):
        self.templates = self._load_templates()

    def _load_templates(self) -> Dict[str, DatasetTemplate]:
        """Load dataset templates"""
        templates = {
            'text_classification': DatasetTemplate(
                name="Text Classification",
                description="Classify text into categories",
                task_type="classification",
                required_fields=["text", "label"],
                optional_fields=["confidence", "metadata"],
                example_format={"text": "Sample text", "label": "positive"},
                instructions="Label each text with appropriate category"
            ),
            'sentiment_analysis': DatasetTemplate(
                name="Sentiment Analysis",
                description="Analyze emotional tone",
                task_type="classification",
                required_fields=["text", "sentiment"],
                optional_fields=["confidence", "aspects"],
                example_format={"text": "I love this!", "sentiment": "positive"},
                instructions="Classify sentiment as positive, negative, or neutral"
            ),
            'named_entity_recognition': DatasetTemplate(
                name="Named Entity Recognition",
                description="Identify named entities",
                task_type="ner",
                required_fields=["text", "entities"],
                optional_fields=["metadata"],
                example_format={
                    "text": "John works at OpenAI",
                    "entities": [{"text": "John", "label": "PERSON"}]
                },
                instructions="Mark all named entities"
            ),
            'question_answering': DatasetTemplate(
                name="Question Answering",
                description="Create Q&A pairs",
                task_type="qa",
                required_fields=["context", "question", "answer"],
                optional_fields=["answer_start", "metadata"],
                example_format={
                    "context": "The capital of France is Paris.",
                    "question": "What is the capital of France?",
                    "answer": "Paris"
                },
                instructions="Create meaningful questions and answers"
            ),
            'summarization': DatasetTemplate(
                name="Text Summarization",
                description="Create summaries",
                task_type="summarization",
                required_fields=["text", "summary"],
                optional_fields=["summary_type", "length"],
                example_format={
                    "text": "Long article text...",
                    "summary": "Brief summary"
                },
                instructions="Write clear, concise summaries"
            )
        }
        return templates

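# Usage sketch (illustrative): templates are plain dataclasses keyed by task name.
#   engine = AnnotationEngine()
#   qa = engine.templates['question_answering']   # required_fields == ['context', 'question', 'answer']
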
class DatasetExporter:
    """Export datasets in various formats"""

    def __init__(self):
        self.supported_formats = [
            'json', 'csv', 'jsonl', 'huggingface_datasets'
        ]

    def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
                       export_format: str, annotations: Dict[str, Any] = None) -> str:
        """Export dataset"""
        try:
            dataset_data = self._prepare_data(items, template, annotations)
            if export_format == 'json':
                return self._export_json(dataset_data)
            elif export_format == 'csv':
                return self._export_csv(dataset_data)
            elif export_format == 'jsonl':
                return self._export_jsonl(dataset_data)
            elif export_format == 'huggingface_datasets':
                return self._export_huggingface(dataset_data, template)
            else:
                raise ValueError(f"Unsupported format: {export_format}")
        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise

    def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate,
                      annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
        """Prepare data according to template"""
        dataset_data = []
        for item in items:
            data_point = {
                'text': item.content,
                'title': item.title,
                'url': item.url,
                'metadata': item.metadata
            }
            if annotations and item.id in annotations:
                data_point.update(annotations[item.id])
            formatted = self._format_for_template(data_point, template)
            if formatted:
                dataset_data.append(formatted)
        return dataset_data

    def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]:
        """Format data according to template"""
        formatted = {}
        for field in template.required_fields:
            if field in data_point:
                formatted[field] = data_point[field]
            elif field == 'text' and 'content' in data_point:
                formatted[field] = data_point['content']
            else:
                return None
        for field in template.optional_fields:
            if field in data_point:
                formatted[field] = data_point[field]
        return formatted

    def _export_json(self, data: List[Dict[str, Any]]) -> str:
        """Export as JSON"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        return filename

    def _export_csv(self, data: List[Dict[str, Any]]) -> str:
        """Export as CSV"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.csv"
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        return filename

    def _export_jsonl(self, data: List[Dict[str, Any]]) -> str:
        """Export as JSONL"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.jsonl"
        with open(filename, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        return filename

    def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
        """Export as HuggingFace Dataset"""
        if not HAS_DATASETS:
            raise ImportError("datasets library not available")
        dataset = Dataset.from_list(data)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
        dataset.save_to_disk(dataset_name)
        return dataset_name

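# Usage sketch (illustrative; files are written to the working directory). Note that required fields
# not present on an item (e.g. 'label' for classification) must be supplied via annotations, otherwise
# _format_for_template skips the item:
#   exporter = DatasetExporter()
#   path = exporter.export_dataset(items, template, 'jsonl',
#                                  annotations={item.id: {'label': 'positive'} for item in items})
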
class DatasetStudio:
    """Main application orchestrator"""

    def __init__(self):
        self.scraper = WebScraperEngine()
        self.processor = DataProcessor()
        self.annotator = AnnotationEngine()
        self.exporter = DatasetExporter()
        # Application state
        self.scraped_items = []
        self.processed_items = []
        self.current_project = None
        self.annotation_state = {}
        logger.info("✅ DatasetStudio initialized successfully")

    def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]:
        """Start new project"""
        self.current_project = {
            'name': project_name,
            'template': template_type,
            'created_at': datetime.now().isoformat(),
            'id': str(uuid.uuid4())
        }
        self.scraped_items = []
        self.processed_items = []
        self.annotation_state = {}
        logger.info(f"📋 New project: {project_name}")
        return self.current_project

    def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
        """Scrape URLs"""
        url_list = [url.strip() for url in urls if url.strip()]
        if not url_list:
            return 0, ["No valid URLs provided"]
        logger.info(f"🕷️ Scraping {len(url_list)} URLs")
        self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback)
        success = len(self.scraped_items)
        failed = len(url_list) - success
        errors = []
        if failed > 0:
            errors.append(f"{failed} URLs failed")
        logger.info(f"✅ Scraped {success}, failed {failed}")
        return success, errors

    def process_data(self, options: Dict[str, bool]) -> int:
        """Process scraped data"""
        if not self.scraped_items:
            return 0
        logger.info(f"⚙️ Processing {len(self.scraped_items)} items")
        self.processed_items = self.processor.process_items(self.scraped_items, options)
        logger.info(f"✅ Processed {len(self.processed_items)} items")
        return len(self.processed_items)

    def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]:
        """Get data preview"""
        items = self.processed_items or self.scraped_items
        preview = []
        for item in items[:num_items]:
            preview.append({
                'title': item.title,
                'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content,
                'word_count': item.word_count,
                'quality_score': round(item.quality_score, 2),
                'url': item.url
            })
        return preview

    def get_data_statistics(self) -> Dict[str, Any]:
        """Get dataset statistics"""
        items = self.processed_items or self.scraped_items
        if not items:
            return {}
        word_counts = [item.word_count for item in items]
        quality_scores = [item.quality_score for item in items]
        return {
            'total_items': len(items),
            'avg_word_count': round(np.mean(word_counts)),
            'avg_quality_score': round(np.mean(quality_scores), 2),
            'word_count_range': [min(word_counts), max(word_counts)],
            'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)],
            'languages': list(set(item.language for item in items)),
            'domains': list(set(urlparse(item.url).netloc for item in items))
        }

    def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str:
        """Export dataset"""
        if not self.processed_items and not self.scraped_items:
            raise ValueError("No data to export")
        items = self.processed_items or self.scraped_items
        template = self.annotator.templates.get(template_name)
        if not template:
            raise ValueError(f"Unknown template: {template_name}")
        logger.info(f"📤 Exporting {len(items)} items")
        return self.exporter.export_dataset(items, template, export_format, annotations)

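# Programmatic usage sketch (the Gradio UI below wraps these same calls; URLs are placeholders):
#   studio = DatasetStudio()
#   studio.start_new_project("Demo", "text_classification")
#   studio.scrape_urls(["https://example.com/article1", "https://example.com/article2"])
#   studio.process_data({'clean_text': True, 'quality_filter': True, 'detect_language': True})
#   print(studio.get_data_statistics())
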
def create_modern_interface():
    """Create the modern Gradio interface"""
    # Initialize studio
    studio = DatasetStudio()

    # Custom CSS
    css = """
    .gradio-container { max-width: 1400px; margin: auto; }
    .studio-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white; padding: 2rem; border-radius: 15px;
        margin-bottom: 2rem; text-align: center;
    }
    .workflow-card {
        background: #f8f9ff; border: 2px solid #e1e5ff;
        border-radius: 12px; padding: 1.5rem; margin: 1rem 0;
    }
    .step-header {
        font-size: 1.2em; font-weight: 600; color: #4c51bf;
        margin-bottom: 1rem;
    }
    """

    with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
        # Per-session project state
        project_state = gr.State({})

        # Header
        gr.HTML("""
        <div class="studio-header">
            <h1>🚀 AI Dataset Studio</h1>
            <p>Create high-quality training datasets without coding</p>
        </div>
        """)
        with gr.Tabs() as main_tabs:
            # Project Setup
            with gr.Tab("🎯 Project Setup"):
                gr.HTML('<div class="step-header">Step 1: Create Your Project</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        project_name = gr.Textbox(
                            label="Project Name",
                            placeholder="My Dataset Project",
                            value="News Analysis Dataset"
                        )
                        template_choice = gr.Radio(
                            choices=[
                                ("📊 Text Classification", "text_classification"),
                                ("😊 Sentiment Analysis", "sentiment_analysis"),
                                ("👥 Named Entity Recognition", "named_entity_recognition"),
                                ("❓ Question Answering", "question_answering"),
                                ("📝 Text Summarization", "summarization")
                            ],
                            label="Dataset Type",
                            value="text_classification"
                        )
                        create_project_btn = gr.Button("🚀 Create Project", variant="primary")
                        project_status = gr.Markdown("")
                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>💡 Template Guide</h3>
                            <p><strong>Text Classification:</strong> Categorize content</p>
                            <p><strong>Sentiment Analysis:</strong> Analyze emotions</p>
                            <p><strong>Named Entity Recognition:</strong> Identify entities</p>
                            <p><strong>Question Answering:</strong> Create Q&A pairs</p>
                            <p><strong>Summarization:</strong> Generate summaries</p>
                        </div>
                        """)
            # Data Collection
            with gr.Tab("🕷️ Data Collection"):
                gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        urls_input = gr.Textbox(
                            label="URLs to Scrape (one per line)",
                            placeholder="https://example.com/article1\nhttps://example.com/article2",
                            lines=8
                        )
                        scrape_btn = gr.Button("🚀 Start Scraping", variant="primary")
                        scraping_status = gr.Markdown("")
                    with gr.Column(scale=1):
                        collection_stats = gr.HTML("")
            # Data Processing
            with gr.Tab("⚙️ Data Processing"):
                gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column():
                                clean_text = gr.Checkbox(label="🧹 Text Cleaning", value=True)
                                quality_filter = gr.Checkbox(label="🎯 Quality Filter", value=True)
                                detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)
                            with gr.Column():
                                add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
                                extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
                        process_btn = gr.Button("⚙️ Process Data", variant="primary")
                        processing_status = gr.Markdown("")
                    with gr.Column(scale=1):
                        processing_stats = gr.HTML("")
            # Data Preview
            with gr.Tab("👀 Data Preview"):
                gr.HTML('<div class="step-header">Step 4: Review Dataset</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        refresh_btn = gr.Button("🔄 Refresh Preview", variant="secondary")
                        data_preview = gr.DataFrame(
                            headers=["Title", "Content Preview", "Words", "Quality", "URL"],
                            label="Dataset Preview"
                        )
                    with gr.Column(scale=1):
                        dataset_stats = gr.JSON(label="Statistics")
            # Export
            with gr.Tab("📤 Export Dataset"):
                gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        export_format = gr.Radio(
                            choices=[
                                ("📄 JSON", "json"),
                                ("📊 CSV", "csv"),
                                ("📋 JSONL", "jsonl"),
                                ("🤗 HuggingFace", "huggingface_datasets")
                            ],
                            label="Export Format",
                            value="json"
                        )
                        export_template = gr.Dropdown(
                            choices=[
                                "text_classification",
                                "sentiment_analysis",
                                "named_entity_recognition",
                                "question_answering",
                                "summarization"
                            ],
                            label="Template",
                            value="text_classification"
                        )
                        export_btn = gr.Button("📤 Export Dataset", variant="primary")
                        export_status = gr.Markdown("")
                        export_file = gr.File(label="Download", visible=False)
                    with gr.Column(scale=1):
                        gr.HTML("""
                        <div class="workflow-card">
                            <h3>📋 Export Info</h3>
                            <p><strong>JSON:</strong> Universal format</p>
                            <p><strong>CSV:</strong> Excel compatible</p>
                            <p><strong>JSONL:</strong> Line-separated</p>
                            <p><strong>HuggingFace:</strong> ML ready</p>
                        </div>
                        """)
        # Event handlers
        def create_project(name, template):
            if not name.strip():
                return "❌ Please enter a project name", {}
            project = studio.start_new_project(name.strip(), template)
            status = f"""
✅ **Project Created!**
**Name:** {project['name']}
**Type:** {template.replace('_', ' ').title()}
**ID:** {project['id'][:8]}...
👉 Next: Go to Data Collection tab
"""
            return status, project
        def scrape_urls_handler(urls_text, project, progress=gr.Progress()):
            if not project:
                return "❌ Create a project first", ""
            urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
            if not urls:
                return "❌ No URLs provided", ""

            def progress_callback(pct, msg):
                progress(pct, desc=msg)

            success, errors = studio.scrape_urls(urls, progress_callback)
            if success > 0:
                stats = f"""
                <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
                    <h3>✅ Scraping Complete</h3>
                    <p><strong>{success}</strong> items collected</p>
                </div>
                """
                status = f"""
✅ **Scraping Complete!**
**Success:** {success} URLs
**Failed:** {len(urls) - success} URLs
👉 Next: Go to Data Processing tab
"""
                return status, stats
            else:
                return f"❌ Scraping failed: {', '.join(errors)}", ""
        def process_data_handler(clean, quality, language, sentiment, entities, project):
            if not project:
                return "❌ Create a project first", ""
            if not studio.scraped_items:
                return "❌ No data to process. Scrape URLs first.", ""
            options = {
                'clean_text': clean,
                'quality_filter': quality,
                'detect_language': language,
                'add_sentiment': sentiment,
                'extract_entities': entities
            }
            processed = studio.process_data(options)
            if processed > 0:
                stats = studio.get_data_statistics()
                stats_html = f"""
                <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
                    <h3>⚙️ Processing Complete</h3>
                    <p><strong>{processed}</strong> items processed</p>
                    <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
                </div>
                """
                status = f"""
✅ **Processing Complete!**
**Processed:** {processed} items
**Avg Quality:** {stats.get('avg_quality_score', 0)}
👉 Next: Check Data Preview tab
"""
                return status, stats_html
            else:
                return "❌ No items passed filters", ""
        def refresh_preview_handler(project):
            if not project:
                return None, {}
            preview = studio.get_data_preview()
            stats = studio.get_data_statistics()
            if preview:
                df_data = []
                for item in preview:
                    df_data.append([
                        item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
                        item['content_preview'],
                        item['word_count'],
                        item['quality_score'],
                        item['url'][:50] + "..." if len(item['url']) > 50 else item['url']
                    ])
                return df_data, stats
            return None, {}
        def export_handler(format_type, template, project):
            if not project:
                return "❌ Create a project first", None
            if not studio.processed_items and not studio.scraped_items:
                return "❌ No data to export", None
            try:
                filename = studio.export_dataset(template, format_type)
                status = f"""
✅ **Export Successful!**
**Format:** {format_type}
**File:** {filename}
📥 Download link below
"""
                return status, filename
            except Exception as e:
                return f"❌ Export failed: {str(e)}", None
        # Connect events
        create_project_btn.click(
            fn=create_project,
            inputs=[project_name, template_choice],
            outputs=[project_status, project_state]
        )
        scrape_btn.click(
            fn=scrape_urls_handler,
            inputs=[urls_input, project_state],
            outputs=[scraping_status, collection_stats]
        )
        process_btn.click(
            fn=process_data_handler,
            inputs=[clean_text, quality_filter, detect_language,
                    add_sentiment, extract_entities, project_state],
            outputs=[processing_status, processing_stats]
        )
        refresh_btn.click(
            fn=refresh_preview_handler,
            inputs=[project_state],
            outputs=[data_preview, dataset_stats]
        )
        export_btn.click(
            fn=export_handler,
            inputs=[export_format, export_template, project_state],
            outputs=[export_status, export_file]
        )

    return interface

# Launch application
if __name__ == "__main__":
    logger.info("🚀 Starting AI Dataset Studio...")
    # Check features
    features = []
    if HAS_TRANSFORMERS:
        features.append("✅ AI Models")
    else:
        features.append("⚠️ Basic Processing")
    if HAS_NLTK:
        features.append("✅ Advanced NLP")
    else:
        features.append("⚠️ Basic NLP")
    if HAS_DATASETS:
        features.append("✅ HuggingFace Integration")
    else:
        features.append("⚠️ Standard Export")
    logger.info(f"📊 Features: {' | '.join(features)}")
    try:
        # Test DatasetStudio
        test_studio = DatasetStudio()
        logger.info("✅ DatasetStudio test passed")
        interface = create_modern_interface()
        logger.info("✅ Interface created successfully")
        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True
        )
    except Exception as e:
        logger.error(f"❌ Failed to launch: {e}")
        logger.error("💡 Try: python app_minimal.py")
        raise