""" AI Dataset Studio - Complete Application Fixed version with all classes properly defined """ import gradio as gr import pandas as pd import numpy as np import json import re import requests from bs4 import BeautifulSoup from urllib.parse import urlparse, urljoin from datetime import datetime, timedelta import logging from typing import Dict, List, Tuple, Optional, Any from dataclasses import dataclass, asdict from pathlib import Path import uuid import hashlib import time from collections import defaultdict import io # Optional imports with fallbacks try: from transformers import pipeline, AutoTokenizer, AutoModel HAS_TRANSFORMERS = True except ImportError: HAS_TRANSFORMERS = False try: import nltk from nltk.tokenize import sent_tokenize, word_tokenize HAS_NLTK = True except ImportError: HAS_NLTK = False try: from datasets import Dataset, DatasetDict HAS_DATASETS = True except ImportError: HAS_DATASETS = False # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Download NLTK data if available if HAS_NLTK: try: nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) except: pass @dataclass class ScrapedItem: """Data class for scraped content""" id: str url: str title: str content: str metadata: Dict[str, Any] scraped_at: str word_count: int language: str = "en" quality_score: float = 0.0 labels: List[str] = None annotations: Dict[str, Any] = None def __post_init__(self): if self.labels is None: self.labels = [] if self.annotations is None: self.annotations = {} @dataclass class DatasetTemplate: """Template for dataset creation""" name: str description: str task_type: str required_fields: List[str] optional_fields: List[str] example_format: Dict[str, Any] instructions: str class SecurityValidator: """Security validation for URLs and content""" ALLOWED_SCHEMES = {'http', 'https'} BLOCKED_DOMAINS = { 'localhost', '127.0.0.1', '0.0.0.0', '192.168.', '10.', '172.16.', '172.17.', '172.18.', '172.19.', '172.20.', '172.21.', '172.22.', '172.23.', '172.24.', '172.25.', '172.26.', '172.27.', '172.28.', '172.29.', '172.30.', '172.31.' } @classmethod def validate_url(cls, url: str) -> Tuple[bool, str]: """Validate URL for security concerns""" try: parsed = urlparse(url) if parsed.scheme not in cls.ALLOWED_SCHEMES: return False, f"Invalid scheme: {parsed.scheme}" hostname = parsed.hostname or '' if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS): return False, "Access to internal networks not allowed" if not parsed.netloc: return False, "Invalid URL format" return True, "URL is valid" except Exception as e: return False, f"URL validation error: {str(e)}" class WebScraperEngine: """Advanced web scraping engine""" def __init__(self): self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', }) def scrape_url(self, url: str) -> Optional[ScrapedItem]: """Scrape a single URL""" try: # Validate URL is_valid, validation_msg = SecurityValidator.validate_url(url) if not is_valid: raise ValueError(f"Security validation failed: {validation_msg}") # Fetch content response = self.session.get(url, timeout=15) response.raise_for_status() # Parse HTML soup = BeautifulSoup(response.content, 'html.parser') # Extract data title = self._extract_title(soup) content = self._extract_content(soup) metadata = self._extract_metadata(soup, response) # Create item item = ScrapedItem( id=str(uuid.uuid4()), url=url, title=title, content=content, metadata=metadata, scraped_at=datetime.now().isoformat(), word_count=len(content.split()), quality_score=self._assess_quality(content) ) return item except Exception as e: logger.error(f"Failed to scrape {url}: {e}") return None def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]: """Scrape multiple URLs""" results = [] total = len(urls) for i, url in enumerate(urls): if progress_callback: progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...") item = self.scrape_url(url) if item: results.append(item) time.sleep(1) # Rate limiting return results def _extract_title(self, soup: BeautifulSoup) -> str: """Extract page title""" title_tag = soup.find('title') if title_tag: return title_tag.get_text().strip() h1_tag = soup.find('h1') if h1_tag: return h1_tag.get_text().strip() return "Untitled" def _extract_content(self, soup: BeautifulSoup) -> str: """Extract main content""" # Remove unwanted elements for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']): element.decompose() # Try content selectors content_selectors = [ 'article', 'main', '.content', '.post-content', '.entry-content', '.article-body' ] for selector in content_selectors: element = soup.select_one(selector) if element: text = element.get_text(separator=' ', strip=True) if len(text) > 200: return self._clean_text(text) # Fallback to body body = soup.find('body') if body: return self._clean_text(body.get_text(separator=' ', strip=True)) return self._clean_text(soup.get_text(separator=' ', strip=True)) def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]: """Extract metadata""" metadata = { 'domain': urlparse(response.url).netloc, 'status_code': response.status_code, 'extracted_at': datetime.now().isoformat() } # Extract meta tags for tag in ['description', 'keywords', 'author']: element = soup.find('meta', attrs={'name': tag}) if element: metadata[tag] = element.get('content', '') return metadata def _clean_text(self, text: str) -> str: """Clean extracted text""" text = re.sub(r'\s+', ' ', text) text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE) text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE) return text.strip() def _assess_quality(self, content: str) -> float: """Assess content quality""" if not content: return 0.0 score = 0.0 word_count = len(content.split()) if word_count >= 50: score += 0.4 elif word_count >= 20: score += 0.2 sentence_count = len(re.split(r'[.!?]+', content)) if sentence_count >= 3: score += 0.3 if re.search(r'[A-Z][a-z]+', content): score += 0.3 return min(score, 1.0) class DataProcessor: """Data processing pipeline""" def __init__(self): self.sentiment_analyzer = None self.ner_model = None self._load_models() def _load_models(self): """Load NLP models""" if not HAS_TRANSFORMERS: logger.warning("⚠️ Transformers not available") return try: self.sentiment_analyzer = pipeline( "sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest" ) logger.info("✅ Sentiment model loaded") except Exception as e: logger.warning(f"⚠️ Could not load sentiment model: {e}") def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]: """Process scraped items""" processed = [] for item in items: try: # Clean text if options.get('clean_text', True): item.content = self._clean_text_advanced(item.content) # Quality filter if options.get('quality_filter', True) and item.quality_score < 0.3: continue # Add sentiment if options.get('add_sentiment', False) and self.sentiment_analyzer: sentiment = self._analyze_sentiment(item.content) item.metadata['sentiment'] = sentiment # Language detection if options.get('detect_language', True): item.language = self._detect_language(item.content) processed.append(item) except Exception as e: logger.error(f"Error processing item {item.id}: {e}") continue return processed def _clean_text_advanced(self, text: str) -> str: """Advanced text cleaning""" text = re.sub(r'http\S+|www\.\S+', '', text) text = re.sub(r'\S+@\S+', '', text) text = re.sub(r'\s+', ' ', text) return text.strip() def _analyze_sentiment(self, text: str) -> Dict[str, Any]: """Analyze sentiment""" try: text_sample = text[:512] result = self.sentiment_analyzer(text_sample)[0] return { 'label': result['label'], 'score': result['score'] } except: return {'label': 'UNKNOWN', 'score': 0.0} def _detect_language(self, text: str) -> str: """Simple language detection""" if re.search(r'[а-яё]', text.lower()): return 'ru' elif re.search(r'[ñáéíóúü]', text.lower()): return 'es' return 'en' class AnnotationEngine: """Annotation tools for dataset creation""" def __init__(self): self.templates = self._load_templates() def _load_templates(self) -> Dict[str, DatasetTemplate]: """Load dataset templates""" templates = { 'text_classification': DatasetTemplate( name="Text Classification", description="Classify text into categories", task_type="classification", required_fields=["text", "label"], optional_fields=["confidence", "metadata"], example_format={"text": "Sample text", "label": "positive"}, instructions="Label each text with appropriate category" ), 'sentiment_analysis': DatasetTemplate( name="Sentiment Analysis", description="Analyze emotional tone", task_type="classification", required_fields=["text", "sentiment"], optional_fields=["confidence", "aspects"], example_format={"text": "I love this!", "sentiment": "positive"}, instructions="Classify sentiment as positive, negative, or neutral" ), 'named_entity_recognition': DatasetTemplate( name="Named Entity Recognition", description="Identify named entities", task_type="ner", required_fields=["text", "entities"], optional_fields=["metadata"], example_format={ "text": "John works at OpenAI", "entities": [{"text": "John", "label": "PERSON"}] }, instructions="Mark all named entities" ), 'question_answering': DatasetTemplate( name="Question Answering", description="Create Q&A pairs", task_type="qa", required_fields=["context", "question", "answer"], optional_fields=["answer_start", "metadata"], example_format={ "context": "The capital of France is Paris.", "question": "What is the capital of France?", "answer": "Paris" }, instructions="Create meaningful questions and answers" ), 'summarization': DatasetTemplate( name="Text Summarization", description="Create summaries", task_type="summarization", required_fields=["text", "summary"], optional_fields=["summary_type", "length"], example_format={ "text": "Long article text...", "summary": "Brief summary" }, instructions="Write clear, concise summaries" ) } return templates class DatasetExporter: """Export datasets in various formats""" def __init__(self): self.supported_formats = [ 'json', 'csv', 'jsonl', 'huggingface_datasets' ] def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate, export_format: str, annotations: Dict[str, Any] = None) -> str: """Export dataset""" try: dataset_data = self._prepare_data(items, template, annotations) if export_format == 'json': return self._export_json(dataset_data) elif export_format == 'csv': return self._export_csv(dataset_data) elif export_format == 'jsonl': return self._export_jsonl(dataset_data) elif export_format == 'huggingface_datasets': return self._export_huggingface(dataset_data, template) else: raise ValueError(f"Unsupported format: {export_format}") except Exception as e: logger.error(f"Export failed: {e}") raise def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate, annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]: """Prepare data according to template""" dataset_data = [] for item in items: data_point = { 'text': item.content, 'title': item.title, 'url': item.url, 'metadata': item.metadata } if annotations and item.id in annotations: data_point.update(annotations[item.id]) formatted = self._format_for_template(data_point, template) if formatted: dataset_data.append(formatted) return dataset_data def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]: """Format data according to template""" formatted = {} for field in template.required_fields: if field in data_point: formatted[field] = data_point[field] elif field == 'text' and 'content' in data_point: formatted[field] = data_point['content'] else: return None for field in template.optional_fields: if field in data_point: formatted[field] = data_point[field] return formatted def _export_json(self, data: List[Dict[str, Any]]) -> str: """Export as JSON""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"dataset_{timestamp}.json" with open(filename, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) return filename def _export_csv(self, data: List[Dict[str, Any]]) -> str: """Export as CSV""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"dataset_{timestamp}.csv" df = pd.DataFrame(data) df.to_csv(filename, index=False) return filename def _export_jsonl(self, data: List[Dict[str, Any]]) -> str: """Export as JSONL""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"dataset_{timestamp}.jsonl" with open(filename, 'w', encoding='utf-8') as f: for item in data: f.write(json.dumps(item, ensure_ascii=False) + '\n') return filename def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str: """Export as HuggingFace Dataset""" if not HAS_DATASETS: raise ImportError("datasets library not available") dataset = Dataset.from_list(data) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}" dataset.save_to_disk(dataset_name) return dataset_name class DatasetStudio: """Main application orchestrator""" def __init__(self): self.scraper = WebScraperEngine() self.processor = DataProcessor() self.annotator = AnnotationEngine() self.exporter = DatasetExporter() # Application state self.scraped_items = [] self.processed_items = [] self.current_project = None self.annotation_state = {} logger.info("✅ DatasetStudio initialized successfully") def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]: """Start new project""" self.current_project = { 'name': project_name, 'template': template_type, 'created_at': datetime.now().isoformat(), 'id': str(uuid.uuid4()) } self.scraped_items = [] self.processed_items = [] self.annotation_state = {} logger.info(f"📋 New project: {project_name}") return self.current_project def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]: """Scrape URLs""" url_list = [url.strip() for url in urls if url.strip()] if not url_list: return 0, ["No valid URLs provided"] logger.info(f"🕷️ Scraping {len(url_list)} URLs") self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback) success = len(self.scraped_items) failed = len(url_list) - success errors = [] if failed > 0: errors.append(f"{failed} URLs failed") logger.info(f"✅ Scraped {success}, failed {failed}") return success, errors def process_data(self, options: Dict[str, bool]) -> int: """Process scraped data""" if not self.scraped_items: return 0 logger.info(f"⚙️ Processing {len(self.scraped_items)} items") self.processed_items = self.processor.process_items(self.scraped_items, options) logger.info(f"✅ Processed {len(self.processed_items)} items") return len(self.processed_items) def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]: """Get data preview""" items = self.processed_items or self.scraped_items preview = [] for item in items[:num_items]: preview.append({ 'title': item.title, 'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content, 'word_count': item.word_count, 'quality_score': round(item.quality_score, 2), 'url': item.url }) return preview def get_data_statistics(self) -> Dict[str, Any]: """Get dataset statistics""" items = self.processed_items or self.scraped_items if not items: return {} word_counts = [item.word_count for item in items] quality_scores = [item.quality_score for item in items] return { 'total_items': len(items), 'avg_word_count': round(np.mean(word_counts)), 'avg_quality_score': round(np.mean(quality_scores), 2), 'word_count_range': [min(word_counts), max(word_counts)], 'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)], 'languages': list(set(item.language for item in items)), 'domains': list(set(urlparse(item.url).netloc for item in items)) } def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str: """Export dataset""" if not self.processed_items and not self.scraped_items: raise ValueError("No data to export") items = self.processed_items or self.scraped_items template = self.annotator.templates.get(template_name) if not template: raise ValueError(f"Unknown template: {template_name}") logger.info(f"📤 Exporting {len(items)} items") return self.exporter.export_dataset(items, template, export_format, annotations) def create_modern_interface(): """Create the modern Gradio interface""" # Initialize studio studio = DatasetStudio() # Custom CSS css = """ .gradio-container { max-width: 1400px; margin: auto; } .studio-header { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 15px; margin-bottom: 2rem; text-align: center; } .workflow-card { background: #f8f9ff; border: 2px solid #e1e5ff; border-radius: 12px; padding: 1.5rem; margin: 1rem 0; } .step-header { font-size: 1.2em; font-weight: 600; color: #4c51bf; margin-bottom: 1rem; } """ project_state = gr.State({}) with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface: # Header gr.HTML("""
Create high-quality training datasets without coding
Text Classification: Categorize content
Sentiment Analysis: Analyze emotions
Named Entity Recognition: Identify entities
Question Answering: Create Q&A pairs
Summarization: Generate summaries
JSON: Universal format
CSV: Excel compatible
JSONL: Line-separated
HuggingFace: ML ready
{success} items collected
{processed} items processed
Quality: {stats.get('avg_quality_score', 0)}