""" AI Dataset Studio - Complete Application Fixed version with all classes properly defined """ import gradio as gr import pandas as pd import numpy as np import json import re import requests from bs4 import BeautifulSoup from urllib.parse import urlparse, urljoin from datetime import datetime, timedelta import logging from typing import Dict, List, Tuple, Optional, Any from dataclasses import dataclass, asdict from pathlib import Path import uuid import hashlib import time from collections import defaultdict import io # Optional imports with fallbacks try: from transformers import pipeline, AutoTokenizer, AutoModel HAS_TRANSFORMERS = True except ImportError: HAS_TRANSFORMERS = False try: import nltk from nltk.tokenize import sent_tokenize, word_tokenize HAS_NLTK = True except ImportError: HAS_NLTK = False try: from datasets import Dataset, DatasetDict HAS_DATASETS = True except ImportError: HAS_DATASETS = False # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Download NLTK data if available if HAS_NLTK: try: nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) except: pass @dataclass class ScrapedItem: """Data class for scraped content""" id: str url: str title: str content: str metadata: Dict[str, Any] scraped_at: str word_count: int language: str = "en" quality_score: float = 0.0 labels: List[str] = None annotations: Dict[str, Any] = None def __post_init__(self): if self.labels is None: self.labels = [] if self.annotations is None: self.annotations = {} @dataclass class DatasetTemplate: """Template for dataset creation""" name: str description: str task_type: str required_fields: List[str] optional_fields: List[str] example_format: Dict[str, Any] instructions: str class SecurityValidator: """Security validation for URLs and content""" ALLOWED_SCHEMES = {'http', 'https'} BLOCKED_DOMAINS = { 'localhost', '127.0.0.1', '0.0.0.0', '192.168.', '10.', '172.16.', '172.17.', '172.18.', '172.19.', '172.20.', '172.21.', '172.22.', '172.23.', '172.24.', '172.25.', '172.26.', '172.27.', '172.28.', '172.29.', '172.30.', '172.31.' 
} @classmethod def validate_url(cls, url: str) -> Tuple[bool, str]: """Validate URL for security concerns""" try: parsed = urlparse(url) if parsed.scheme not in cls.ALLOWED_SCHEMES: return False, f"Invalid scheme: {parsed.scheme}" hostname = parsed.hostname or '' if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS): return False, "Access to internal networks not allowed" if not parsed.netloc: return False, "Invalid URL format" return True, "URL is valid" except Exception as e: return False, f"URL validation error: {str(e)}" class WebScraperEngine: """Advanced web scraping engine""" def __init__(self): self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', }) def scrape_url(self, url: str) -> Optional[ScrapedItem]: """Scrape a single URL""" try: # Validate URL is_valid, validation_msg = SecurityValidator.validate_url(url) if not is_valid: raise ValueError(f"Security validation failed: {validation_msg}") # Fetch content response = self.session.get(url, timeout=15) response.raise_for_status() # Parse HTML soup = BeautifulSoup(response.content, 'html.parser') # Extract data title = self._extract_title(soup) content = self._extract_content(soup) metadata = self._extract_metadata(soup, response) # Create item item = ScrapedItem( id=str(uuid.uuid4()), url=url, title=title, content=content, metadata=metadata, scraped_at=datetime.now().isoformat(), word_count=len(content.split()), quality_score=self._assess_quality(content) ) return item except Exception as e: logger.error(f"Failed to scrape {url}: {e}") return None def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]: """Scrape multiple URLs""" results = [] total = len(urls) for i, url in enumerate(urls): if progress_callback: progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...") item = self.scrape_url(url) if item: results.append(item) time.sleep(1) # Rate limiting return results def _extract_title(self, soup: BeautifulSoup) -> str: """Extract page title""" title_tag = soup.find('title') if title_tag: return title_tag.get_text().strip() h1_tag = soup.find('h1') if h1_tag: return h1_tag.get_text().strip() return "Untitled" def _extract_content(self, soup: BeautifulSoup) -> str: """Extract main content""" # Remove unwanted elements for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']): element.decompose() # Try content selectors content_selectors = [ 'article', 'main', '.content', '.post-content', '.entry-content', '.article-body' ] for selector in content_selectors: element = soup.select_one(selector) if element: text = element.get_text(separator=' ', strip=True) if len(text) > 200: return self._clean_text(text) # Fallback to body body = soup.find('body') if body: return self._clean_text(body.get_text(separator=' ', strip=True)) return self._clean_text(soup.get_text(separator=' ', strip=True)) def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]: """Extract metadata""" metadata = { 'domain': urlparse(response.url).netloc, 'status_code': response.status_code, 'extracted_at': datetime.now().isoformat() } # Extract meta tags for tag in ['description', 'keywords', 'author']: element = soup.find('meta', attrs={'name': tag}) if element: metadata[tag] = element.get('content', '') return metadata def _clean_text(self, text: str) -> str: 
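
# Illustrative sketch (comments only, not executed by the app): expected
# SecurityValidator results for a few example URLs, following the scheme and
# blocked-prefix checks above.
#
#     SecurityValidator.validate_url("https://example.com/post")  # -> (True, "URL is valid")
#     SecurityValidator.validate_url("ftp://example.com/file")    # -> (False, "Invalid scheme: ftp")
#     SecurityValidator.validate_url("http://127.0.0.1:8080/")    # -> (False, "Access to internal networks not allowed")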
"""Clean extracted text""" text = re.sub(r'\s+', ' ', text) text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE) text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE) return text.strip() def _assess_quality(self, content: str) -> float: """Assess content quality""" if not content: return 0.0 score = 0.0 word_count = len(content.split()) if word_count >= 50: score += 0.4 elif word_count >= 20: score += 0.2 sentence_count = len(re.split(r'[.!?]+', content)) if sentence_count >= 3: score += 0.3 if re.search(r'[A-Z][a-z]+', content): score += 0.3 return min(score, 1.0) class DataProcessor: """Data processing pipeline""" def __init__(self): self.sentiment_analyzer = None self.ner_model = None self._load_models() def _load_models(self): """Load NLP models""" if not HAS_TRANSFORMERS: logger.warning("⚠️ Transformers not available") return try: self.sentiment_analyzer = pipeline( "sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest" ) logger.info("✅ Sentiment model loaded") except Exception as e: logger.warning(f"⚠️ Could not load sentiment model: {e}") def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]: """Process scraped items""" processed = [] for item in items: try: # Clean text if options.get('clean_text', True): item.content = self._clean_text_advanced(item.content) # Quality filter if options.get('quality_filter', True) and item.quality_score < 0.3: continue # Add sentiment if options.get('add_sentiment', False) and self.sentiment_analyzer: sentiment = self._analyze_sentiment(item.content) item.metadata['sentiment'] = sentiment # Language detection if options.get('detect_language', True): item.language = self._detect_language(item.content) processed.append(item) except Exception as e: logger.error(f"Error processing item {item.id}: {e}") continue return processed def _clean_text_advanced(self, text: str) -> str: """Advanced text cleaning""" text = re.sub(r'http\S+|www\.\S+', '', text) text = re.sub(r'\S+@\S+', '', text) text = re.sub(r'\s+', ' ', text) return text.strip() def _analyze_sentiment(self, text: str) -> Dict[str, Any]: """Analyze sentiment""" try: text_sample = text[:512] result = self.sentiment_analyzer(text_sample)[0] return { 'label': result['label'], 'score': result['score'] } except: return {'label': 'UNKNOWN', 'score': 0.0} def _detect_language(self, text: str) -> str: """Simple language detection""" if re.search(r'[а-яё]', text.lower()): return 'ru' elif re.search(r'[ñáéíóúü]', text.lower()): return 'es' return 'en' class AnnotationEngine: """Annotation tools for dataset creation""" def __init__(self): self.templates = self._load_templates() def _load_templates(self) -> Dict[str, DatasetTemplate]: """Load dataset templates""" templates = { 'text_classification': DatasetTemplate( name="Text Classification", description="Classify text into categories", task_type="classification", required_fields=["text", "label"], optional_fields=["confidence", "metadata"], example_format={"text": "Sample text", "label": "positive"}, instructions="Label each text with appropriate category" ), 'sentiment_analysis': DatasetTemplate( name="Sentiment Analysis", description="Analyze emotional tone", task_type="classification", required_fields=["text", "sentiment"], optional_fields=["confidence", "aspects"], example_format={"text": "I love this!", "sentiment": "positive"}, instructions="Classify sentiment as positive, negative, or neutral" ), 'named_entity_recognition': DatasetTemplate( 
name="Named Entity Recognition", description="Identify named entities", task_type="ner", required_fields=["text", "entities"], optional_fields=["metadata"], example_format={ "text": "John works at OpenAI", "entities": [{"text": "John", "label": "PERSON"}] }, instructions="Mark all named entities" ), 'question_answering': DatasetTemplate( name="Question Answering", description="Create Q&A pairs", task_type="qa", required_fields=["context", "question", "answer"], optional_fields=["answer_start", "metadata"], example_format={ "context": "The capital of France is Paris.", "question": "What is the capital of France?", "answer": "Paris" }, instructions="Create meaningful questions and answers" ), 'summarization': DatasetTemplate( name="Text Summarization", description="Create summaries", task_type="summarization", required_fields=["text", "summary"], optional_fields=["summary_type", "length"], example_format={ "text": "Long article text...", "summary": "Brief summary" }, instructions="Write clear, concise summaries" ) } return templates class DatasetExporter: """Export datasets in various formats""" def __init__(self): self.supported_formats = [ 'json', 'csv', 'jsonl', 'huggingface_datasets' ] def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate, export_format: str, annotations: Dict[str, Any] = None) -> str: """Export dataset""" try: dataset_data = self._prepare_data(items, template, annotations) if export_format == 'json': return self._export_json(dataset_data) elif export_format == 'csv': return self._export_csv(dataset_data) elif export_format == 'jsonl': return self._export_jsonl(dataset_data) elif export_format == 'huggingface_datasets': return self._export_huggingface(dataset_data, template) else: raise ValueError(f"Unsupported format: {export_format}") except Exception as e: logger.error(f"Export failed: {e}") raise def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate, annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]: """Prepare data according to template""" dataset_data = [] for item in items: data_point = { 'text': item.content, 'title': item.title, 'url': item.url, 'metadata': item.metadata } if annotations and item.id in annotations: data_point.update(annotations[item.id]) formatted = self._format_for_template(data_point, template) if formatted: dataset_data.append(formatted) return dataset_data def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]: """Format data according to template""" formatted = {} for field in template.required_fields: if field in data_point: formatted[field] = data_point[field] elif field == 'text' and 'content' in data_point: formatted[field] = data_point['content'] else: return None for field in template.optional_fields: if field in data_point: formatted[field] = data_point[field] return formatted def _export_json(self, data: List[Dict[str, Any]]) -> str: """Export as JSON""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"dataset_{timestamp}.json" with open(filename, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) return filename def _export_csv(self, data: List[Dict[str, Any]]) -> str: """Export as CSV""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"dataset_{timestamp}.csv" df = pd.DataFrame(data) df.to_csv(filename, index=False) return filename def _export_jsonl(self, data: List[Dict[str, Any]]) -> str: """Export as JSONL""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 
filename = f"dataset_{timestamp}.jsonl" with open(filename, 'w', encoding='utf-8') as f: for item in data: f.write(json.dumps(item, ensure_ascii=False) + '\n') return filename def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str: """Export as HuggingFace Dataset""" if not HAS_DATASETS: raise ImportError("datasets library not available") dataset = Dataset.from_list(data) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}" dataset.save_to_disk(dataset_name) return dataset_name class DatasetStudio: """Main application orchestrator""" def __init__(self): self.scraper = WebScraperEngine() self.processor = DataProcessor() self.annotator = AnnotationEngine() self.exporter = DatasetExporter() # Application state self.scraped_items = [] self.processed_items = [] self.current_project = None self.annotation_state = {} logger.info("✅ DatasetStudio initialized successfully") def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]: """Start new project""" self.current_project = { 'name': project_name, 'template': template_type, 'created_at': datetime.now().isoformat(), 'id': str(uuid.uuid4()) } self.scraped_items = [] self.processed_items = [] self.annotation_state = {} logger.info(f"📋 New project: {project_name}") return self.current_project def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]: """Scrape URLs""" url_list = [url.strip() for url in urls if url.strip()] if not url_list: return 0, ["No valid URLs provided"] logger.info(f"🕷️ Scraping {len(url_list)} URLs") self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback) success = len(self.scraped_items) failed = len(url_list) - success errors = [] if failed > 0: errors.append(f"{failed} URLs failed") logger.info(f"✅ Scraped {success}, failed {failed}") return success, errors def process_data(self, options: Dict[str, bool]) -> int: """Process scraped data""" if not self.scraped_items: return 0 logger.info(f"⚙️ Processing {len(self.scraped_items)} items") self.processed_items = self.processor.process_items(self.scraped_items, options) logger.info(f"✅ Processed {len(self.processed_items)} items") return len(self.processed_items) def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]: """Get data preview""" items = self.processed_items or self.scraped_items preview = [] for item in items[:num_items]: preview.append({ 'title': item.title, 'content_preview': item.content[:200] + "..." 

def create_modern_interface():
    """Create the modern Gradio interface"""

    # Initialize studio
    studio = DatasetStudio()

    # Custom CSS
    css = """
    .gradio-container { max-width: 1400px; margin: auto; }
    .studio-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 2rem;
        border-radius: 15px;
        margin-bottom: 2rem;
        text-align: center;
    }
    .workflow-card {
        background: #f8f9ff;
        border: 2px solid #e1e5ff;
        border-radius: 12px;
        padding: 1.5rem;
        margin: 1rem 0;
    }
    .step-header {
        font-size: 1.2em;
        font-weight: 600;
        color: #4c51bf;
        margin-bottom: 1rem;
    }
    """

    project_state = gr.State({})

    with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:

        # Header
        gr.HTML("""
        <div class="studio-header">
            <h1>🚀 AI Dataset Studio</h1>
            <p>Create high-quality training datasets without coding</p>
        </div>
        """)
""") with gr.Tabs() as main_tabs: # Project Setup with gr.Tab("🎯 Project Setup"): gr.HTML('
Step 1: Create Your Project
') with gr.Row(): with gr.Column(scale=2): project_name = gr.Textbox( label="Project Name", placeholder="My Dataset Project", value="News Analysis Dataset" ) template_choice = gr.Radio( choices=[ ("📊 Text Classification", "text_classification"), ("😊 Sentiment Analysis", "sentiment_analysis"), ("👥 Named Entity Recognition", "named_entity_recognition"), ("❓ Question Answering", "question_answering"), ("📝 Text Summarization", "summarization") ], label="Dataset Type", value="text_classification" ) create_project_btn = gr.Button("🚀 Create Project", variant="primary") project_status = gr.Markdown("") with gr.Column(scale=1): gr.HTML("""

💡 Template Guide

Text Classification: Categorize content

Sentiment Analysis: Analyze emotions

Named Entity Recognition: Identify entities

Question Answering: Create Q&A pairs

Summarization: Generate summaries

""") # Data Collection with gr.Tab("🕷️ Data Collection"): gr.HTML('
Step 2: Collect Your Data
') with gr.Row(): with gr.Column(scale=2): urls_input = gr.Textbox( label="URLs to Scrape (one per line)", placeholder="https://example.com/article1\nhttps://example.com/article2", lines=8 ) scrape_btn = gr.Button("🚀 Start Scraping", variant="primary") scraping_status = gr.Markdown("") with gr.Column(scale=1): collection_stats = gr.HTML("") # Data Processing with gr.Tab("⚙️ Data Processing"): gr.HTML('
Step 3: Clean & Enhance
') with gr.Row(): with gr.Column(scale=2): with gr.Row(): with gr.Column(): clean_text = gr.Checkbox(label="🧹 Text Cleaning", value=True) quality_filter = gr.Checkbox(label="🎯 Quality Filter", value=True) detect_language = gr.Checkbox(label="🌍 Language Detection", value=True) with gr.Column(): add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False) extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False) process_btn = gr.Button("⚙️ Process Data", variant="primary") processing_status = gr.Markdown("") with gr.Column(scale=1): processing_stats = gr.HTML("") # Data Preview with gr.Tab("👀 Data Preview"): gr.HTML('
Step 4: Review Dataset
') with gr.Row(): with gr.Column(scale=2): refresh_btn = gr.Button("🔄 Refresh Preview", variant="secondary") data_preview = gr.DataFrame( headers=["Title", "Content Preview", "Words", "Quality", "URL"], label="Dataset Preview" ) with gr.Column(scale=1): dataset_stats = gr.JSON(label="Statistics") # Export with gr.Tab("📤 Export Dataset"): gr.HTML('
Step 5: Export Your Dataset
') with gr.Row(): with gr.Column(scale=2): export_format = gr.Radio( choices=[ ("📄 JSON", "json"), ("📊 CSV", "csv"), ("📋 JSONL", "jsonl"), ("🤗 HuggingFace", "huggingface_datasets") ], label="Export Format", value="json" ) export_template = gr.Dropdown( choices=[ "text_classification", "sentiment_analysis", "named_entity_recognition", "question_answering", "summarization" ], label="Template", value="text_classification" ) export_btn = gr.Button("📤 Export Dataset", variant="primary") export_status = gr.Markdown("") export_file = gr.File(label="Download", visible=False) with gr.Column(scale=1): gr.HTML("""

📋 Export Info

JSON: Universal format

CSV: Excel compatible

JSONL: Line-separated

HuggingFace: ML ready

""") # Event handlers def create_project(name, template): if not name.strip(): return "❌ Please enter a project name", {} project = studio.start_new_project(name.strip(), template) status = f""" ✅ **Project Created!** **Name:** {project['name']} **Type:** {template.replace('_', ' ').title()} **ID:** {project['id'][:8]}... 👉 Next: Go to Data Collection tab """ return status, project def scrape_urls_handler(urls_text, project, progress=gr.Progress()): if not project: return "❌ Create a project first", "" urls = [url.strip() for url in urls_text.split('\n') if url.strip()] if not urls: return "❌ No URLs provided", "" def progress_callback(pct, msg): progress(pct, desc=msg) success, errors = studio.scrape_urls(urls, progress_callback) if success > 0: stats = f"""

✅ Scraping Complete

{success} items collected

""" status = f""" ✅ **Scraping Complete!** **Success:** {success} URLs **Failed:** {len(urls) - success} URLs 👉 Next: Go to Data Processing tab """ return status, stats else: return f"❌ Scraping failed: {', '.join(errors)}", "" def process_data_handler(clean, quality, language, sentiment, entities, project): if not project: return "❌ Create a project first", "" if not studio.scraped_items: return "❌ No data to process. Scrape URLs first.", "" options = { 'clean_text': clean, 'quality_filter': quality, 'detect_language': language, 'add_sentiment': sentiment, 'extract_entities': entities } processed = studio.process_data(options) if processed > 0: stats = studio.get_data_statistics() stats_html = f"""

⚙️ Processing Complete

{processed} items processed

Quality: {stats.get('avg_quality_score', 0)}

""" status = f""" ✅ **Processing Complete!** **Processed:** {processed} items **Avg Quality:** {stats.get('avg_quality_score', 0)} 👉 Next: Check Data Preview tab """ return status, stats_html else: return "❌ No items passed filters", "" def refresh_preview_handler(project): if not project: return None, {} preview = studio.get_data_preview() stats = studio.get_data_statistics() if preview: df_data = [] for item in preview: df_data.append([ item['title'][:50] + "..." if len(item['title']) > 50 else item['title'], item['content_preview'], item['word_count'], item['quality_score'], item['url'][:50] + "..." if len(item['url']) > 50 else item['url'] ]) return df_data, stats return None, {} def export_handler(format_type, template, project): if not project: return "❌ Create a project first", None if not studio.processed_items and not studio.scraped_items: return "❌ No data to export", None try: filename = studio.export_dataset(template, format_type) status = f""" ✅ **Export Successful!** **Format:** {format_type} **File:** {filename} 📥 Download link below """ return status, filename except Exception as e: return f"❌ Export failed: {str(e)}", None # Connect events create_project_btn.click( fn=create_project, inputs=[project_name, template_choice], outputs=[project_status, project_state] ) scrape_btn.click( fn=scrape_urls_handler, inputs=[urls_input, project_state], outputs=[scraping_status, collection_stats] ) process_btn.click( fn=process_data_handler, inputs=[clean_text, quality_filter, detect_language, add_sentiment, extract_entities, project_state], outputs=[processing_status, processing_stats] ) refresh_btn.click( fn=refresh_preview_handler, inputs=[project_state], outputs=[data_preview, dataset_stats] ) export_btn.click( fn=export_handler, inputs=[export_format, export_template, project_state], outputs=[export_status, export_file] ) return interface # Launch application if __name__ == "__main__": logger.info("🚀 Starting AI Dataset Studio...") # Check features features = [] if HAS_TRANSFORMERS: features.append("✅ AI Models") else: features.append("⚠️ Basic Processing") if HAS_NLTK: features.append("✅ Advanced NLP") else: features.append("⚠️ Basic NLP") if HAS_DATASETS: features.append("✅ HuggingFace Integration") else: features.append("⚠️ Standard Export") logger.info(f"📊 Features: {' | '.join(features)}") try: # Test DatasetStudio test_studio = DatasetStudio() logger.info("✅ DatasetStudio test passed") interface = create_modern_interface() logger.info("✅ Interface created successfully") interface.launch( server_name="0.0.0.0", server_port=7860, share=False, show_error=True ) except Exception as e: logger.error(f"❌ Failed to launch: {e}") logger.error("💡 Try: python app_minimal.py") raise