Spaces:

MagicMeWizard
/

AI_Powered_Web_Scraper

Running

App Files Community

MagicMeWizard committed on 5 days ago

Commit

6918f0f

verified ·

1 Parent(s): 6b9c591

Update app.py

Browse files

Files changed (1) hide show

app.py +929 -871

app.py CHANGED Viewed

@@ -1,1037 +1,1095 @@
 """
-AI Dataset Studio - Complete Application
-Fixed version with all classes properly defined
 """
 import gradio as gr
 import pandas as pd
-import numpy as np
 import json
 import re
-import requests
-from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
-from datetime import datetime, timedelta
-import logging
-from typing import Dict, List, Tuple, Optional, Any
 from dataclasses import dataclass, asdict
-from pathlib import Path
-import uuid
-import hashlib
-import time
-from collections import defaultdict
-import io
-# Optional imports with fallbacks
 try:
-    from transformers import pipeline, AutoTokenizer, AutoModel
-    HAS_TRANSFORMERS = True
-except ImportError:
-    HAS_TRANSFORMERS = False
 try:
     import nltk
-    from nltk.tokenize import sent_tokenize, word_tokenize
     HAS_NLTK = True
 except ImportError:
     HAS_NLTK = False
 try:
-    from datasets import Dataset, DatasetDict
-    HAS_DATASETS = True
 except ImportError:
-    HAS_DATASETS = False
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# Download NLTK data if available
-if HAS_NLTK:
-    try:
-        nltk.download('punkt', quiet=True)
-        nltk.download('stopwords', quiet=True)
-        nltk.download('averaged_perceptron_tagger', quiet=True)
-    except:
-        pass
-@dataclass
-class ScrapedItem:
-    """Data class for scraped content"""
-    id: str
-    url: str
-    title: str
-    content: str
-    metadata: Dict[str, Any]
-    scraped_at: str
-    word_count: int
-    language: str = "en"
-    quality_score: float = 0.0
-    labels: List[str] = None
-    annotations: Dict[str, Any] = None
-    def __post_init__(self):
-        if self.labels is None:
-            self.labels = []
-        if self.annotations is None:
-            self.annotations = {}
-@dataclass
-class DatasetTemplate:
-    """Template for dataset creation"""
-    name: str
-    description: str
-    task_type: str
-    required_fields: List[str]
-    optional_fields: List[str]
-    example_format: Dict[str, Any]
-    instructions: str
-class SecurityValidator:
-    """Security validation for URLs and content"""
-    ALLOWED_SCHEMES = {'http', 'https'}
-    BLOCKED_DOMAINS = {
-        'localhost', '127.0.0.1', '0.0.0.0',
-        '192.168.', '10.', '172.16.', '172.17.',
-        '172.18.', '172.19.', '172.20.', '172.21.',
-        '172.22.', '172.23.', '172.24.', '172.25.',
-        '172.26.', '172.27.', '172.28.', '172.29.',
-        '172.30.', '172.31.'
-    }
-    @classmethod
-    def validate_url(cls, url: str) -> Tuple[bool, str]:
-        """Validate URL for security concerns"""
         try:
-            parsed = urlparse(url)
-            if parsed.scheme not in cls.ALLOWED_SCHEMES:
-                return False, f"Invalid scheme: {parsed.scheme}"
-            hostname = parsed.hostname or ''
-            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
-                return False, "Access to internal networks not allowed"
-            if not parsed.netloc:
-                return False, "Invalid URL format"
-            return True, "URL is valid"
         except Exception as e:
-            return False, f"URL validation error: {str(e)}"
-class WebScraperEngine:
-    """Advanced web scraping engine"""
-    def __init__(self):
-        self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive',
-        })
-    def scrape_url(self, url: str) -> Optional[ScrapedItem]:
-        """Scrape a single URL"""
         try:
-            # Validate URL
-            is_valid, validation_msg = SecurityValidator.validate_url(url)
-            if not is_valid:
-                raise ValueError(f"Security validation failed: {validation_msg}")
-            # Fetch content
-            response = self.session.get(url, timeout=15)
-            response.raise_for_status()
-            # Parse HTML
-            soup = BeautifulSoup(response.content, 'html.parser')
-            # Extract data
-            title = self._extract_title(soup)
-            content = self._extract_content(soup)
-            metadata = self._extract_metadata(soup, response)
-            # Create item
-            item = ScrapedItem(
-                id=str(uuid.uuid4()),
-                url=url,
-                title=title,
-                content=content,
-                metadata=metadata,
-                scraped_at=datetime.now().isoformat(),
-                word_count=len(content.split()),
-                quality_score=self._assess_quality(content)
             )
-            return item
-        except Exception as e:
-            logger.error(f"Failed to scrape {url}: {e}")
-            return None
-    def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
-        """Scrape multiple URLs"""
-        results = []
-        total = len(urls)
-        for i, url in enumerate(urls):
-            if progress_callback:
-                progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")
-            item = self.scrape_url(url)
-            if item:
-                results.append(item)
-            time.sleep(1)  # Rate limiting
-        return results
-    def _extract_title(self, soup: BeautifulSoup) -> str:
-        """Extract page title"""
-        title_tag = soup.find('title')
-        if title_tag:
-            return title_tag.get_text().strip()
-        h1_tag = soup.find('h1')
-        if h1_tag:
-            return h1_tag.get_text().strip()
-        return "Untitled"
-    def _extract_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content"""
-        # Remove unwanted elements
-        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
-            element.decompose()
-        # Try content selectors
-        content_selectors = [
-            'article', 'main', '.content', '.post-content',
-            '.entry-content', '.article-body'
-        ]
-        for selector in content_selectors:
-            element = soup.select_one(selector)
-            if element:
-                text = element.get_text(separator=' ', strip=True)
-                if len(text) > 200:
-                    return self._clean_text(text)
-        # Fallback to body
-        body = soup.find('body')
-        if body:
-            return self._clean_text(body.get_text(separator=' ', strip=True))
-        return self._clean_text(soup.get_text(separator=' ', strip=True))
-    def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
-        """Extract metadata"""
-        metadata = {
-            'domain': urlparse(response.url).netloc,
-            'status_code': response.status_code,
-            'extracted_at': datetime.now().isoformat()
-        }
-        # Extract meta tags
-        for tag in ['description', 'keywords', 'author']:
-            element = soup.find('meta', attrs={'name': tag})
-            if element:
-                metadata[tag] = element.get('content', '')
-        return metadata
-    def _clean_text(self, text: str) -> str:
-        """Clean extracted text"""
-        text = re.sub(r'\s+', ' ', text)
-        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
-        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
-        return text.strip()
-    def _assess_quality(self, content: str) -> float:
-        """Assess content quality"""
-        if not content:
-            return 0.0
-        score = 0.0
-        word_count = len(content.split())
-        if word_count >= 50:
-            score += 0.4
-        elif word_count >= 20:
-            score += 0.2
-        sentence_count = len(re.split(r'[.!?]+', content))
-        if sentence_count >= 3:
-            score += 0.3
-        if re.search(r'[A-Z][a-z]+', content):
-            score += 0.3
-        return min(score, 1.0)
-class DataProcessor:
-    """Data processing pipeline"""
-    def __init__(self):
-        self.sentiment_analyzer = None
-        self.ner_model = None
-        self._load_models()
-    def _load_models(self):
-        """Load NLP models"""
-        if not HAS_TRANSFORMERS:
-            logger.warning("⚠️ Transformers not available")
-            return
-        try:
-            self.sentiment_analyzer = pipeline(
-                "sentiment-analysis",
-                model="cardiffnlp/twitter-roberta-base-sentiment-latest"
-            )
-            logger.info("✅ Sentiment model loaded")
-        except Exception as e:
-            logger.warning(f"⚠️ Could not load sentiment model: {e}")
-    def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]:
-        """Process scraped items"""
-        processed = []
-        for item in items:
             try:
-                # Clean text
-                if options.get('clean_text', True):
-                    item.content = self._clean_text_advanced(item.content)
-                # Quality filter
-                if options.get('quality_filter', True) and item.quality_score < 0.3:
-                    continue
-                # Add sentiment
-                if options.get('add_sentiment', False) and self.sentiment_analyzer:
-                    sentiment = self._analyze_sentiment(item.content)
-                    item.metadata['sentiment'] = sentiment
-                # Language detection
-                if options.get('detect_language', True):
-                    item.language = self._detect_language(item.content)
-                processed.append(item)
             except Exception as e:
-                logger.error(f"Error processing item {item.id}: {e}")
                 continue
-        return processed
-    def _clean_text_advanced(self, text: str) -> str:
-        """Advanced text cleaning"""
-        text = re.sub(r'http\S+|www\.\S+', '', text)
-        text = re.sub(r'\S+@\S+', '', text)
-        text = re.sub(r'\s+', ' ', text)
-        return text.strip()
-    def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
-        """Analyze sentiment"""
-        try:
-            text_sample = text[:512]
-            result = self.sentiment_analyzer(text_sample)[0]
-            return {
-                'label': result['label'],
-                'score': result['score']
-            }
-        except:
-            return {'label': 'UNKNOWN', 'score': 0.0}
-    def _detect_language(self, text: str) -> str:
-        """Simple language detection"""
-        if re.search(r'[а-яё]', text.lower()):
-            return 'ru'
-        elif re.search(r'[ñáéíóúü]', text.lower()):
-            return 'es'
-        return 'en'
-class AnnotationEngine:
-    """Annotation tools for dataset creation"""
-    def __init__(self):
-        self.templates = self._load_templates()
-    def _load_templates(self) -> Dict[str, DatasetTemplate]:
-        """Load dataset templates"""
-        templates = {
-            'text_classification': DatasetTemplate(
-                name="Text Classification",
-                description="Classify text into categories",
-                task_type="classification",
-                required_fields=["text", "label"],
-                optional_fields=["confidence", "metadata"],
-                example_format={"text": "Sample text", "label": "positive"},
-                instructions="Label each text with appropriate category"
-            ),
-            'sentiment_analysis': DatasetTemplate(
-                name="Sentiment Analysis",
-                description="Analyze emotional tone",
-                task_type="classification",
-                required_fields=["text", "sentiment"],
-                optional_fields=["confidence", "aspects"],
-                example_format={"text": "I love this!", "sentiment": "positive"},
-                instructions="Classify sentiment as positive, negative, or neutral"
-            ),
-            'named_entity_recognition': DatasetTemplate(
-                name="Named Entity Recognition",
-                description="Identify named entities",
-                task_type="ner",
-                required_fields=["text", "entities"],
-                optional_fields=["metadata"],
-                example_format={
-                    "text": "John works at OpenAI",
-                    "entities": [{"text": "John", "label": "PERSON"}]
-                },
-                instructions="Mark all named entities"
-            ),
-            'question_answering': DatasetTemplate(
-                name="Question Answering",
-                description="Create Q&A pairs",
-                task_type="qa",
-                required_fields=["context", "question", "answer"],
-                optional_fields=["answer_start", "metadata"],
-                example_format={
-                    "context": "The capital of France is Paris.",
-                    "question": "What is the capital of France?",
-                    "answer": "Paris"
-                },
-                instructions="Create meaningful questions and answers"
-            ),
-            'summarization': DatasetTemplate(
-                name="Text Summarization",
-                description="Create summaries",
-                task_type="summarization",
-                required_fields=["text", "summary"],
-                optional_fields=["summary_type", "length"],
-                example_format={
-                    "text": "Long article text...",
-                    "summary": "Brief summary"
-                },
-                instructions="Write clear, concise summaries"
-            )
-        }
-        return templates
-class DatasetExporter:
-    """Export datasets in various formats"""
-    def __init__(self):
-        self.supported_formats = [
-            'json', 'csv', 'jsonl', 'huggingface_datasets'
-        ]
-    def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
-                      export_format: str, annotations: Dict[str, Any] = None) -> str:
-        """Export dataset"""
-        try:
-            dataset_data = self._prepare_data(items, template, annotations)
-            if export_format == 'json':
-                return self._export_json(dataset_data)
-            elif export_format == 'csv':
-                return self._export_csv(dataset_data)
-            elif export_format == 'jsonl':
-                return self._export_jsonl(dataset_data)
-            elif export_format == 'huggingface_datasets':
-                return self._export_huggingface(dataset_data, template)
             else:
-                raise ValueError(f"Unsupported format: {export_format}")
-        except Exception as e:
-            logger.error(f"Export failed: {e}")
-            raise
-    def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate,
-                     annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
-        """Prepare data according to template"""
-        dataset_data = []
-        for item in items:
-            data_point = {
-                'text': item.content,
-                'title': item.title,
-                'url': item.url,
-                'metadata': item.metadata
-            }
-            if annotations and item.id in annotations:
-                data_point.update(annotations[item.id])
-            formatted = self._format_for_template(data_point, template)
-            if formatted:
-                dataset_data.append(formatted)
-        return dataset_data
-    def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]:
-        """Format data according to template"""
-        formatted = {}
-        for field in template.required_fields:
-            if field in data_point:
-                formatted[field] = data_point[field]
-            elif field == 'text' and 'content' in data_point:
-                formatted[field] = data_point['content']
-            else:
-                return None
-        for field in template.optional_fields:
-            if field in data_point:
-                formatted[field] = data_point[field]
-        return formatted
-    def _export_json(self, data: List[Dict[str, Any]]) -> str:
-        """Export as JSON"""
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"dataset_{timestamp}.json"
-        with open(filename, 'w', encoding='utf-8') as f:
-            json.dump(data, f, indent=2, ensure_ascii=False)
-        return filename
-    def _export_csv(self, data: List[Dict[str, Any]]) -> str:
-        """Export as CSV"""
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"dataset_{timestamp}.csv"
-        df = pd.DataFrame(data)
-        df.to_csv(filename, index=False)
-        return filename
-    def _export_jsonl(self, data: List[Dict[str, Any]]) -> str:
-        """Export as JSONL"""
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"dataset_{timestamp}.jsonl"
-        with open(filename, 'w', encoding='utf-8') as f:
-            for item in data:
-                f.write(json.dumps(item, ensure_ascii=False) + '\n')
-        return filename
-    def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
-        """Export as HuggingFace Dataset"""
-        if not HAS_DATASETS:
-            raise ImportError("datasets library not available")
-        dataset = Dataset.from_list(data)
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
-        dataset.save_to_disk(dataset_name)
-        return dataset_name
-class DatasetStudio:
-    """Main application orchestrator"""
-    def __init__(self):
-        self.scraper = WebScraperEngine()
-        self.processor = DataProcessor()
-        self.annotator = AnnotationEngine()
-        self.exporter = DatasetExporter()
-        # Application state
-        self.scraped_items = []
-        self.processed_items = []
-        self.current_project = None
-        self.annotation_state = {}
-        logger.info("✅ DatasetStudio initialized successfully")
-    def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]:
-        """Start new project"""
-        self.current_project = {
-            'name': project_name,
-            'template': template_type,
-            'created_at': datetime.now().isoformat(),
-            'id': str(uuid.uuid4())
-        }
-        self.scraped_items = []
-        self.processed_items = []
-        self.annotation_state = {}
-        logger.info(f"📋 New project: {project_name}")
-        return self.current_project
-    def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
-        """Scrape URLs"""
-        url_list = [url.strip() for url in urls if url.strip()]
-        if not url_list:
-            return 0, ["No valid URLs provided"]
-        logger.info(f"🕷️ Scraping {len(url_list)} URLs")
-        self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback)
-        success = len(self.scraped_items)
-        failed = len(url_list) - success
-        errors = []
-        if failed > 0:
-            errors.append(f"{failed} URLs failed")
-        logger.info(f"✅ Scraped {success}, failed {failed}")
-        return success, errors
-    def process_data(self, options: Dict[str, bool]) -> int:
-        """Process scraped data"""
-        if not self.scraped_items:
-            return 0
-        logger.info(f"⚙️ Processing {len(self.scraped_items)} items")
-        self.processed_items = self.processor.process_items(self.scraped_items, options)
-        logger.info(f"✅ Processed {len(self.processed_items)} items")
-        return len(self.processed_items)
-    def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]:
-        """Get data preview"""
-        items = self.processed_items or self.scraped_items
-        preview = []
-        for item in items[:num_items]:
-            preview.append({
-                'title': item.title,
-                'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content,
-                'word_count': item.word_count,
-                'quality_score': round(item.quality_score, 2),
-                'url': item.url
-            })
-        return preview
-    def get_data_statistics(self) -> Dict[str, Any]:
-        """Get dataset statistics"""
-        items = self.processed_items or self.scraped_items
-        if not items:
-            return {}
-        word_counts = [item.word_count for item in items]
-        quality_scores = [item.quality_score for item in items]
-        return {
-            'total_items': len(items),
-            'avg_word_count': round(np.mean(word_counts)),
-            'avg_quality_score': round(np.mean(quality_scores), 2),
-            'word_count_range': [min(word_counts), max(word_counts)],
-            'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)],
-            'languages': list(set(item.language for item in items)),
-            'domains': list(set(urlparse(item.url).netloc for item in items))
-        }
-    def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str:
-        """Export dataset"""
-        if not self.processed_items and not self.scraped_items:
-            raise ValueError("No data to export")
-        items = self.processed_items or self.scraped_items
-        template = self.annotator.templates.get(template_name)
-        if not template:
-            raise ValueError(f"Unknown template: {template_name}")
-        logger.info(f"📤 Exporting {len(items)} items")
-        return self.exporter.export_dataset(items, template, export_format, annotations)
 def create_modern_interface():
     """Create the modern Gradio interface"""
-    # Initialize studio
     studio = DatasetStudio()
-    # Custom CSS
-    css = """
-    .gradio-container { max-width: 1400px; margin: auto; }
-    .studio-header {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        color: white; padding: 2rem; border-radius: 15px;
-        margin-bottom: 2rem; text-align: center;
     }
-    .workflow-card {
-        background: #f8f9ff; border: 2px solid #e1e5ff;
-        border-radius: 12px; padding: 1.5rem; margin: 1rem 0;
     }
     .step-header {
-        font-size: 1.2em; font-weight: 600; color: #4c51bf;
-        margin-bottom: 1rem;
     }
-    """
-    project_state = gr.State({})
-    with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
-        # Header
         gr.HTML("""
-        <div class="studio-header">
             <h1>🚀 AI Dataset Studio</h1>
-            <p>Create high-quality training datasets without coding</p>
         </div>
         """)
-        with gr.Tabs() as main_tabs:
-            # Project Setup
-            with gr.Tab("🎯 Project Setup"):
-                gr.HTML('<div class="step-header">Step 1: Create Your Project</div>')
                 with gr.Row():
                     with gr.Column(scale=2):
                         project_name = gr.Textbox(
-                            label="Project Name",
-                            placeholder="My Dataset Project",
-                            value="News Analysis Dataset"
                         )
-                        template_choice = gr.Radio(
-                            choices=[
-                                ("📊 Text Classification", "text_classification"),
-                                ("😊 Sentiment Analysis", "sentiment_analysis"),
-                                ("👥 Named Entity Recognition", "named_entity_recognition"),
-                                ("❓ Question Answering", "question_answering"),
-                                ("📝 Text Summarization", "summarization")
-                            ],
-                            label="Dataset Type",
-                            value="text_classification"
                         )
-                        create_project_btn = gr.Button("🚀 Create Project", variant="primary")
-                        project_status = gr.Markdown("")
                     with gr.Column(scale=1):
-                        gr.HTML("""
-                        <div class="workflow-card">
-                            <h3>💡 Template Guide</h3>
-                            <p><strong>Text Classification:</strong> Categorize content</p>
-                            <p><strong>Sentiment Analysis:</strong> Analyze emotions</p>
-                            <p><strong>Named Entity Recognition:</strong> Identify entities</p>
-                            <p><strong>Question Answering:</strong> Create Q&A pairs</p>
-                            <p><strong>Summarization:</strong> Generate summaries</p>
-                        </div>
-                        """)
-            # Data Collection
-            with gr.Tab("🕷️ Data Collection"):
-                gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>')
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        urls_input = gr.Textbox(
-                            label="URLs to Scrape (one per line)",
-                            placeholder="https://example.com/article1\nhttps://example.com/article2",
-                            lines=8
-                        )
-                        scrape_btn = gr.Button("🚀 Start Scraping", variant="primary")
-                        scraping_status = gr.Markdown("")
-                    with gr.Column(scale=1):
-                        collection_stats = gr.HTML("")
-            # Data Processing
-            with gr.Tab("⚙️ Data Processing"):
-                gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>')
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        with gr.Row():
-                            with gr.Column():
-                                clean_text = gr.Checkbox(label="🧹 Text Cleaning", value=True)
-                                quality_filter = gr.Checkbox(label="🎯 Quality Filter", value=True)
-                                detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)
-                            with gr.Column():
-                                add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
-                                extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
-                        process_btn = gr.Button("⚙️ Process Data", variant="primary")
-                        processing_status = gr.Markdown("")
-                    with gr.Column(scale=1):
-                        processing_stats = gr.HTML("")
-            # Data Preview
-            with gr.Tab("👀 Data Preview"):
-                gr.HTML('<div class="step-header">Step 4: Review Dataset</div>')
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        refresh_btn = gr.Button("🔄 Refresh Preview", variant="secondary")
-                        data_preview = gr.DataFrame(
-                            headers=["Title", "Content Preview", "Words", "Quality", "URL"],
-                            label="Dataset Preview"
-                        )
-                    with gr.Column(scale=1):
-                        dataset_stats = gr.JSON(label="Statistics")
-            # Export
-            with gr.Tab("📤 Export Dataset"):
-                gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>')
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        export_format = gr.Radio(
-                            choices=[
-                                ("📄 JSON", "json"),
-                                ("📊 CSV", "csv"),
-                                ("📋 JSONL", "jsonl"),
-                                ("🤗 HuggingFace", "huggingface_datasets")
-                            ],
-                            label="Export Format",
-                            value="json"
-                        )
-                        export_template = gr.Dropdown(
-                            choices=[
-                                "text_classification",
-                                "sentiment_analysis",
-                                "named_entity_recognition",
-                                "question_answering",
-                                "summarization"
-                            ],
-                            label="Template",
-                            value="text_classification"
                         )
-                        export_btn = gr.Button("📤 Export Dataset", variant="primary")
-                        export_status = gr.Markdown("")
-                        export_file = gr.File(label="Download", visible=False)
-                    with gr.Column(scale=1):
-                        gr.HTML("""
-                        <div class="workflow-card">
-                            <h3>📋 Export Info</h3>
-                            <p><strong>JSON:</strong> Universal format</p>
-                            <p><strong>CSV:</strong> Excel compatible</p>
-                            <p><strong>JSONL:</strong> Line-separated</p>
-                            <p><strong>HuggingFace:</strong> ML ready</p>
-                        </div>
-                        """)
-        # Event handlers
-        def create_project(name, template):
-            if not name.strip():
-                return "❌ Please enter a project name", {}
-            project = studio.start_new_project(name.strip(), template)
-            status = f"""
-            ✅ **Project Created!**
-            **Name:** {project['name']}
-            **Type:** {template.replace('_', ' ').title()}
-            **ID:** {project['id'][:8]}...
-            👉 Next: Go to Data Collection tab
-            """
-            return status, project
-        def scrape_urls_handler(urls_text, project, progress=gr.Progress()):
-            if not project:
-                return "❌ Create a project first", ""
-            urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
-            if not urls:
-                return "❌ No URLs provided", ""
-            def progress_callback(pct, msg):
-                progress(pct, desc=msg)
-            success, errors = studio.scrape_urls(urls, progress_callback)
-            if success > 0:
-                stats = f"""
-                <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
-                    <h3>✅ Scraping Complete</h3>
-                    <p><strong>{success}</strong> items collected</p>
-                </div>
-                """
-                status = f"""
-                ✅ **Scraping Complete!**
-                **Success:** {success} URLs
-                **Failed:** {len(urls) - success} URLs
-                👉 Next: Go to Data Processing tab
-                """
-                return status, stats
-            else:
-                return f"❌ Scraping failed: {', '.join(errors)}", ""
-        def process_data_handler(clean, quality, language, sentiment, entities, project):
-            """Run ``studio.process_data`` with the selected options and report the result.
-            Returns ``(status_markdown, stats_html)``; ``stats_html`` is empty on failure.
-            """
-            if not project:
-                return "❌ Create a project first", ""
-            if not studio.scraped_items:
-                return "❌ No data to process. Scrape URLs first.", ""
-            # Checkbox values are forwarded under the option names the studio expects.
-            options = dict(
-                clean_text=clean,
-                quality_filter=quality,
-                detect_language=language,
-                add_sentiment=sentiment,
-                extract_entities=entities,
-            )
-            processed = studio.process_data(options)
-            if not processed > 0:
-                return "❌ No items passed filters", ""
-            stats = studio.get_data_statistics()
-            stats_html = f"""
-                <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;">
-                    <h3>⚙️ Processing Complete</h3>
-                    <p><strong>{processed}</strong> items processed</p>
-                    <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
-                </div>
-                """
-            status = f"""
-                ✅ **Processing Complete!**
-                **Processed:** {processed} items
-                **Avg Quality:** {stats.get('avg_quality_score', 0)}
-                👉 Next: Check Data Preview tab
-                """
-            return status, stats_html
-        def refresh_preview_handler(project):
-            """Return ``(table_rows, stats)`` for the preview, or ``(None, {})`` when there is nothing to show."""
-            if not project:
-                return None, {}
-            preview = studio.get_data_preview()
-            stats = studio.get_data_statistics()
-            if not preview:
-                return None, {}
-            def shorten(text):
-                # Clip long titles/URLs to 50 characters plus an ellipsis.
-                return text[:50] + "..." if len(text) > 50 else text
-            rows = [
-                [
-                    shorten(entry['title']),
-                    entry['content_preview'],
-                    entry['word_count'],
-                    entry['quality_score'],
-                    shorten(entry['url']),
-                ]
-                for entry in preview
-            ]
-            return rows, stats
-        def export_handler(format_type, template, project):
-            """Export the dataset and return ``(status_markdown, file_name_or_None)``."""
-            if not project:
-                return "❌ Create a project first", None
-            # Either processed or raw scraped items are enough to export.
-            if not (studio.processed_items or studio.scraped_items):
-                return "❌ No data to export", None
-            try:
-                exported = studio.export_dataset(template, format_type)
-                message = f"""
-                ✅ **Export Successful!**
-                **Format:** {format_type}
-                **File:** {exported}
-                📥 Download link below
-                """
-                return message, exported
-            except Exception as exc:
-                return f"❌ Export failed: {str(exc)}", None
-        # Connect events
         create_project_btn.click(
-            fn=create_project,
-            inputs=[project_name, template_choice],
-            outputs=[project_status, project_state]
         )
         scrape_btn.click(
-            fn=scrape_urls_handler,
-            inputs=[urls_input, project_state],
-            outputs=[scraping_status, collection_stats]
         )
         process_btn.click(
-            fn=process_data_handler,
-            inputs=[clean_text, quality_filter, detect_language,
-                   add_sentiment, extract_entities, project_state],
-            outputs=[processing_status, processing_stats]
-        )
-        refresh_btn.click(
-            fn=refresh_preview_handler,
-            inputs=[project_state],
-            outputs=[data_preview, dataset_stats]
         )
         export_btn.click(
-            fn=export_handler,
-            inputs=[export_format, export_template, project_state],
-            outputs=[export_status, export_file]
         )
     return interface
-# Launch application
-# Script entry point: logs which optional features are available, constructs a
-# DatasetStudio once as a smoke test, then builds and launches the interface.
-if __name__ == "__main__":
     logger.info("🚀 Starting AI Dataset Studio...")
-    # Check features
-    # Each optional-dependency flag contributes one label to the startup log line.
-    features = []
-    if HAS_TRANSFORMERS:
-        features.append("✅ AI Models")
-    else:
-        features.append("⚠️ Basic Processing")
-    if HAS_NLTK:
-        features.append("✅ Advanced NLP")
-    else:
-        features.append("⚠️ Basic NLP")
-    if HAS_DATASETS:
-        features.append("✅ HuggingFace Integration")
-    else:
-        features.append("⚠️ Standard Export")
-    logger.info(f"📊 Features: {' | '.join(features)}")
-    try:
-        # Test DatasetStudio
-        # The instance is only constructed, never used: an initialisation error
-        # surfaces here, before the interface is built.
-        test_studio = DatasetStudio()
-        logger.info("✅ DatasetStudio test passed")
-        interface = create_modern_interface()
-        logger.info("✅ Interface created successfully")
         interface.launch(
             server_name="0.0.0.0",
             server_port=7860,
             share=False,
             show_error=True
         )
-    except Exception as e:
-        logger.error(f"❌ Failed to launch: {e}")
-        logger.error("💡 Try: python app_minimal.py")
-        # Re-raised after logging so the failure is not swallowed.
-        raise

 """
+🚀 AI Dataset Studio with Perplexity AI Integration
+A comprehensive platform for creating high-quality training datasets using AI-powered source discovery
 """
 import gradio as gr
 import pandas as pd
+import requests
 import json
+import logging
+import os
+import sys
+import time
 import re
+from datetime import datetime
+from typing import List, Dict, Optional, Tuple, Any
 from urllib.parse import urlparse, urljoin
 from dataclasses import dataclass, asdict
+import traceback
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# Try to import required packages with fallbacks
 try:
+    from bs4 import BeautifulSoup
+    logger.info("✅ BeautifulSoup imported successfully")
+except ImportError as e:
+    logger.error("❌ Failed to import BeautifulSoup: %s", e)
+    sys.exit(1)
 try:
     import nltk
+    from nltk.corpus import stopwords
+    from nltk.tokenize import word_tokenize, sent_tokenize
+    logger.info("✅ NLTK imported successfully")
     HAS_NLTK = True
 except ImportError:
+    logger.warning("⚠️ NLTK not available - using basic text processing")
     HAS_NLTK = False
 try:
+    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+    import torch
+    logger.info("✅ Transformers imported successfully")
+    HAS_TRANSFORMERS = True
 except ImportError:
+    logger.warning("⚠️ Transformers not available - using extractive summaries")
+    HAS_TRANSFORMERS = False
+# Import Perplexity client
+try:
+    from perplexity_client import PerplexityClient, SearchType, SourceResult, SearchResults
+    logger.info("✅ Perplexity client imported successfully")
+    HAS_PERPLEXITY = True
+except ImportError:
+    logger.warning("⚠️ Perplexity client not available - manual source entry only")
+    HAS_PERPLEXITY = False
+# Dataset templates
+# Maps a template key to its metadata: "name" (display label shown in status
+# messages), "description", "fields" (record keys listed in the processing
+# status), "example" (one sample record) and "search_queries" (query phrases;
+# presumably used for source discovery - usage is not visible in this part of the file).
+DATASET_TEMPLATES = {
+    "sentiment_analysis": {
+        "name": "📊 Sentiment Analysis",
+        "description": "Classify text as positive, negative, or neutral",
+        "fields": ["text", "sentiment"],
+        "example": {"text": "This product is amazing!", "sentiment": "positive"},
+        "search_queries": ["product reviews", "customer feedback", "social media posts", "movie reviews"]
+    },
+    "text_classification": {
+        "name": "📂 Text Classification",
+        "description": "Categorize text into predefined classes",
+        "fields": ["text", "category"],
+        "example": {"text": "Breaking: Stock market reaches new high", "category": "finance"},
+        "search_queries": ["news articles", "blog posts", "academic papers", "forum discussions"]
+    },
+    "named_entity_recognition": {
+        "name": "🏷️ Named Entity Recognition",
+        "description": "Identify people, places, organizations in text",
+        "fields": ["text", "entities"],
+        "example": {"text": "Apple Inc. was founded by Steve Jobs in California",
+                   "entities": [{"text": "Apple Inc.", "label": "ORG"}, {"text": "Steve Jobs", "label": "PERSON"}]},
+        "search_queries": ["news articles", "biographies", "company reports", "wikipedia articles"]
+    },
+    "question_answering": {
+        "name": "❓ Question Answering",
+        "description": "Extract answers from context passages",
+        "fields": ["context", "question", "answer"],
+        "example": {"context": "The capital of France is Paris", "question": "What is the capital of France?", "answer": "Paris"},
+        "search_queries": ["FAQ pages", "educational content", "interview transcripts", "knowledge bases"]
+    },
+    "text_summarization": {
+        "name": "📝 Text Summarization",
+        "description": "Generate concise summaries of longer texts",
+        "fields": ["text", "summary"],
+        "example": {"text": "Long article content...", "summary": "Brief summary of key points"},
+        "search_queries": ["news articles", "research papers", "blog posts", "reports"]
+    },
+    "translation": {
+        "name": "🌐 Translation",
+        "description": "Translate text between languages",
+        "fields": ["source_text", "target_text", "source_lang", "target_lang"],
+        "example": {"source_text": "Hello world", "target_text": "Hola mundo", "source_lang": "en", "target_lang": "es"},
+        "search_queries": ["multilingual websites", "international news", "translation datasets", "parallel corpora"]
+    }
+}
+class DatasetStudio:
+    """
+    🎯 Main Dataset Studio Class
+    Handles all core functionality for dataset creation
+    """
+    def __init__(self):
+        """Set up empty project state, the optional Perplexity client and the AI models."""
+        logger.info("🚀 Initializing AI Dataset Studio...")
+        # Project bookkeeping and the buffers shared by the scrape/process/export steps
+        self.projects, self.current_project = {}, None
+        self.scraped_data, self.processed_data = [], []
+        # Model handles stay None unless _load_models() manages to populate them
+        self.sentiment_analyzer = self.summarizer = self.ner_model = None
+        # Perplexity is optional: it needs both the client module and an API key
+        self.perplexity_client = None
+        if HAS_PERPLEXITY:
+            try:
+                api_key = os.environ.get('PERPLEXITY_API_KEY')
+                if not api_key:
+                    logger.warning("⚠️ PERPLEXITY_API_KEY not found - manual source entry only")
+                else:
+                    self.perplexity_client = PerplexityClient(api_key)
+                    logger.info("✅ Perplexity AI client initialized")
+            except Exception as e:
+                logger.error(f"❌ Failed to initialize Perplexity client: {e}")
+        self._load_models()
+        logger.info("✅ Dataset Studio initialized successfully")
+    def _load_models(self):
+        """Try to build the sentiment, summarization and NER pipelines.
+        Each model is loaded independently: a failure is logged as a warning and
+        the corresponding attribute is not assigned. Nothing is loaded when the
+        transformers package is unavailable.
+        """
+        if not HAS_TRANSFORMERS:
+            logger.info("⚠️ Skipping model loading - transformers not available")
+            return
+        # (attribute, start message, success message, failure label, task, pipeline options)
+        model_specs = (
+            (
+                "sentiment_analyzer",
+                "📦 Loading sentiment analysis model...",
+                "✅ Sentiment analyzer loaded",
+                "sentiment analyzer",
+                "sentiment-analysis",
+                {"model": "cardiffnlp/twitter-roberta-base-sentiment-latest", "return_all_scores": True},
+            ),
+            (
+                "summarizer",
+                "📦 Loading summarization model...",
+                "✅ Summarizer loaded",
+                "summarizer",
+                "summarization",
+                {"model": "facebook/bart-large-cnn", "max_length": 150, "min_length": 30, "do_sample": False},
+            ),
+            (
+                "ner_model",
+                "📦 Loading NER model...",
+                "✅ NER model loaded",
+                "NER model",
+                "ner",
+                {"model": "dbmdz/bert-large-cased-finetuned-conll03-english", "aggregation_strategy": "simple"},
+            ),
+        )
+        for attribute, start_msg, done_msg, failure_label, task, options in model_specs:
+            try:
+                logger.info(start_msg)
+                setattr(self, attribute, pipeline(task, **options))
+                logger.info(done_msg)
+            except Exception as e:
+                logger.warning(f"⚠️ Could not load {failure_label}: {e}")
+    def discover_sources_with_ai(
+        self,
+        project_description: str,
+        max_sources: int = 20,
+        search_type: str = "general",
+        include_academic: bool = True,
+        include_news: bool = True
+    ) -> Tuple[str, str]:
+        """
+        🧠 Ask the Perplexity client for candidate data sources
+        Args:
+            project_description: Description of the dataset project
+            max_sources: Maximum number of sources to find
+            search_type: Name of a SearchType member (case-insensitive); unknown names fall back to GENERAL
+            include_academic: Include academic sources
+            include_news: Include news sources
+        Returns:
+            Tuple of (status_message, sources_json); sources_json is "[]" when nothing was found or on error
+        """
+        client = self.perplexity_client
+        if not client:
+            return "❌ Perplexity AI not available. Please set PERPLEXITY_API_KEY environment variable.", "[]"
+        try:
+            logger.info(f"🔍 Discovering sources for: {project_description}")
+            search_kind = getattr(SearchType, search_type.upper(), SearchType.GENERAL)
+            results = client.discover_sources(
+                project_description=project_description,
+                search_type=search_kind,
+                max_sources=max_sources,
+                include_academic=include_academic,
+                include_news=include_news
+            )
+            if not results.sources:
+                return "⚠️ No sources found. Try adjusting your search terms.", "[]"
+            # One display row per discovered source
+            rows = [
+                {
+                    "URL": found.url,
+                    "Title": found.title,
+                    "Description": found.description,
+                    "Type": found.source_type,
+                    "Domain": found.domain,
+                    "Quality Score": f"{found.relevance_score:.1f}/10"
+                }
+                for found in results.sources
+            ]
+            message = f"✅ Found {len(results.sources)} sources in {results.search_time:.1f}s"
+            if results.suggestions:
+                message += f"\n💡 Suggestions: {', '.join(results.suggestions[:3])}"
+            return message, json.dumps(rows, indent=2)
+        except Exception as e:
+            logger.error(f"❌ Error discovering sources: {e}")
+            return f"❌ Error: {str(e)}", "[]"
+    def extract_urls_from_sources(self, sources_json: str) -> List[str]:
+        """Extract the non-empty "URL" values from a discovered-sources JSON string.
+        Args:
+            sources_json: JSON text expected to hold a list of objects with a "URL" key
+        Returns:
+            The URLs in input order; an empty list when the text is not valid JSON,
+            is not a list, or contains entries that are not objects
+        """
+        try:
+            sources = json.loads(sources_json)
+            if not isinstance(sources, list):
+                return []
+            return [source["URL"] for source in sources if source.get("URL")]
+        except (ValueError, TypeError, AttributeError):
+            # ValueError: malformed JSON; TypeError: non-string input;
+            # AttributeError: a list entry that is not a dict. Anything else
+            # (e.g. KeyboardInterrupt) is deliberately not swallowed.
+            return []
+    def create_project(self, name: str, template: str, description: str) -> str:
+        """Create a new dataset project and make it the current one.
+        Args:
+            name: Project name; must contain at least one non-whitespace character
+            template: Key into DATASET_TEMPLATES (unknown keys are stored as given)
+            description: Free-text description stored with the project
+        Returns:
+            A status message describing the new project, or an error message
+            when no name was provided
+        """
+        if not name or not name.strip():
+            return "❌ Please provide a project name"
+        # The id is derived from the clock in whole seconds, so two projects
+        # created within the same second would otherwise overwrite each other.
+        base_id = f"project_{int(time.time())}"
+        project_id = base_id
+        duplicate = 1
+        while project_id in self.projects:
+            duplicate += 1
+            project_id = f"{base_id}_{duplicate}"
+        self.projects[project_id] = {
+            "name": name,
+            "template": template,
+            "description": description,
+            "created_at": datetime.now().isoformat(),
+            "urls": [],
+            "data": [],
+            "processed_data": []
+        }
+        self.current_project = project_id
+        template_info = DATASET_TEMPLATES.get(template, {})
+        status = f"✅ Project '{name}' created successfully!\n"
+        status += f"📋 Template: {template_info.get('name', template)}\n"
+        status += f"📝 Description: {description}\n"
+        status += f"🆔 Project ID: {project_id}"
+        return status
+    def scrape_urls(self, urls_text: str, progress=gr.Progress()) -> Tuple[str, str]:
+        """Fetch every valid URL listed in *urls_text* (one per line) and keep its text.
+        The results replace the current project's 'urls' and 'data' entries and
+        self.scraped_data. Returns (status_message, preview_json), where the
+        preview covers at most the first 10 scraped pages.
+        """
+        if not self.current_project:
+            return "❌ Please create a project first", ""
+        # Parse URLs
+        candidates = (line.strip() for line in urls_text.strip().split('\n'))
+        urls = [candidate for candidate in candidates if candidate and self._is_valid_url(candidate)]
+        if not urls:
+            return "❌ No valid URLs found", ""
+        pages = []
+        failures = []
+        total = len(urls)
+        progress(0, desc="Starting scraping...")
+        for position, url in enumerate(urls, start=1):
+            try:
+                progress(position / total, desc=f"Scraping {position}/{total}")
+                logger.info(f"🔍 Scraping: {url}")
+                response = requests.get(
+                    url,
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+                    },
+                    timeout=10,
+                )
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+                title = self._extract_title(soup)
+                content = self._extract_content(soup)
+                if not content:
+                    # A page without extractable text counts as a failure
+                    failures.append(url)
+                    logger.warning(f"⚠️ No content extracted from {url}")
+                else:
+                    pages.append({
+                        'url': url,
+                        'title': title,
+                        'content': content,
+                        'length': len(content),
+                        'scraped_at': datetime.now().isoformat()
+                    })
+                    logger.info(f"✅ Scraped {len(content)} characters from {url}")
+                # Rate limiting
+                time.sleep(0.5)
+            except Exception as e:
+                failures.append(url)
+                logger.error(f"❌ Failed to scrape {url}: {e}")
+        # Store results
+        project = self.projects[self.current_project]
+        project['urls'] = urls
+        project['data'] = pages
+        self.scraped_data = pages
+        # Create status message
+        total_chars = sum(page['length'] for page in pages)
+        status = "\n".join([
+            "✅ Scraping completed!",
+            f"📊 Successfully scraped: {len(pages)} URLs",
+            f"❌ Failed: {len(failures)} URLs",
+            f"📝 Total content: {total_chars:,} characters",
+        ])
+        if failures:
+            status += "\n\nFailed URLs:\n" + "\n".join(f"• {bad}" for bad in failures[:5])
+            if len(failures) > 5:
+                status += f"\n... and {len(failures) - 5} more"
+        # Create preview data (first 10 pages)
+        preview = [
+            {
+                "Title": page['title'][:50] + "..." if len(page['title']) > 50 else page['title'],
+                "URL": page['url'],
+                "Length": f"{page['length']:,} chars",
+                "Preview": page['content'][:100] + "..." if len(page['content']) > 100 else page['content']
+            }
+            for page in pages[:10]
+        ]
+        return status, json.dumps(preview, indent=2)
+    def process_data(self, template: str, progress=gr.Progress()) -> Tuple[str, str]:
+        """Convert self.scraped_data into training examples for *template*.
+        Items whose processing raises are logged and skipped. The examples are
+        stored on self.processed_data and, when a project is active, on that
+        project. Returns (status_message, preview_json) with up to 10 examples.
+        """
+        if not self.scraped_data:
+            return "❌ No scraped data available. Please scrape URLs first.", ""
+        template_config = DATASET_TEMPLATES.get(template, {})
+        if not template_config:
+            return f"❌ Unknown template: {template}", ""
+        # Template key -> per-item processor; anything else uses the generic one
+        processors = {
+            "sentiment_analysis": self._process_sentiment_analysis,
+            "text_classification": self._process_text_classification,
+            "named_entity_recognition": self._process_ner,
+            "question_answering": self._process_qa,
+            "text_summarization": self._process_summarization,
+            "translation": self._process_translation,
+        }
+        processor = processors.get(template, self._process_generic)
+        examples = []
+        total = len(self.scraped_data)
+        progress(0, desc="Starting data processing...")
+        for index, item in enumerate(self.scraped_data):
+            try:
+                progress((index + 1) / total, desc=f"Processing {index + 1}/{total}")
+                # Read up front so an item without 'content' is skipped via the handler below
+                _ = item['content']
+                produced = processor(item)
+                if produced:
+                    examples.extend(produced)
+            except Exception as e:
+                logger.error(f"❌ Error processing item {index}: {e}")
+                continue
+        # Store processed data
+        self.processed_data = examples
+        if self.current_project:
+            self.projects[self.current_project]['processed_data'] = examples
+        # Create status
+        status = "\n".join([
+            "✅ Processing completed!",
+            f"📊 Generated {len(examples)} training examples",
+            f"📋 Template: {template_config['name']}",
+            f"🏷️ Fields: {', '.join(template_config['fields'])}",
+        ])
+        # Create preview
+        preview = examples[:10] if examples else []
+        return status, json.dumps(preview, indent=2)
+    def _process_sentiment_analysis(self, item: Dict) -> List[Dict]:
+        """Split a scraped page into sentences and label each one with a sentiment.
+        Sentences shorter than 10 or longer than 500 characters are skipped. With a
+        loaded model only predictions above 0.7 confidence are kept; without one a
+        keyword heuristic labels every remaining sentence.
+        Args:
+            item: Scraped page dict; reads 'content' and 'url'
+        Returns:
+            At most 20 dicts with 'text', 'sentiment' and 'source_url' (plus
+            'confidence' for model predictions)
+        """
+        content = item['content']
+        # Split into sentences for more training examples
+        if HAS_NLTK:
+            try:
+                sentences = sent_tokenize(content)
+            except Exception:
+                # e.g. tokenizer data not installed - fall back to a naive split
+                sentences = content.split('. ')
+        else:
+            sentences = content.split('. ')
+        known_labels = ('positive', 'negative', 'neutral')
+        results = []
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if len(sentence) < 10 or len(sentence) > 500:  # Filter by length
+                continue
+            # Use AI model if available
+            if self.sentiment_analyzer:
+                try:
+                    scores = self.sentiment_analyzer(sentence)[0]
+                    if isinstance(scores, dict):
+                        # Pipeline configured to return only the best label
+                        scores = [scores]
+                    # The score list is not guaranteed to be sorted, so pick the
+                    # highest-scoring label explicitly instead of taking index 0.
+                    top = max(scores, key=lambda entry: entry['score'])
+                    # Label casing differs between models ('positive' vs 'POSITIVE');
+                    # compare case-insensitively and treat unknown labels as neutral.
+                    label = str(top['label']).lower()
+                    sentiment = label if label in known_labels else 'neutral'
+                    confidence = float(top['score'])
+                    # Only include high-confidence predictions
+                    if confidence > 0.7:
+                        results.append({
+                            'text': sentence,
+                            'sentiment': sentiment,
+                            'confidence': confidence,
+                            'source_url': item['url']
+                        })
+                except Exception as e:
+                    logger.debug(f"Sentiment analysis failed: {e}")
+                    continue
+            else:
+                # Fallback: keyword-based sentiment
+                sentiment = self._keyword_sentiment(sentence)
+                results.append({
+                    'text': sentence,
+                    'sentiment': sentiment,
+                    'source_url': item['url']
+                })
+        return results[:20]  # Limit per document
+    def _process_text_classification(self, item: Dict) -> List[Dict]:
+        """Turn one scraped page into up to 10 (text, category) examples.
+        The category is derived from the page URL; every paragraph longer than 50
+        characters (after stripping) becomes one example.
+        NOTE(review): paragraphs are split on blank lines - confirm the scraped
+        content still contains them when it reaches this method.
+        """
+        content = item['content']
+        url = item['url']
+        category = self._extract_category_from_url(url)
+        paragraphs = []
+        for block in content.split('\n\n'):
+            block = block.strip()
+            if len(block) > 50:
+                paragraphs.append(block)
+        return [
+            {'text': paragraph, 'category': category, 'source_url': url}
+            for paragraph in paragraphs[:10]  # Limit per document
+        ]
+    def _process_ner(self, item: Dict) -> List[Dict]:
+        """Build named-entity examples from the first 20 sentences of a page.
+        Sentences shorter than 20 characters are skipped. Entities come from the
+        NER model when it is loaded, otherwise (or when it finds nothing or
+        fails) from self._simple_ner. Sentences without entities are dropped.
+        Args:
+            item: Scraped page dict; reads 'content' and 'url'
+        Returns:
+            Dicts with 'text', 'entities' and 'source_url'
+        """
+        content = item['content']
+        if HAS_NLTK:
+            try:
+                sentences = sent_tokenize(content)
+            except Exception:
+                # e.g. tokenizer data not installed - fall back to a naive split
+                sentences = content.split('. ')
+        else:
+            sentences = content.split('. ')
+        results = []
+        for sentence in sentences[:20]:  # Limit per document
+            sentence = sentence.strip()
+            if len(sentence) < 20:
+                continue
+            entities = []
+            if self.ner_model:
+                try:
+                    ner_results = self.ner_model(sentence)
+                    for entity in ner_results:
+                        entities.append({
+                            'text': entity['word'],
+                            'label': entity['entity_group'],
+                            # The pipeline reports numpy floats, which json.dumps
+                            # cannot serialise; store a plain Python float instead.
+                            'confidence': float(entity['score'])
+                        })
+                except Exception as e:
+                    logger.debug(f"NER failed: {e}")
+            # Fallback: simple pattern matching
+            if not entities:
+                entities = self._simple_ner(sentence)
+            if entities:
+                results.append({
+                    'text': sentence,
+                    'entities': entities,
+                    'source_url': item['url']
+                })
+        return results
+    def _process_qa(self, item: Dict) -> List[Dict]:
+        """Extract question/answer pairs from a scraped page.
+        Explicitly marked pairs ("Q: ... A: ..." or "Question: ... Answer: ...")
+        are preferred. Only when none are found does the method fall back to
+        pairing each sentence that ends in "?" with the sentence that follows it.
+        Questions and answers must both be longer than 10 characters, duplicate
+        pairs are dropped, and each strategy contributes at most 10 matches.
+        Args:
+            item: Scraped page dict; reads 'content' and 'url'
+        Returns:
+            Dicts with 'context' (first 500 characters of the page), 'question',
+            'answer' and 'source_url'
+        """
+        content = item['content']
+        context = content[:500]  # First 500 chars as context
+        source_url = item['url']
+        results = []
+        seen = set()
+        def add_pair(question: str, answer: str) -> None:
+            # Apply the length filter and drop pairs that were already emitted.
+            question, answer = question.strip(), answer.strip()
+            if len(question) <= 10 or len(answer) <= 10:
+                return
+            if (question, answer) in seen:
+                return
+            seen.add((question, answer))
+            results.append({
+                'context': context,
+                'question': question,
+                'answer': answer,
+                'source_url': source_url
+            })
+        # \b keeps the markers from matching inside other words such as "FAQ:".
+        explicit_patterns = (
+            r'\bQ:\s*(.+?)\s*\bA:\s*(.+?)(?=\bQ:|$)',
+            r'\bQuestion:\s*(.+?)\s*\bAnswer:\s*(.+?)(?=\bQuestion:|$)',
+        )
+        for pattern in explicit_patterns:
+            matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
+            for question, answer in matches[:10]:  # Limit per document
+                add_pair(question, answer)
+        if results:
+            return results
+        # Fallback: a question sentence followed by a non-question sentence.
+        # Working sentence by sentence keeps questions short and avoids the
+        # heavy backtracking of a greedy whole-document regex.
+        sentences = [part for part in re.split(r'(?<=[.!?])\s+', content) if part.strip()]
+        candidates = [
+            (question, answer)
+            for question, answer in zip(sentences, sentences[1:])
+            if question.rstrip().endswith('?') and not answer.rstrip().endswith('?')
+        ]
+        for question, answer in candidates[:10]:  # Limit per document
+            add_pair(question, answer)
+        return results
+    def _process_summarization(self, item: Dict) -> List[Dict]:
+        """Build (text, summary) examples from up to five 1000-character chunks of a page.
+        Chunks shorter than 100 characters are skipped. The summarizer model is
+        used when loaded; if it is missing, fails or returns nothing, the
+        extractive fallback is used instead. Chunks without a summary are dropped.
+        """
+        content = item['content']
+        chunk_size = 1000
+        examples = []
+        for start in range(0, len(content), chunk_size)[:5]:  # Limit per document
+            chunk = content[start:start + chunk_size]
+            if len(chunk) < 100:
+                continue
+            summary = ""
+            if self.summarizer and len(chunk) > 100:
+                try:
+                    summary = self.summarizer(chunk, max_length=100, min_length=30)[0]['summary_text']
+                except Exception as e:
+                    logger.debug(f"Summarization failed: {e}")
+            # Fallback: extractive summary
+            if not summary:
+                summary = self._extractive_summary(chunk)
+            if not summary:
+                continue
+            examples.append({
+                'text': chunk,
+                'summary': summary,
+                'source_url': item['url']
+            })
+        return examples
+    def _process_translation(self, item: Dict) -> List[Dict]:
+        """Placeholder for the translation template: produces no examples.
+        Real support would need a translation model; returning an empty list
+        keeps callers working without errors.
+        """
+        no_examples: List[Dict] = []
+        return no_examples
+    def _process_generic(self, item: Dict) -> List[Dict]:
+        """Fallback processor: one example per paragraph, at most 10 per page.
+        Paragraphs are separated by blank lines and kept only when longer than
+        50 characters after stripping.
+        """
+        stripped = (block.strip() for block in item['content'].split('\n\n'))
+        paragraphs = [block for block in stripped if len(block) > 50]
+        return [
+            {'text': paragraph, 'source_url': item['url']}
+            for paragraph in paragraphs[:10]
+        ]
+    def export_dataset(self, format_type: str) -> Tuple[str, str]:
+        """Serialise self.processed_data to a file in the system temp directory.
+        Args:
+            format_type: One of "JSON", "CSV", "HuggingFace Dataset" or "JSONL"
+        Returns:
+            Tuple of (status_message, file_path); file_path is "" when there is
+            no data, the format is unsupported, or writing fails
+        """
+        import tempfile  # local import: only this method needs it
+        if not self.processed_data:
+            return "❌ No processed data available", ""
+        try:
+            stamp = int(time.time())
+            if format_type == "JSON":
+                data = json.dumps(self.processed_data, indent=2)
+                filename = f"dataset_{stamp}.json"
+            elif format_type == "CSV":
+                df = pd.DataFrame(self.processed_data)
+                data = df.to_csv(index=False)
+                filename = f"dataset_{stamp}.csv"
+            elif format_type == "HuggingFace Dataset":
+                # Records wrapped together with a small info header
+                hf_data = {
+                    "data": self.processed_data,
+                    "info": {
+                        "description": "AI Dataset Studio generated dataset",
+                        "created_at": datetime.now().isoformat(),
+                        "size": len(self.processed_data)
+                    }
+                }
+                data = json.dumps(hf_data, indent=2)
+                filename = f"hf_dataset_{stamp}.json"
+            elif format_type == "JSONL":
+                data = '\n'.join(json.dumps(record) for record in self.processed_data)
+                filename = f"dataset_{stamp}.jsonl"
+            else:
+                return "❌ Unsupported format", ""
+            # Save to a temporary file for download. gettempdir() resolves to
+            # /tmp on typical Linux hosts but also works on other platforms.
+            temp_path = os.path.join(tempfile.gettempdir(), filename)
+            # newline='' writes the serialised text verbatim, so line endings
+            # already present in the CSV are not translated a second time.
+            with open(temp_path, 'w', encoding='utf-8', newline='') as f:
+                f.write(data)
+            status = f"✅ Dataset exported successfully!\n"
+            status += f"📊 Records: {len(self.processed_data)}\n"
+            status += f"📁 Format: {format_type}\n"
+            status += f"📄 Size: {len(data):,} characters"
+            return status, temp_path
+        except Exception as e:
+            logger.error(f"Export failed: {e}")
+            return f"❌ Export failed: {str(e)}", ""
+    # Helper methods
+    def _is_valid_url(self, url: str) -> bool:
+        """Return True when *url* is an absolute http(s) URL with a host.
+        Other schemes (ftp, file, javascript, ...) are rejected up front because
+        the scraper only fetches over HTTP(S). Malformed input yields False.
+        """
+        try:
+            parsed = urlparse(url)
+        except (ValueError, AttributeError, TypeError):
+            # ValueError: malformed URL (e.g. bad IPv6 literal); the others: non-string input
+            return False
+        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
+    def _extract_title(self, soup: BeautifulSoup) -> str:
+        """Return the stripped text of the first <title>, else the first <h1>, else "Untitled"."""
+        for tag_name in ('title', 'h1'):
+            tag = soup.find(tag_name)
+            if tag:
+                return tag.get_text().strip()
+        return "Untitled"
+    def _extract_content(self, soup: BeautifulSoup) -> str:
+        """Return the page text collapsed into a single space-separated string.
+        script/style/nav/footer/header elements are removed from *soup* in place.
+        Text is taken from the first of <main>, <article> or a <div> whose class
+        matches content|main|article, and from the whole document otherwise.
+        """
+        for unwanted in soup(["script", "style", "nav", "footer", "header"]):
+            unwanted.decompose()
+        # Try to find main content
+        container = (
+            soup.find('main')
+            or soup.find('article')
+            or soup.find('div', class_=re.compile(r'content|main|article'))
+        )
+        raw_text = container.get_text() if container else soup.get_text()
+        # Clean text: strip each line, split on double spaces, drop empty pieces
+        pieces = []
+        for line in raw_text.splitlines():
+            for phrase in line.strip().split("  "):
+                phrase = phrase.strip()
+                if phrase:
+                    pieces.append(phrase)
+        return ' '.join(pieces)
+    def _keyword_sentiment(self, text: str) -> str:
+        """Classify *text* as positive, negative or neutral from keyword hits.
+
+        Each keyword counts at most once. Keywords are matched as whole words
+        (optionally followed by ``s`` or ``d``, so "liked" and "loves" count),
+        which keeps "like" from firing inside "dislike" and "bad" inside
+        "badge".
+
+        Args:
+            text: Text to classify; matching is case-insensitive.
+
+        Returns:
+            ``'positive'`` or ``'negative'`` when one keyword group has more
+            distinct hits than the other, otherwise ``'neutral'``.
+        """
+        positive_words = ['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like']
+        negative_words = ['bad', 'terrible', 'awful', 'hate', 'dislike', 'horrible', 'worst']
+        text_lower = text.lower()
+        def count_present(keywords):
+            # \b on both sides anchors the keyword to word boundaries
+            return sum(
+                1 for word in keywords
+                if re.search(rf"\b{re.escape(word)}(?:s|d)?\b", text_lower)
+            )
+        pos_count = count_present(positive_words)
+        neg_count = count_present(negative_words)
+        if pos_count > neg_count:
+            return 'positive'
+        if neg_count > pos_count:
+            return 'negative'
+        return 'neutral'
+    def _extract_category_from_url(self, url: str) -> str:
+        """Guess a coarse content category from the URL's host name.
+
+        Only the network location is inspected (the path is ignored). The
+        lower-cased host is checked for known substrings; the first category
+        with a match wins, and ``'general'`` is returned when nothing matches.
+        """
+        host = urlparse(url).netloc.lower()
+        # Order is significant: earlier rows take precedence
+        category_keywords = (
+            ('news', ('cnn', 'bbc', 'reuters', 'news')),
+            ('technology', ('techcrunch', 'wired', 'tech')),
+            ('business', ('bloomberg', 'forbes', 'business')),
+            ('sports', ('espn', 'sport')),
+        )
+        for category, keywords in category_keywords:
+            for keyword in keywords:
+                if keyword in host:
+                    return category
+        return 'general'
+    def _simple_ner(self, text: str) -> List[Dict]:
+        """Return up to five capitalised phrases from *text* as pseudo-entities.
+
+        Runs of capitalised words (e.g. "New York") are treated as one
+        candidate. Candidates of two characters or fewer are dropped. Every
+        result is labelled ``'MISC'`` with a fixed confidence of 0.5.
+        """
+        # Capitalized words (potential names/places)
+        candidates = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
+        found = [
+            {'text': phrase, 'label': 'MISC', 'confidence': 0.5}
+            for phrase in candidates
+            if len(phrase) > 2
+        ]
+        # Limit results
+        return found[:5]
+    def _extractive_summary(self, text: str) -> str:
+        """Summarise *text* by joining its first and last sentences.
+
+        Sentences are split on ``'. '``. Surrounding whitespace is ignored and
+        blank fragments are discarded, so text ending in ``". "`` no longer
+        produces a summary with an empty final sentence.
+
+        Args:
+            text: Text to summarise.
+
+        Returns:
+            *text* unchanged when it has two sentences or fewer, otherwise
+            ``"<first>. <last>"``.
+        """
+        sentences = [part for part in text.strip().split('. ') if part.strip()]
+        if len(sentences) <= 2:
+            return text
+        # Take first and last sentences
+        return f"{sentences[0]}. {sentences[-1]}"
 def create_modern_interface():
+    """Build the Gradio Blocks UI for the four-step dataset workflow.
+
+    The layout has four tabs (project setup, source discovery and scraping,
+    data processing, export). Their buttons are wired to one ``DatasetStudio``
+    instance created here. The AI discovery controls are only built when
+    ``HAS_PERPLEXITY`` is true.
+
+    Returns:
+        The assembled ``gr.Blocks`` object; the caller launches it.
+    """
+    logger.info("🎨 Creating modern interface...")
+    # Initialize the studio
     studio = DatasetStudio()
+    def resolve_template_key(choice):
+        """Return the template key for a dropdown value.
+
+        A dropdown built from (label, value) pairs hands callbacks the value
+        only, so *choice* is normally the key string. A (label, key) pair is
+        still accepted so either form resolves to the same key.
+        """
+        if isinstance(choice, (list, tuple)):
+            return choice[1] if len(choice) > 1 else ""
+        return choice or ""
+    def export_for_download(format_type):
+        """Run the export and map an empty path to None for the file output."""
+        status, path = studio.export_dataset(format_type)
+        # export_dataset signals failure with ""; the File output expects None
+        return status, (path or None)
+    # Custom CSS for modern look
+    custom_css = """
+    .gradio-container {
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
     }
+    .main-header {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        padding: 2rem;
+        border-radius: 10px;
+        margin-bottom: 2rem;
+        text-align: center;
     }
     .step-header {
+        background: linear-gradient(90deg, #4facfe 0%, #00f2fe 100%);
+        color: white;
+        padding: 1rem;
+        border-radius: 8px;
+        margin: 1rem 0;
+        font-weight: bold;
     }
+    .template-card {
+        border: 2px solid #e1e5e9;
+        border-radius: 10px;
+        padding: 1rem;
+        margin: 0.5rem;
+        transition: all 0.3s ease;
+    }
+    .template-card:hover {
+        border-color: #4facfe;
+        box-shadow: 0 4px 12px rgba(79, 172, 254, 0.3);
+    }
+    .status-success {
+        background-color: #d4edda;
+        border-color: #c3e6cb;
+        color: #155724;
+        padding: 1rem;
+        border-radius: 5px;
+        border-left: 4px solid #28a745;
+    }
+    .status-error {
+        background-color: #f8d7da;
+        border-color: #f5c6cb;
+        color: #721c24;
+        padding: 1rem;
+        border-radius: 5px;
+        border-left: 4px solid #dc3545;
+    }
+    """
+    with gr.Blocks(css=custom_css, title="🚀 AI Dataset Studio", theme=gr.themes.Soft()) as interface:
+        # Main header
         gr.HTML("""
+        <div class="main-header">
             <h1>🚀 AI Dataset Studio</h1>
+            <p>Create high-quality training datasets with AI-powered source discovery</p>
+            <p><strong>🧠 Powered by Perplexity AI • 🤖 Advanced NLP • 📊 Professional Export</strong></p>
         </div>
         """)
+        with gr.Tabs() as tabs:
+            # Tab 1: Project Setup
+            with gr.TabItem("1️⃣ Project Setup", id=0):
+                gr.HTML('<div class="step-header">📋 Step 1: Create Your Dataset Project</div>')
                 with gr.Row():
                     with gr.Column(scale=2):
                         project_name = gr.Textbox(
+                            label="🏷️ Project Name",
+                            placeholder="e.g., Customer Review Sentiment Analysis",
+                            info="Give your dataset project a descriptive name"
                         )
+                        project_description = gr.Textbox(
+                            label="📝 Project Description",
+                            lines=3,
+                            placeholder="Describe what kind of dataset you want to create...",
+                            info="This will be used by AI to discover relevant sources"
                         )
                     with gr.Column(scale=1):
+                        # Template selection
+                        template_choices = list(DATASET_TEMPLATES.keys())
+                        template_labels = [DATASET_TEMPLATES[t]["name"] for t in template_choices]
+                        # The initial value must be a choice *value* (the key), not a (label, key) pair
+                        default_template = template_choices[0] if template_choices else None
+                        template_selector = gr.Dropdown(
+                            choices=list(zip(template_labels, template_choices)),
+                            label="📊 Dataset Template",
+                            value=default_template,
+                            info="Choose the type of ML task"
                         )
+                        # Template info
+                        template_info = gr.Markdown("Select a template to see details")
+                create_project_btn = gr.Button("🎯 Create Project", variant="primary", size="lg")
+                project_status = gr.Textbox(label="📊 Project Status", interactive=False)
+                # Update template info when selection changes
+                def update_template_info(template_choice):
+                    """Render a Markdown summary of the selected template."""
+                    template = DATASET_TEMPLATES.get(resolve_template_key(template_choice))
+                    if not template:
+                        return "Select a template to see details"
+                    info = f"**{template.get('name', '')}**\n\n"
+                    info += f"📖 {template.get('description', '')}\n\n"
+                    info += f"🏷️ **Fields:** {', '.join(template.get('fields', []))}\n\n"
+                    info += f"💡 **Example:** `{template.get('example', {})}`"
+                    return info
+                template_selector.change(
+                    fn=update_template_info,
+                    inputs=[template_selector],
+                    outputs=[template_info]
+                )
+            # Tab 2: AI Source Discovery
+            with gr.TabItem("2️⃣ AI Source Discovery", id=1):
+                gr.HTML('<div class="step-header">🧠 Step 2: Discover Sources with Perplexity AI</div>')
+                if HAS_PERPLEXITY:
+                    gr.Markdown("""
+                    ✨ **AI-Powered Source Discovery** - Let Perplexity AI find the best sources for your dataset!
+                    Just describe your project and AI will discover relevant, high-quality sources automatically.
+                    """)
+                    with gr.Row():
+                        with gr.Column():
+                            ai_search_description = gr.Textbox(
+                                label="🎯 Project Description for AI Search",
+                                lines=3,
+                                placeholder="e.g., I need product reviews for sentiment analysis training data...",
+                                info="Describe what sources you need - be specific!"
+                            )
+                            with gr.Row():
+                                search_type = gr.Dropdown(
+                                    choices=["general", "academic", "news", "technical"],
+                                    value="general",
+                                    label="🔍 Search Type"
+                                )
+                                max_sources = gr.Slider(
+                                    minimum=5,
+                                    maximum=50,
+                                    value=20,
+                                    step=5,
+                                    label="📊 Max Sources"
+                                )
+                            with gr.Row():
+                                include_academic = gr.Checkbox(label="📚 Include Academic Sources", value=True)
+                                include_news = gr.Checkbox(label="📰 Include News Sources", value=True)
+                    discover_btn = gr.Button("🧠 Discover Sources with AI", variant="primary", size="lg")
+                    ai_search_status = gr.Textbox(label="🔍 Discovery Status", interactive=False)
+                    discovered_sources = gr.Code(label="📋 Discovered Sources", language="json", interactive=False)
+                    # Use discovered sources button
+                    use_ai_sources_btn = gr.Button("✅ Use These Sources", variant="secondary")
+                else:
+                    gr.Markdown("""
+                    ⚠️ **Perplexity AI Not Available**
+                    To enable AI-powered source discovery, set your `PERPLEXITY_API_KEY` environment variable.
+                    For now, you can manually enter URLs below.
+                    """)
+                    discovered_sources = gr.Code(value="[]", visible=False)
+                gr.HTML('<div class="step-header">📝 Manual URL Entry</div>')
+                urls_input = gr.Textbox(
+                    label="🔗 URLs to Scrape",
+                    lines=10,
+                    placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
+                    info="Enter one URL per line"
+                )
+                scrape_btn = gr.Button("🕷️ Start Scraping", variant="primary", size="lg")
+                scrape_status = gr.Textbox(label="📊 Scraping Status", interactive=False)
+                scraped_preview = gr.Code(label="👀 Scraped Data Preview", language="json", interactive=False)
+            # Tab 3: Data Processing
+            with gr.TabItem("3️⃣ Data Processing", id=2):
+                gr.HTML('<div class="step-header">⚙️ Step 3: Process Data with AI</div>')
+                processing_template = gr.Dropdown(
+                    choices=list(zip(template_labels, template_choices)),
+                    label="📊 Processing Template",
+                    value=default_template,
+                    info="How should the data be processed?"
+                )
+                process_btn = gr.Button("⚙️ Process Data", variant="primary", size="lg")
+                process_status = gr.Textbox(label="📊 Processing Status", interactive=False)
+                processed_preview = gr.Code(label="🎯 Processed Data Preview", language="json", interactive=False)
+            # Tab 4: Export Dataset
+            with gr.TabItem("4️⃣ Export Dataset", id=3):
+                gr.HTML('<div class="step-header">📦 Step 4: Export Your Dataset</div>')
+                export_format = gr.Dropdown(
+                    choices=["JSON", "CSV", "HuggingFace Dataset", "JSONL"],
+                    value="JSON",
+                    label="📄 Export Format",
+                    info="Choose format for your dataset"
+                )
+                export_btn = gr.Button("📦 Export Dataset", variant="primary", size="lg")
+                export_status = gr.Textbox(label="📊 Export Status", interactive=False)
+                download_file = gr.File(label="💾 Download Dataset", interactive=False)
+        # Event handlers
         create_project_btn.click(
+            fn=lambda name, desc, template: studio.create_project(name, resolve_template_key(template), desc),
+            inputs=[project_name, project_description, template_selector],
+            outputs=[project_status]
         )
+        # The discovery widgets only exist when the Perplexity branch above was built
+        if HAS_PERPLEXITY:
+            discover_btn.click(
+                fn=studio.discover_sources_with_ai,
+                inputs=[ai_search_description, max_sources, search_type, include_academic, include_news],
+                outputs=[ai_search_status, discovered_sources]
+            )
+            use_ai_sources_btn.click(
+                fn=lambda sources_json: '\n'.join(studio.extract_urls_from_sources(sources_json)),
+                inputs=[discovered_sources],
+                outputs=[urls_input]
+            )
         scrape_btn.click(
+            fn=studio.scrape_urls,
+            inputs=[urls_input],
+            outputs=[scrape_status, scraped_preview]
         )
         process_btn.click(
+            fn=lambda template: studio.process_data(resolve_template_key(template)),
+            inputs=[processing_template],
+            outputs=[process_status, processed_preview]
         )
         export_btn.click(
+            fn=export_for_download,
+            inputs=[export_format],
+            outputs=[export_status, download_file]
         )
+    logger.info("✅ Interface created successfully")
     return interface
+# Application startup
+# The interface is built at import time so that `interface` exists as a
+# module-level name; the server is only launched when the file is run as a script.
+# NOTE(review): the except branch uses `traceback` and `sys` - confirm both are
+# imported at the top of the file.
+try:
     logger.info("🚀 Starting AI Dataset Studio...")
+    logger.info("📊 Features: ✅ AI Models | ✅ Advanced NLP | ✅ HuggingFace Integration")
+    interface = create_modern_interface()
+    logger.info("✅ Application startup successful")
+    if __name__ == "__main__":
+        # Listen on all network interfaces, port 7860, without a public share link
         interface.launch(
             server_name="0.0.0.0",
             server_port=7860,
             share=False,
             show_error=True
         )
+except Exception as e:
+    # Any failure while building or launching the UI is logged with its
+    # traceback and the process exits with status 1.
+    logger.error(f"❌ Failed to launch application: {e}")
+    logger.error(f"Traceback: {traceback.format_exc()}")
+    sys.exit(1)