"""
AI Dataset Studio with Perplexity AI Integration

A comprehensive platform for creating high-quality training datasets using AI-powered source discovery.
"""

import gradio as gr
import pandas as pd
import requests
import json
import logging
import os
import sys
import time
import re
from datetime import datetime
from typing import List, Dict, Optional, Tuple, Any
from urllib.parse import urlparse, urljoin
from dataclasses import dataclass, asdict
import traceback

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
try:
    from bs4 import BeautifulSoup
    logger.info("✅ BeautifulSoup imported successfully")
except ImportError as e:
    logger.error("❌ Failed to import BeautifulSoup: %s", e)
    sys.exit(1)

try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize
    logger.info("✅ NLTK imported successfully")
    HAS_NLTK = True
except ImportError:
    logger.warning("⚠️ NLTK not available - using basic text processing")
    HAS_NLTK = False

try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import torch
    logger.info("✅ Transformers imported successfully")
    HAS_TRANSFORMERS = True
except ImportError:
    logger.warning("⚠️ Transformers not available - using extractive summaries")
    HAS_TRANSFORMERS = False

try:
    from perplexity_client import PerplexityClient, SearchType, SourceResult, SearchResults
    logger.info("✅ Perplexity client imported successfully")
    HAS_PERPLEXITY = True
except ImportError:
    logger.warning("⚠️ Perplexity client not available - manual source entry only")
    HAS_PERPLEXITY = False
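
# Note: perplexity_client is a local helper module; the interface assumed here is
# inferred from how it is used below (PerplexityClient(api_key).discover_sources(...)
# returning a SearchResults object with .sources, .search_time and .suggestions,
# where each SourceResult exposes .url, .title, .description, .source_type,
# .domain and .relevance_score). Adjust if the actual client differs.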
|
|
|
|
|
DATASET_TEMPLATES = {
    "sentiment_analysis": {
        "name": "Sentiment Analysis",
        "description": "Classify text as positive, negative, or neutral",
        "fields": ["text", "sentiment"],
        "example": {"text": "This product is amazing!", "sentiment": "positive"},
        "search_queries": ["product reviews", "customer feedback", "social media posts", "movie reviews"]
    },
    "text_classification": {
        "name": "Text Classification",
        "description": "Categorize text into predefined classes",
        "fields": ["text", "category"],
        "example": {"text": "Breaking: Stock market reaches new high", "category": "finance"},
        "search_queries": ["news articles", "blog posts", "academic papers", "forum discussions"]
    },
    "named_entity_recognition": {
        "name": "Named Entity Recognition",
        "description": "Identify people, places, organizations in text",
        "fields": ["text", "entities"],
        "example": {"text": "Apple Inc. was founded by Steve Jobs in California",
                    "entities": [{"text": "Apple Inc.", "label": "ORG"}, {"text": "Steve Jobs", "label": "PERSON"}]},
        "search_queries": ["news articles", "biographies", "company reports", "wikipedia articles"]
    },
    "question_answering": {
        "name": "Question Answering",
        "description": "Extract answers from context passages",
        "fields": ["context", "question", "answer"],
        "example": {"context": "The capital of France is Paris", "question": "What is the capital of France?", "answer": "Paris"},
        "search_queries": ["FAQ pages", "educational content", "interview transcripts", "knowledge bases"]
    },
    "text_summarization": {
        "name": "Text Summarization",
        "description": "Generate concise summaries of longer texts",
        "fields": ["text", "summary"],
        "example": {"text": "Long article content...", "summary": "Brief summary of key points"},
        "search_queries": ["news articles", "research papers", "blog posts", "reports"]
    },
    "translation": {
        "name": "Translation",
        "description": "Translate text between languages",
        "fields": ["source_text", "target_text", "source_lang", "target_lang"],
        "example": {"source_text": "Hello world", "target_text": "Hola mundo", "source_lang": "en", "target_lang": "es"},
        "search_queries": ["multilingual websites", "international news", "translation datasets", "parallel corpora"]
    }
}
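
# The template keys above double as the dispatch values used by DatasetStudio.process_data,
# and each template's "fields" list the core columns of the generated records
# (the processors additionally attach metadata such as source_url).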
|
|
|
class DatasetStudio:
    """
    Main Dataset Studio class.
    Handles all core functionality for dataset creation.
    """

    def __init__(self):
        """Initialize the Dataset Studio"""
        logger.info("Initializing AI Dataset Studio...")

        self.projects = {}
        self.current_project = None
        self.scraped_data = []
        self.processed_data = []

        self.sentiment_analyzer = None
        self.summarizer = None
        self.ner_model = None

        self.perplexity_client = None
        if HAS_PERPLEXITY:
            try:
                api_key = os.getenv('PERPLEXITY_API_KEY')
                if api_key:
                    self.perplexity_client = PerplexityClient(api_key)
                    logger.info("✅ Perplexity AI client initialized")
                else:
                    logger.warning("⚠️ PERPLEXITY_API_KEY not found - manual source entry only")
            except Exception as e:
                logger.error(f"❌ Failed to initialize Perplexity client: {e}")

        self._load_models()
        logger.info("✅ Dataset Studio initialized successfully")

    def _load_models(self):
        """Load AI models for processing"""
        if not HAS_TRANSFORMERS:
            logger.info("⚠️ Skipping model loading - transformers not available")
            return

        try:
            logger.info("Loading sentiment analysis model...")
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                return_all_scores=True
            )
            logger.info("✅ Sentiment analyzer loaded")
        except Exception as e:
            logger.warning(f"⚠️ Could not load sentiment analyzer: {e}")

        try:
            logger.info("Loading summarization model...")
            self.summarizer = pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                max_length=150,
                min_length=30,
                do_sample=False
            )
            logger.info("✅ Summarizer loaded")
        except Exception as e:
            logger.warning(f"⚠️ Could not load summarizer: {e}")

        try:
            logger.info("Loading NER model...")
            self.ner_model = pipeline(
                "ner",
                model="dbmdz/bert-large-cased-finetuned-conll03-english",
                aggregation_strategy="simple"
            )
            logger.info("✅ NER model loaded")
        except Exception as e:
            logger.warning(f"⚠️ Could not load NER model: {e}")

    def discover_sources_with_ai(
        self,
        project_description: str,
        max_sources: int = 20,
        search_type: str = "general",
        include_academic: bool = True,
        include_news: bool = True
    ) -> Tuple[str, str]:
        """
        Discover sources using Perplexity AI.

        Args:
            project_description: Description of the dataset project
            max_sources: Maximum number of sources to find
            search_type: Type of search (general, academic, news, etc.)
            include_academic: Include academic sources
            include_news: Include news sources

        Returns:
            Tuple of (status_message, sources_json)
        """
        if not self.perplexity_client:
            return "❌ Perplexity AI not available. Please set PERPLEXITY_API_KEY environment variable.", "[]"

        try:
            logger.info(f"Discovering sources for: {project_description}")

            search_type_enum = getattr(SearchType, search_type.upper(), SearchType.GENERAL)

            results = self.perplexity_client.discover_sources(
                project_description=project_description,
                search_type=search_type_enum,
                max_sources=max_sources,
                include_academic=include_academic,
                include_news=include_news
            )

            if not results.sources:
                return "⚠️ No sources found. Try adjusting your search terms.", "[]"

            sources_data = []
            for source in results.sources:
                sources_data.append({
                    "URL": source.url,
                    "Title": source.title,
                    "Description": source.description,
                    "Type": source.source_type,
                    "Domain": source.domain,
                    "Quality Score": f"{source.relevance_score:.1f}/10"
                })

            status = f"✅ Found {len(results.sources)} sources in {results.search_time:.1f}s"
            if results.suggestions:
                status += f"\nSuggestions: {', '.join(results.suggestions[:3])}"

            return status, json.dumps(sources_data, indent=2)

        except Exception as e:
            logger.error(f"❌ Error discovering sources: {e}")
            return f"❌ Error: {str(e)}", "[]"
|
|
|
    def extract_urls_from_sources(self, sources_json: str) -> List[str]:
        """Extract URLs from discovered sources JSON"""
        try:
            sources = json.loads(sources_json)
            if isinstance(sources, list):
                return [source.get("URL", "") for source in sources if source.get("URL")]
            return []
        except (json.JSONDecodeError, TypeError):
            # Malformed or empty JSON from the discovery step yields no URLs.
            return []
|
|
|
    def create_project(self, name: str, template: str, description: str) -> str:
        """Create a new dataset project"""
        if not name.strip():
            return "❌ Please provide a project name"

        project_id = f"project_{int(time.time())}"
        self.projects[project_id] = {
            "name": name,
            "template": template,
            "description": description,
            "created_at": datetime.now().isoformat(),
            "urls": [],
            "data": [],
            "processed_data": []
        }

        self.current_project = project_id

        template_info = DATASET_TEMPLATES.get(template, {})
        status = f"✅ Project '{name}' created successfully!\n"
        status += f"Template: {template_info.get('name', template)}\n"
        status += f"Description: {description}\n"
        status += f"Project ID: {project_id}"

        return status

    def scrape_urls(self, urls_text: str, progress=gr.Progress()) -> Tuple[str, str]:
        """Scrape content from provided URLs"""
        if not self.current_project:
            return "❌ Please create a project first", ""

        urls = []
        for line in urls_text.strip().split('\n'):
            url = line.strip()
            if url and self._is_valid_url(url):
                urls.append(url)

        if not urls:
            return "❌ No valid URLs found", ""

        scraped_data = []
        failed_urls = []

        progress(0, desc="Starting scraping...")

        for i, url in enumerate(urls):
            try:
                progress((i + 1) / len(urls), desc=f"Scraping {i + 1}/{len(urls)}")

                logger.info(f"Scraping: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                title = self._extract_title(soup)
                content = self._extract_content(soup)

                if content:
                    scraped_data.append({
                        'url': url,
                        'title': title,
                        'content': content,
                        'length': len(content),
                        'scraped_at': datetime.now().isoformat()
                    })
                    logger.info(f"✅ Scraped {len(content)} characters from {url}")
                else:
                    failed_urls.append(url)
                    logger.warning(f"⚠️ No content extracted from {url}")

                time.sleep(0.5)

            except Exception as e:
                failed_urls.append(url)
                logger.error(f"❌ Failed to scrape {url}: {e}")

        self.projects[self.current_project]['urls'] = urls
        self.projects[self.current_project]['data'] = scraped_data
        self.scraped_data = scraped_data

        status = f"✅ Scraping completed!\n"
        status += f"Successfully scraped: {len(scraped_data)} URLs\n"
        status += f"Failed: {len(failed_urls)} URLs\n"
        status += f"Total content: {sum(item['length'] for item in scraped_data):,} characters"

        if failed_urls:
            status += f"\n\nFailed URLs:\n" + "\n".join(f"• {url}" for url in failed_urls[:5])
            if len(failed_urls) > 5:
                status += f"\n... and {len(failed_urls) - 5} more"

        preview_data = []
        for item in scraped_data[:10]:
            preview_data.append({
                "Title": item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
                "URL": item['url'],
                "Length": f"{item['length']:,} chars",
                "Preview": item['content'][:100] + "..." if len(item['content']) > 100 else item['content']
            })

        return status, json.dumps(preview_data, indent=2)

    def process_data(self, template: str, progress=gr.Progress()) -> Tuple[str, str]:
        """Process scraped data according to template"""
        if not self.scraped_data:
            return "❌ No scraped data available. Please scrape URLs first.", ""

        template_config = DATASET_TEMPLATES.get(template, {})
        if not template_config:
            return f"❌ Unknown template: {template}", ""

        processed_data = []

        progress(0, desc="Starting data processing...")

        for i, item in enumerate(self.scraped_data):
            try:
                progress((i + 1) / len(self.scraped_data), desc=f"Processing {i + 1}/{len(self.scraped_data)}")

                if template == "sentiment_analysis":
                    processed_item = self._process_sentiment_analysis(item)
                elif template == "text_classification":
                    processed_item = self._process_text_classification(item)
                elif template == "named_entity_recognition":
                    processed_item = self._process_ner(item)
                elif template == "question_answering":
                    processed_item = self._process_qa(item)
                elif template == "text_summarization":
                    processed_item = self._process_summarization(item)
                elif template == "translation":
                    processed_item = self._process_translation(item)
                else:
                    processed_item = self._process_generic(item)

                if processed_item:
                    processed_data.extend(processed_item)

            except Exception as e:
                logger.error(f"❌ Error processing item {i}: {e}")
                continue

        self.processed_data = processed_data
        if self.current_project:
            self.projects[self.current_project]['processed_data'] = processed_data

        status = f"✅ Processing completed!\n"
        status += f"Generated {len(processed_data)} training examples\n"
        status += f"Template: {template_config['name']}\n"
        status += f"Fields: {', '.join(template_config['fields'])}"

        preview_data = processed_data[:10] if processed_data else []

        return status, json.dumps(preview_data, indent=2)
|
|
|
    def _process_sentiment_analysis(self, item: Dict) -> List[Dict]:
        """Process item for sentiment analysis"""
        content = item['content']

        if HAS_NLTK:
            try:
                sentences = sent_tokenize(content)
            except Exception:
                sentences = content.split('. ')
        else:
            sentences = content.split('. ')

        results = []

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 10 or len(sentence) > 500:
                continue

            if self.sentiment_analyzer:
                try:
                    # With return_all_scores=True the pipeline returns one list of
                    # {label, score} dicts per input; take the highest-scoring label.
                    scores = self.sentiment_analyzer(sentence)[0]
                    top = max(scores, key=lambda s: s['score'])
                    label_map = {'POSITIVE': 'positive', 'NEGATIVE': 'negative', 'NEUTRAL': 'neutral'}
                    sentiment = label_map.get(top['label'].upper(), top['label'].lower())
                    confidence = top['score']

                    if confidence > 0.7:
                        results.append({
                            'text': sentence,
                            'sentiment': sentiment,
                            'confidence': confidence,
                            'source_url': item['url']
                        })
                except Exception as e:
                    logger.debug(f"Sentiment analysis failed: {e}")
                    continue
            else:
                sentiment = self._keyword_sentiment(sentence)
                results.append({
                    'text': sentence,
                    'sentiment': sentiment,
                    'source_url': item['url']
                })

        return results[:20]
|
|
|
    def _process_text_classification(self, item: Dict) -> List[Dict]:
        """Process item for text classification"""
        content = item['content']

        url = item['url']
        category = self._extract_category_from_url(url)

        paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 50]

        results = []
        for paragraph in paragraphs[:10]:
            results.append({
                'text': paragraph,
                'category': category,
                'source_url': url
            })

        return results
|
|
|
    def _process_ner(self, item: Dict) -> List[Dict]:
        """Process item for Named Entity Recognition"""
        content = item['content']

        if HAS_NLTK:
            try:
                sentences = sent_tokenize(content)
            except Exception:
                sentences = content.split('. ')
        else:
            sentences = content.split('. ')

        results = []

        for sentence in sentences[:20]:
            sentence = sentence.strip()
            if len(sentence) < 20:
                continue

            entities = []

            if self.ner_model:
                try:
                    ner_results = self.ner_model(sentence)
                    for entity in ner_results:
                        entities.append({
                            'text': entity['word'],
                            'label': entity['entity_group'],
                            'confidence': entity['score']
                        })
                except Exception as e:
                    logger.debug(f"NER failed: {e}")

            if not entities:
                entities = self._simple_ner(sentence)

            if entities:
                results.append({
                    'text': sentence,
                    'entities': entities,
                    'source_url': item['url']
                })

        return results
|
|
|
    def _process_qa(self, item: Dict) -> List[Dict]:
        """Process item for Question Answering"""
        content = item['content']

        results = []

        qa_patterns = [
            (r'Q:\s*(.+?)\s*A:\s*(.+?)(?=Q:|$)', 'qa'),
            (r'Question:\s*(.+?)\s*Answer:\s*(.+?)(?=Question:|$)', 'qa'),
            (r'(.+\?)\s*(.+?)(?=.+\?|$)', 'simple')
        ]

        for pattern, style in qa_patterns:
            matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)

            for match in matches[:10]:
                if len(match) == 2:
                    question = match[0].strip()
                    answer = match[1].strip()

                    if len(question) > 10 and len(answer) > 10:
                        results.append({
                            'context': content[:500],
                            'question': question,
                            'answer': answer,
                            'source_url': item['url']
                        })

        return results

    def _process_summarization(self, item: Dict) -> List[Dict]:
        """Process item for summarization"""
        content = item['content']

        chunk_size = 1000
        chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]

        results = []

        for chunk in chunks[:5]:
            if len(chunk) < 100:
                continue

            summary = ""

            if self.summarizer and len(chunk) > 100:
                try:
                    summary_result = self.summarizer(chunk, max_length=100, min_length=30)
                    summary = summary_result[0]['summary_text']
                except Exception as e:
                    logger.debug(f"Summarization failed: {e}")

            if not summary:
                summary = self._extractive_summary(chunk)

            if summary:
                results.append({
                    'text': chunk,
                    'summary': summary,
                    'source_url': item['url']
                })

        return results

    def _process_translation(self, item: Dict) -> List[Dict]:
        """Process item for translation (placeholder)"""
        return []

    def _process_generic(self, item: Dict) -> List[Dict]:
        """Generic processing for unknown templates"""
        content = item['content']

        paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 50]

        results = []
        for paragraph in paragraphs[:10]:
            results.append({
                'text': paragraph,
                'source_url': item['url']
            })

        return results

    def export_dataset(self, format_type: str) -> Tuple[str, str]:
        """Export processed dataset"""
        if not self.processed_data:
            return "❌ No processed data available", ""

        try:
            if format_type == "JSON":
                data = json.dumps(self.processed_data, indent=2)
                filename = f"dataset_{int(time.time())}.json"

            elif format_type == "CSV":
                df = pd.DataFrame(self.processed_data)
                data = df.to_csv(index=False)
                filename = f"dataset_{int(time.time())}.csv"

            elif format_type == "HuggingFace Dataset":
                hf_data = {
                    "data": self.processed_data,
                    "info": {
                        "description": "AI Dataset Studio generated dataset",
                        "created_at": datetime.now().isoformat(),
                        "size": len(self.processed_data)
                    }
                }
                data = json.dumps(hf_data, indent=2)
                filename = f"hf_dataset_{int(time.time())}.json"

            elif format_type == "JSONL":
                lines = [json.dumps(item) for item in self.processed_data]
                data = '\n'.join(lines)
                filename = f"dataset_{int(time.time())}.jsonl"

            else:
                return "❌ Unsupported format", ""

            temp_path = f"/tmp/{filename}"
            with open(temp_path, 'w', encoding='utf-8') as f:
                f.write(data)

            status = f"✅ Dataset exported successfully!\n"
            status += f"Records: {len(self.processed_data)}\n"
            status += f"Format: {format_type}\n"
            status += f"Size: {len(data):,} characters"

            return status, temp_path

        except Exception as e:
            logger.error(f"Export failed: {e}")
            return f"❌ Export failed: {str(e)}", ""
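
    # Note (assumption): a JSONL export like the one above can typically be loaded for
    # training with the Hugging Face `datasets` library, e.g.
    #   from datasets import load_dataset
    #   ds = load_dataset("json", data_files="/tmp/dataset_<timestamp>.jsonl")
    # The exact path is whatever export_dataset() returned.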
|
|
|
|
|
    def _is_valid_url(self, url: str) -> bool:
        """Validate URL format"""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
|
|
|
    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract title from HTML"""
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text().strip()

        return "Untitled"

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from HTML"""
        for script in soup(["script", "style", "nav", "footer", "header"]):
            script.decompose()

        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article'))

        if main_content:
            text = main_content.get_text()
        else:
            text = soup.get_text()

        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text

    def _keyword_sentiment(self, text: str) -> str:
        """Simple keyword-based sentiment analysis"""
        positive_words = ['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like']
        negative_words = ['bad', 'terrible', 'awful', 'hate', 'dislike', 'horrible', 'worst']

        text_lower = text.lower()

        pos_count = sum(1 for word in positive_words if word in text_lower)
        neg_count = sum(1 for word in negative_words if word in text_lower)

        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        else:
            return 'neutral'

    def _extract_category_from_url(self, url: str) -> str:
        """Extract category based on URL domain/path"""
        domain = urlparse(url).netloc.lower()

        if any(news in domain for news in ['cnn', 'bbc', 'reuters', 'news']):
            return 'news'
        elif any(tech in domain for tech in ['techcrunch', 'wired', 'tech']):
            return 'technology'
        elif any(biz in domain for biz in ['bloomberg', 'forbes', 'business']):
            return 'business'
        elif any(sport in domain for sport in ['espn', 'sport']):
            return 'sports'
        else:
            return 'general'

    def _simple_ner(self, text: str) -> List[Dict]:
        """Simple pattern-based NER"""
        entities = []

        cap_words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)

        for word in cap_words:
            if len(word) > 2:
                entities.append({
                    'text': word,
                    'label': 'MISC',
                    'confidence': 0.5
                })

        return entities[:5]

    def _extractive_summary(self, text: str) -> str:
        """Simple extractive summarization"""
        sentences = text.split('. ')

        if len(sentences) <= 2:
            return text

        summary = f"{sentences[0]}. {sentences[-1]}"

        return summary
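

# A minimal headless usage sketch (an assumption: it bypasses the Gradio UI and
# expects the optional models/API key to be configured); it only calls the methods
# defined above:
#
#   studio = DatasetStudio()
#   studio.create_project("Review Sentiment", "sentiment_analysis", "Product review sentiment data")
#   status, preview = studio.scrape_urls("https://example.com/article1\nhttps://example.com/article2")
#   status, preview = studio.process_data("sentiment_analysis")
#   status, file_path = studio.export_dataset("JSONL")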
|
|
|
def create_modern_interface():
    """Create the modern Gradio interface"""
    logger.info("Creating modern interface...")

    studio = DatasetStudio()

    custom_css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }

    .main-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 2rem;
        border-radius: 10px;
        margin-bottom: 2rem;
        text-align: center;
    }

    .step-header {
        background: linear-gradient(90deg, #4facfe 0%, #00f2fe 100%);
        color: white;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
        font-weight: bold;
    }

    .template-card {
        border: 2px solid #e1e5e9;
        border-radius: 10px;
        padding: 1rem;
        margin: 0.5rem;
        transition: all 0.3s ease;
    }

    .template-card:hover {
        border-color: #4facfe;
        box-shadow: 0 4px 12px rgba(79, 172, 254, 0.3);
    }

    .status-success {
        background-color: #d4edda;
        border-color: #c3e6cb;
        color: #155724;
        padding: 1rem;
        border-radius: 5px;
        border-left: 4px solid #28a745;
    }

    .status-error {
        background-color: #f8d7da;
        border-color: #f5c6cb;
        color: #721c24;
        padding: 1rem;
        border-radius: 5px;
        border-left: 4px solid #dc3545;
    }
    """
|
|
|
    with gr.Blocks(css=custom_css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>AI Dataset Studio</h1>
            <p>Create high-quality training datasets with AI-powered source discovery</p>
            <p><strong>Powered by Perplexity AI • Advanced NLP • Professional Export</strong></p>
        </div>
        """)

        with gr.Tabs() as tabs:

            with gr.TabItem("1️⃣ Project Setup", id=0):
                gr.HTML('<div class="step-header">Step 1: Create Your Dataset Project</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        project_name = gr.Textbox(
                            label="Project Name",
                            placeholder="e.g., Customer Review Sentiment Analysis",
                            info="Give your dataset project a descriptive name"
                        )

                        project_description = gr.Textbox(
                            label="Project Description",
                            lines=3,
                            placeholder="Describe what kind of dataset you want to create...",
                            info="This will be used by AI to discover relevant sources"
                        )

                    with gr.Column(scale=1):
                        template_choices = list(DATASET_TEMPLATES.keys())
                        template_labels = [DATASET_TEMPLATES[t]["name"] for t in template_choices]

                        # The dropdown uses (label, value) pairs, so the component value
                        # (and what callbacks receive) is the template key, not the label.
                        template_selector = gr.Dropdown(
                            choices=list(zip(template_labels, template_choices)),
                            label="Dataset Template",
                            value=template_choices[0],
                            info="Choose the type of ML task"
                        )

                        template_info = gr.Markdown("Select a template to see details")

                create_project_btn = gr.Button("Create Project", variant="primary", size="lg")
                project_status = gr.Textbox(label="Project Status", interactive=False)

                def update_template_info(template_key):
                    template = DATASET_TEMPLATES.get(template_key, {})
                    if template:
                        info = f"**{template.get('name', '')}**\n\n"
                        info += f"{template.get('description', '')}\n\n"
                        info += f"**Fields:** {', '.join(template.get('fields', []))}\n\n"
                        info += f"**Example:** `{template.get('example', {})}`"
                        return info
                    return "Select a template to see details"

                template_selector.change(
                    fn=update_template_info,
                    inputs=[template_selector],
                    outputs=[template_info]
                )
|
|
|
|
|
            with gr.TabItem("2️⃣ AI Source Discovery", id=1):
                gr.HTML('<div class="step-header">Step 2: Discover Sources with Perplexity AI</div>')

                if HAS_PERPLEXITY:
                    gr.Markdown("""
                    ✨ **AI-Powered Source Discovery** - Let Perplexity AI find the best sources for your dataset!

                    Just describe your project and AI will discover relevant, high-quality sources automatically.
                    """)

                    with gr.Row():
                        with gr.Column():
                            ai_search_description = gr.Textbox(
                                label="Project Description for AI Search",
                                lines=3,
                                placeholder="e.g., I need product reviews for sentiment analysis training data...",
                                info="Describe what sources you need - be specific!"
                            )

                            with gr.Row():
                                search_type = gr.Dropdown(
                                    choices=["general", "academic", "news", "technical"],
                                    value="general",
                                    label="Search Type"
                                )

                                max_sources = gr.Slider(
                                    minimum=5,
                                    maximum=50,
                                    value=20,
                                    step=5,
                                    label="Max Sources"
                                )

                            with gr.Row():
                                include_academic = gr.Checkbox(label="Include Academic Sources", value=True)
                                include_news = gr.Checkbox(label="Include News Sources", value=True)

                    discover_btn = gr.Button("Discover Sources with AI", variant="primary", size="lg")

                    ai_search_status = gr.Textbox(label="Discovery Status", interactive=False)
                    discovered_sources = gr.Code(label="Discovered Sources", language="json", interactive=False)

                    use_ai_sources_btn = gr.Button("✅ Use These Sources", variant="secondary")

                else:
                    gr.Markdown("""
                    ⚠️ **Perplexity AI Not Available**

                    To enable AI-powered source discovery, set your `PERPLEXITY_API_KEY` environment variable.
                    For now, you can manually enter URLs below.
                    """)

                    discovered_sources = gr.Code(value="[]", visible=False)

                gr.HTML('<div class="step-header">Manual URL Entry</div>')

                urls_input = gr.Textbox(
                    label="URLs to Scrape",
                    lines=10,
                    placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
                    info="Enter one URL per line"
                )

                scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")
                scrape_status = gr.Textbox(label="Scraping Status", interactive=False)
                scraped_preview = gr.Code(label="Scraped Data Preview", language="json", interactive=False)
|
|
|
|
|
            with gr.TabItem("3️⃣ Data Processing", id=2):
                gr.HTML('<div class="step-header">Step 3: Process Data with AI</div>')

                processing_template = gr.Dropdown(
                    choices=list(zip(template_labels, template_choices)),
                    label="Processing Template",
                    value=template_choices[0],
                    info="How should the data be processed?"
                )

                process_btn = gr.Button("Process Data", variant="primary", size="lg")
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                processed_preview = gr.Code(label="Processed Data Preview", language="json", interactive=False)

            with gr.TabItem("4️⃣ Export Dataset", id=3):
                gr.HTML('<div class="step-header">Step 4: Export Your Dataset</div>')

                export_format = gr.Dropdown(
                    choices=["JSON", "CSV", "HuggingFace Dataset", "JSONL"],
                    value="JSON",
                    label="Export Format",
                    info="Choose format for your dataset"
                )

                export_btn = gr.Button("Export Dataset", variant="primary", size="lg")
                export_status = gr.Textbox(label="Export Status", interactive=False)
                download_file = gr.File(label="Download Dataset", interactive=False)
|
|
|
|
|
        # Event wiring: dropdowns deliver the template key directly (see the Dropdown
        # definitions above), so handlers receive plain strings.
        create_project_btn.click(
            fn=lambda name, desc, template: studio.create_project(name, template, desc),
            inputs=[project_name, project_description, template_selector],
            outputs=[project_status]
        )

        if HAS_PERPLEXITY:
            discover_btn.click(
                fn=studio.discover_sources_with_ai,
                inputs=[ai_search_description, max_sources, search_type, include_academic, include_news],
                outputs=[ai_search_status, discovered_sources]
            )

            use_ai_sources_btn.click(
                fn=lambda sources_json: '\n'.join(studio.extract_urls_from_sources(sources_json)),
                inputs=[discovered_sources],
                outputs=[urls_input]
            )

        scrape_btn.click(
            fn=studio.scrape_urls,
            inputs=[urls_input],
            outputs=[scrape_status, scraped_preview]
        )

        process_btn.click(
            fn=studio.process_data,
            inputs=[processing_template],
            outputs=[process_status, processed_preview]
        )

        export_btn.click(
            fn=studio.export_dataset,
            inputs=[export_format],
            outputs=[export_status, download_file]
        )
|
|
|
    logger.info("✅ Interface created successfully")
    return interface
|
|
|
|
|
try:
    logger.info("Starting AI Dataset Studio...")
    logger.info("Features: ✅ AI Models | ✅ Advanced NLP | ✅ HuggingFace Integration")

    interface = create_modern_interface()

    logger.info("✅ Application startup successful")

    if __name__ == "__main__":
        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True
        )

except Exception as e:
    logger.error(f"❌ Failed to launch application: {e}")
    logger.error(f"Traceback: {traceback.format_exc()}")
    sys.exit(1)