|
""" |
|
AI Dataset Studio - Minimal Version |
|
Guaranteed to work with basic dependencies only |
|
""" |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
import json |
|
import re |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urlparse |
|
from datetime import datetime |
|
import logging |
|
from typing import Dict, List, Tuple, Optional, Any |
|
from dataclasses import dataclass, asdict |
|
import uuid |
|
import time |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger(__name__) |
|
|
|
@dataclass |
|
class SimpleScrapedItem: |
|
"""Simplified scraped content structure""" |
|
id: str |
|
url: str |
|
title: str |
|
content: str |
|
word_count: int |
|
scraped_at: str |
|
quality_score: float = 0.0 |
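
# Illustrative sketch (not executed by the app): SimpleScrapedItem is a plain
# dataclass, so it serializes cleanly with dataclasses.asdict, which is what
# SimpleExporter below relies on.
#
#   item = SimpleScrapedItem(
#       id="1", url="https://example.com", title="Example", content="Some text",
#       word_count=2, scraped_at=datetime.now().isoformat(),
#   )
#   asdict(item)  # plain dict, ready for json.dump or pandas.DataFrame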
|
|
|
class SimpleWebScraper: |
|
"""Simplified web scraper with basic functionality""" |
|
|
|
def __init__(self): |
|
self.session = requests.Session() |
|
self.session.headers.update({ |
|
'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)', |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' |
|
}) |
|
|
|
def scrape_url(self, url: str) -> Optional[SimpleScrapedItem]: |
|
"""Scrape a single URL""" |
|
try: |
|
if not self._validate_url(url): |
|
return None |
|
|
|
response = self.session.get(url, timeout=10) |
|
response.raise_for_status() |
|
|
|
soup = BeautifulSoup(response.content, 'html.parser') |
|
|
|
|
|
title_tag = soup.find('title') |
|
title = title_tag.get_text().strip() if title_tag else "Untitled" |
|
|
|
|
|
|
|
for element in soup(['script', 'style', 'nav', 'header', 'footer']): |
|
element.decompose() |
|
|
|
|
|
content_element = (soup.find('article') or |
|
soup.find('main') or |
|
soup.find(class_='content') or |
|
soup.find('body')) |
|
|
|
if content_element: |
|
content = content_element.get_text(separator=' ', strip=True) |
|
else: |
|
content = soup.get_text(separator=' ', strip=True) |
|
|
|
|
|
content = re.sub(r'\s+', ' ', content).strip() |
|
|
|
|
|
word_count = len(content.split()) |
|
            quality_score = min(1.0, word_count / 100) if word_count > 0 else 0.0  # crude heuristic: caps at 1.0 around 100 words
|
|
|
return SimpleScrapedItem( |
|
id=str(uuid.uuid4()), |
|
url=url, |
|
title=title, |
|
content=content, |
|
word_count=word_count, |
|
scraped_at=datetime.now().isoformat(), |
|
quality_score=quality_score |
|
) |
|
|
|
except Exception as e: |
|
logger.error(f"Failed to scrape {url}: {e}") |
|
return None |
|
|
|
def _validate_url(self, url: str) -> bool: |
|
"""Basic URL validation""" |
|
try: |
|
parsed = urlparse(url) |
|
            return parsed.scheme in ['http', 'https'] and bool(parsed.netloc)
        except Exception:
            return False
|
|
|
def batch_scrape(self, urls: List[str], progress_callback=None) -> List[SimpleScrapedItem]: |
|
"""Scrape multiple URLs""" |
|
results = [] |
|
total = len(urls) |
|
|
|
for i, url in enumerate(urls): |
|
if progress_callback: |
|
progress_callback((i + 1) / total, f"Scraping {i+1}/{total}") |
|
|
|
item = self.scrape_url(url) |
|
if item: |
|
results.append(item) |
|
|
|
            time.sleep(1)  # polite pause between requests
|
|
|
return results |
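
# Illustrative usage sketch for SimpleWebScraper (the URLs are placeholders,
# not part of the app): scrape_url returns None on failure, and batch_scrape
# skips failed URLs while pausing briefly between requests.
#
#   scraper = SimpleWebScraper()
#   item = scraper.scrape_url("https://example.com/article")
#   if item is not None:
#       print(item.title, item.word_count, item.quality_score)
#   items = scraper.batch_scrape(["https://example.com/a", "https://example.com/b"])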
|
|
|
class SimpleDataProcessor: |
|
"""Basic data processing""" |
|
|
|
def process_items(self, items: List[SimpleScrapedItem], options: Dict[str, bool]) -> List[SimpleScrapedItem]: |
|
"""Process scraped items""" |
|
processed = [] |
|
|
|
for item in items: |
|
|
|
if options.get('quality_filter', True) and item.quality_score < 0.3: |
|
continue |
|
|
|
|
|
if options.get('clean_text', True): |
|
item.content = self._clean_text(item.content) |
|
|
|
processed.append(item) |
|
|
|
return processed |
|
|
|
def _clean_text(self, text: str) -> str: |
|
"""Basic text cleaning""" |
|
|
|
text = re.sub(r'http\S+', '', text) |
|
|
|
text = re.sub(r'\s+', ' ', text) |
|
|
|
text = re.sub(r'(Click here|Read more|Subscribe|Advertisement)', '', text, flags=re.IGNORECASE) |
|
return text.strip() |
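
# Illustrative usage sketch for SimpleDataProcessor: both options default to
# True inside process_items, and quality_filter drops items scoring below 0.3.
#
#   processor = SimpleDataProcessor()
#   kept = processor.process_items(items, {"clean_text": True, "quality_filter": True})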
|
|
|
class SimpleExporter: |
|
"""Basic export functionality""" |
|
|
|
def export_dataset(self, items: List[SimpleScrapedItem], format_type: str) -> str: |
|
"""Export dataset""" |
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
|
if format_type == "json": |
|
filename = f"dataset_{timestamp}.json" |
|
data = [asdict(item) for item in items] |
|
with open(filename, 'w', encoding='utf-8') as f: |
|
json.dump(data, f, indent=2, ensure_ascii=False) |
|
return filename |
|
|
|
elif format_type == "csv": |
|
filename = f"dataset_{timestamp}.csv" |
|
data = [asdict(item) for item in items] |
|
df = pd.DataFrame(data) |
|
df.to_csv(filename, index=False) |
|
return filename |
|
|
|
else: |
|
raise ValueError(f"Unsupported format: {format_type}") |
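
# Illustrative usage sketch for SimpleExporter: only "json" and "csv" are
# supported, and the file is written to the current working directory.
#
#   exporter = SimpleExporter()
#   path = exporter.export_dataset(kept, "json")  # e.g. dataset_20240101_120000.json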
|
|
|
class SimpleDatasetStudio: |
|
"""Simplified main application""" |
|
|
|
def __init__(self): |
|
self.scraper = SimpleWebScraper() |
|
self.processor = SimpleDataProcessor() |
|
self.exporter = SimpleExporter() |
|
|
|
self.scraped_items = [] |
|
self.processed_items = [] |
|
self.current_project = None |
|
|
|
def create_project(self, name: str) -> Dict[str, Any]: |
|
"""Create a new project""" |
|
self.current_project = { |
|
'name': name, |
|
'id': str(uuid.uuid4()), |
|
'created_at': datetime.now().isoformat() |
|
} |
|
self.scraped_items = [] |
|
self.processed_items = [] |
|
return self.current_project |
|
|
|
def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]: |
|
"""Scrape URLs""" |
|
url_list = [url.strip() for url in urls if url.strip()] |
|
if not url_list: |
|
return 0, ["No valid URLs provided"] |
|
|
|
self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback) |
|
success_count = len(self.scraped_items) |
|
failed_count = len(url_list) - success_count |
|
|
|
errors = [] |
|
if failed_count > 0: |
|
errors.append(f"{failed_count} URLs failed") |
|
|
|
return success_count, errors |
|
|
|
def process_data(self, options: Dict[str, bool]) -> int: |
|
"""Process scraped data""" |
|
if not self.scraped_items: |
|
return 0 |
|
|
|
self.processed_items = self.processor.process_items(self.scraped_items, options) |
|
return len(self.processed_items) |
|
|
|
def get_preview(self) -> List[Dict[str, Any]]: |
|
"""Get data preview""" |
|
items = self.processed_items or self.scraped_items |
|
preview = [] |
|
|
|
for item in items[:5]: |
|
preview.append({ |
|
'Title': item.title[:50] + "..." if len(item.title) > 50 else item.title, |
|
'Content Preview': item.content[:100] + "..." if len(item.content) > 100 else item.content, |
|
'Word Count': item.word_count, |
|
'Quality Score': round(item.quality_score, 2), |
|
'URL': item.url[:50] + "..." if len(item.url) > 50 else item.url |
|
}) |
|
|
|
return preview |
|
|
|
def get_stats(self) -> Dict[str, Any]: |
|
"""Get dataset statistics""" |
|
items = self.processed_items or self.scraped_items |
|
if not items: |
|
return {} |
|
|
|
word_counts = [item.word_count for item in items] |
|
quality_scores = [item.quality_score for item in items] |
|
|
|
return { |
|
'total_items': len(items), |
|
'avg_word_count': round(sum(word_counts) / len(word_counts)), |
|
'avg_quality': round(sum(quality_scores) / len(quality_scores), 2), |
|
'min_words': min(word_counts), |
|
'max_words': max(word_counts) |
|
} |
|
|
|
def export_data(self, format_type: str) -> str: |
|
"""Export dataset""" |
|
items = self.processed_items or self.scraped_items |
|
if not items: |
|
raise ValueError("No data to export") |
|
|
|
return self.exporter.export_dataset(items, format_type) |
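
# Illustrative end-to-end sketch driving SimpleDatasetStudio without the
# Gradio UI (the URLs are placeholders):
#
#   studio = SimpleDatasetStudio()
#   studio.create_project("My Dataset")
#   studio.scrape_urls(["https://example.com/article1", "https://example.com/article2"])
#   studio.process_data({"clean_text": True, "quality_filter": True})
#   print(studio.get_stats())
#   print(studio.export_data("csv"))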
|
|
|
def create_simple_interface(): |
|
"""Create simplified Gradio interface""" |
|
|
|
studio = SimpleDatasetStudio() |
|
|
|
|
|
css = """ |
|
.container { max-width: 1200px; margin: auto; } |
|
.header { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; padding: 2rem; border-radius: 10px; |
|
text-align: center; margin-bottom: 2rem; |
|
} |
|
.step-box { |
|
background: #f8f9ff; border: 1px solid #e1e5ff; |
|
border-radius: 8px; padding: 1.5rem; margin: 1rem 0; |
|
} |
|
""" |
|
|
|
with gr.Blocks(css=css, title="AI Dataset Studio - Simple") as interface: |
|
|
|
|
|
gr.HTML(""" |
|
<div class="header"> |
|
<h1>🚀 AI Dataset Studio - Simple Version</h1>
|
<p>Create datasets from web content - No complex setup required!</p> |
|
</div> |
|
""") |
|
|
|
|
|
project_state = gr.State({}) |
|
|
|
with gr.Tabs(): |
|
|
|
|
|
with gr.Tab("π Project Setup"): |
|
gr.HTML('<div class="step-box"><h3>Step 1: Create Your Project</h3></div>') |
|
|
|
project_name = gr.Textbox( |
|
label="Project Name", |
|
placeholder="e.g., News Articles Dataset", |
|
value="My Dataset" |
|
) |
|
|
|
create_btn = gr.Button("Create Project", variant="primary") |
|
project_status = gr.Markdown("") |
|
|
|
def create_project_handler(name): |
|
if not name.strip(): |
|
return "β Please enter a project name", {} |
|
|
|
project = studio.create_project(name.strip()) |
|
status = f""" |
|
β
**Project Created!** |
|
|
|
**Name:** {project['name']} |
|
**ID:** {project['id'][:8]}... |
|
**Created:** {project['created_at'][:19]} |
|
|
|
π Next: Go to Data Collection tab |
|
""" |
|
return status, project |
|
|
|
create_btn.click( |
|
fn=create_project_handler, |
|
inputs=[project_name], |
|
outputs=[project_status, project_state] |
|
) |
|
|
|
|
|
with gr.Tab("π·οΈ Data Collection"): |
|
gr.HTML('<div class="step-box"><h3>Step 2: Scrape Web Content</h3></div>') |
|
|
|
urls_input = gr.Textbox( |
|
label="URLs to Scrape (one per line)", |
|
placeholder="https://example.com/article1\nhttps://example.com/article2", |
|
lines=6 |
|
) |
|
|
|
scrape_btn = gr.Button("Start Scraping", variant="primary") |
|
scrape_status = gr.Markdown("") |
|
|
|
def scrape_handler(urls_text, project, progress=gr.Progress()): |
|
if not project: |
|
return "β Create a project first" |
|
|
|
urls = [url.strip() for url in urls_text.split('\n') if url.strip()] |
|
if not urls: |
|
return "β No URLs provided" |
|
|
|
def progress_callback(pct, msg): |
|
progress(pct, desc=msg) |
|
|
|
success_count, errors = studio.scrape_urls(urls, progress_callback) |
|
|
|
if success_count > 0: |
|
return f""" |
|
β
**Scraping Complete!** |
|
|
|
**Success:** {success_count} URLs |
|
**Failed:** {len(urls) - success_count} URLs |
|
|
|
π Next: Go to Data Processing tab |
|
""" |
|
else: |
|
return f"β Scraping failed: {', '.join(errors)}" |
|
|
|
scrape_btn.click( |
|
fn=scrape_handler, |
|
inputs=[urls_input, project_state], |
|
outputs=[scrape_status] |
|
) |
|
|
|
|
|
with gr.Tab("βοΈ Data Processing"): |
|
gr.HTML('<div class="step-box"><h3>Step 3: Clean and Process Data</h3></div>') |
|
|
|
with gr.Row(): |
|
clean_text = gr.Checkbox(label="Clean Text", value=True) |
|
quality_filter = gr.Checkbox(label="Quality Filter", value=True) |
|
|
|
process_btn = gr.Button("Process Data", variant="primary") |
|
process_status = gr.Markdown("") |
|
|
|
def process_handler(clean, quality, project): |
|
if not project: |
|
return "β Create a project first" |
|
|
|
options = { |
|
'clean_text': clean, |
|
'quality_filter': quality |
|
} |
|
|
|
processed_count = studio.process_data(options) |
|
|
|
if processed_count > 0: |
|
return f""" |
|
β
**Processing Complete!** |
|
|
|
**Processed:** {processed_count} items |
|
|
|
π Next: Check Data Preview tab |
|
""" |
|
else: |
|
return "β No items passed processing filters" |
|
|
|
process_btn.click( |
|
fn=process_handler, |
|
inputs=[clean_text, quality_filter, project_state], |
|
outputs=[process_status] |
|
) |
|
|
|
|
|
with gr.Tab("π Data Preview"): |
|
gr.HTML('<div class="step-box"><h3>Step 4: Review Your Dataset</h3></div>') |
|
|
|
refresh_btn = gr.Button("Refresh Preview") |
|
preview_table = gr.DataFrame(label="Dataset Preview") |
|
stats_display = gr.JSON(label="Statistics") |
|
|
|
def refresh_handler(project): |
|
if not project: |
|
return None, {} |
|
|
|
                    preview = studio.get_preview()
                    stats = studio.get_stats()
                    # Wrap in a DataFrame so the gr.DataFrame component can render it
                    return pd.DataFrame(preview), stats
|
|
|
refresh_btn.click( |
|
fn=refresh_handler, |
|
inputs=[project_state], |
|
outputs=[preview_table, stats_display] |
|
) |
|
|
|
|
|
with gr.Tab("π€ Export Dataset"): |
|
gr.HTML('<div class="step-box"><h3>Step 5: Export Your Dataset</h3></div>') |
|
|
|
export_format = gr.Radio( |
|
choices=["JSON", "CSV"], |
|
label="Export Format", |
|
value="JSON" |
|
) |
|
|
|
export_btn = gr.Button("Export Dataset", variant="primary") |
|
export_status = gr.Markdown("") |
|
export_file = gr.File(label="Download", visible=False) |
|
|
|
def export_handler(format_type, project): |
|
if not project: |
|
return "β Create a project first", None |
|
|
|
                    try:
                        filename = studio.export_data(format_type.lower())
                        # Make the hidden File component visible so the export can be downloaded
                        return f"✅ Export successful! File: {filename}", gr.update(value=filename, visible=True)
                    except Exception as e:
                        return f"❌ Export failed: {str(e)}", None
|
|
|
export_btn.click( |
|
fn=export_handler, |
|
inputs=[export_format, project_state], |
|
outputs=[export_status, export_file] |
|
) |
|
|
|
|
|
        with gr.Accordion("📖 Quick Guide", open=False):
|
gr.Markdown(""" |
|
## How to Use |
|
|
|
1. **Create Project** - Give your dataset a name |
|
2. **Add URLs** - Paste URLs of web pages to scrape |
|
3. **Process Data** - Clean and filter the content |
|
4. **Review** - Check the quality of your dataset |
|
5. **Export** - Download in JSON or CSV format |
|
|
|
## Features |
|
- ✅ Smart content extraction
- ✅ Quality filtering
- ✅ Text cleaning
- ✅ JSON/CSV export
- ✅ Preview and statistics
|
|
|
## Tips |
|
- Use high-quality source URLs |
|
- Enable quality filtering for better results |
|
- Review your data before exporting |
|
- Start with 5-10 URLs to test |
|
""") |
|
|
|
return interface |
|
|
|
|
|
if __name__ == "__main__": |
|
logger.info("π Starting AI Dataset Studio (Simple Version)") |
|
|
|
try: |
|
interface = create_simple_interface() |
|
logger.info("β
Simple interface created successfully") |
|
|
|
interface.launch( |
|
server_name="0.0.0.0", |
|
server_port=7860, |
|
share=False, |
|
show_error=True |
|
) |
|
|
|
except Exception as e: |
|
logger.error(f"β Failed to launch: {e}") |
|
print("\nπ‘ If you see import errors, try installing:") |
|
print("pip install gradio pandas requests beautifulsoup4") |
|
raise |