MagicMeWizard committed on
Commit a4ca225 · verified · 1 Parent(s): e5de17a

Update app.py

Files changed (1)
  1. app.py +1210 -525
app.py CHANGED
@@ -1,149 +1,146 @@
1
  """
2
- AI-Powered Web Scraper - app.py
3
- Professional-grade web content extraction and AI summarization tool for Hugging Face Spaces
4
  """
5
 
6
  import gradio as gr
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from urllib.parse import urljoin, urlparse
10
  import pandas as pd
11
- from datetime import datetime
12
  import json
13
  import re
14
- import time
15
- from typing import List, Dict, Optional, Tuple
 
 
16
  import logging
 
 
17
  from pathlib import Path
18
- import os
19
- from dataclasses import dataclass
20
- from transformers import pipeline
21
- import nltk
22
- from nltk.tokenize import sent_tokenize
23
- import asyncio
24
- import aiohttp
25
- from concurrent.futures import ThreadPoolExecutor
26
  import hashlib
27
 
28
- # Download required NLTK data
29
  try:
30
- nltk.data.find('tokenizers/punkt')
31
- except LookupError:
32
- nltk.download('punkt', quiet=True)
33
 
34
  # Configure logging
35
- logging.basicConfig(level=logging.INFO)
36
  logger = logging.getLogger(__name__)
38
  @dataclass
39
- class ScrapedContent:
40
- """Data class for scraped content with metadata"""
 
41
  url: str
42
  title: str
43
  content: str
44
- summary: str
 
45
  word_count: int
46
- reading_time: int
47
- extracted_at: str
48
- author: Optional[str] = None
49
- publish_date: Optional[str] = None
50
- meta_description: Optional[str] = None
51
- keywords: List[str] = None
52
 
53
- class SecurityValidator:
54
- """Security validation for URLs and content"""
55
-
56
- ALLOWED_SCHEMES = {'http', 'https'}
57
- BLOCKED_DOMAINS = {
58
- 'localhost', '127.0.0.1', '0.0.0.0',
59
- '192.168.', '10.', '172.16.', '172.17.',
60
- '172.18.', '172.19.', '172.20.', '172.21.',
61
- '172.22.', '172.23.', '172.24.', '172.25.',
62
- '172.26.', '172.27.', '172.28.', '172.29.',
63
- '172.30.', '172.31.'
64
- }
65
-
66
- @classmethod
67
- def validate_url(cls, url: str) -> Tuple[bool, str]:
68
- """Validate URL for security concerns"""
69
- try:
70
- parsed = urlparse(url)
71
-
72
- # Check scheme
73
- if parsed.scheme not in cls.ALLOWED_SCHEMES:
74
- return False, f"Invalid scheme: {parsed.scheme}. Only HTTP/HTTPS allowed."
75
-
76
- # Check for blocked domains
77
- hostname = parsed.hostname or ''
78
- if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
79
- return False, "Access to internal/local networks is not allowed."
80
-
81
- # Basic malformed URL check
82
- if not parsed.netloc:
83
- return False, "Invalid URL format."
84
-
85
- return True, "URL is valid."
86
-
87
- except Exception as e:
88
- return False, f"URL validation error: {str(e)}"
89
 
90
- class RobotsTxtChecker:
91
- """Check robots.txt compliance"""
92
-
93
- @staticmethod
94
- def can_fetch(url: str, user_agent: str = "*") -> bool:
95
- """Check if URL can be fetched according to robots.txt"""
96
- try:
97
- parsed_url = urlparse(url)
98
- robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
99
-
100
- response = requests.get(robots_url, timeout=5)
101
- if response.status_code == 200:
102
- # Simple robots.txt parsing (basic implementation)
103
- lines = response.text.split('\n')
104
- user_agent_section = False
105
-
106
- for line in lines:
107
- line = line.strip()
108
- if line.startswith('User-agent:'):
109
- agent = line.split(':', 1)[1].strip()
110
- user_agent_section = agent == '*' or agent.lower() == user_agent.lower()
111
- elif user_agent_section and line.startswith('Disallow:'):
112
- disallowed = line.split(':', 1)[1].strip()
113
- if disallowed and url.endswith(disallowed):
114
- return False
115
-
116
- return True
117
-
118
- except Exception:
119
- # If robots.txt can't be fetched, assume allowed
120
- return True
121
 
122
- class ContentExtractor:
123
- """Advanced content extraction with multiple strategies"""
124
 
125
  def __init__(self):
126
  self.session = requests.Session()
127
  self.session.headers.update({
128
- 'User-Agent': 'Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)',
129
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
130
  'Accept-Language': 'en-US,en;q=0.5',
131
  'Accept-Encoding': 'gzip, deflate',
132
  'Connection': 'keep-alive',
133
- 'Upgrade-Insecure-Requests': '1',
134
  })
135
 
136
- def extract_content(self, url: str) -> Optional[ScrapedContent]:
137
- """Extract content from URL with robust error handling"""
 
 
 
 
138
  try:
139
- # Security validation
140
- is_valid, validation_msg = SecurityValidator.validate_url(url)
141
- if not is_valid:
142
- raise ValueError(f"Security validation failed: {validation_msg}")
143
-
144
- # Check robots.txt
145
- if not RobotsTxtChecker.can_fetch(url):
146
- raise ValueError("robots.txt disallows scraping this URL")
 
 
 
 
 
 
 
 
147
 
148
  # Fetch content
149
  response = self.session.get(url, timeout=15)
@@ -152,125 +149,82 @@ class ContentExtractor:
152
  # Parse HTML
153
  soup = BeautifulSoup(response.content, 'html.parser')
154
 
155
- # Extract metadata
156
  title = self._extract_title(soup)
157
- author = self._extract_author(soup)
158
- publish_date = self._extract_publish_date(soup)
159
- meta_description = self._extract_meta_description(soup)
160
-
161
- # Extract main content
162
- content = self._extract_main_content(soup)
163
-
164
- if not content or len(content.strip()) < 100:
165
- raise ValueError("Insufficient content extracted")
166
 
167
- # Calculate metrics
168
- word_count = len(content.split())
169
- reading_time = max(1, word_count // 200) # Average reading speed
170
-
171
- # Extract keywords
172
- keywords = self._extract_keywords(content)
173
-
174
- return ScrapedContent(
175
  url=url,
176
  title=title,
177
  content=content,
178
- summary="", # Will be filled by AI summarizer
179
- word_count=word_count,
180
- reading_time=reading_time,
181
- extracted_at=datetime.now().isoformat(),
182
- author=author,
183
- publish_date=publish_date,
184
- meta_description=meta_description,
185
- keywords=keywords
186
  )
187
 
 
 
188
  except Exception as e:
189
- logger.error(f"Content extraction failed for {url}: {str(e)}")
190
- raise
191
 
192
- def _extract_title(self, soup: BeautifulSoup) -> str:
193
- """Extract page title with fallbacks"""
194
- # Try meta og:title first
195
- og_title = soup.find('meta', property='og:title')
196
- if og_title and og_title.get('content'):
197
- return og_title['content'].strip()
198
-
199
- # Try regular title tag
200
- title_tag = soup.find('title')
201
- if title_tag:
202
- return title_tag.get_text().strip()
203
-
204
- # Try h1 as fallback
205
- h1_tag = soup.find('h1')
206
- if h1_tag:
207
- return h1_tag.get_text().strip()
208
-
209
- return "No title found"
210
-
211
- def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
212
- """Extract author information"""
213
- # Try multiple selectors for author
214
- author_selectors = [
215
- 'meta[name="author"]',
216
- 'meta[property="article:author"]',
217
- '.author',
218
- '.byline',
219
- '[rel="author"]'
220
- ]
221
 
222
- for selector in author_selectors:
223
- element = soup.select_one(selector)
224
- if element:
225
- if element.name == 'meta':
226
- return element.get('content', '').strip()
227
- else:
228
- return element.get_text().strip()
 
 
 
229
 
230
- return None
231
 
232
- def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
233
- """Extract publication date"""
234
- date_selectors = [
235
- 'meta[property="article:published_time"]',
236
- 'meta[name="publishdate"]',
237
- 'time[datetime]',
238
- '.publish-date',
239
- '.date'
240
  ]
241
 
242
- for selector in date_selectors:
243
  element = soup.select_one(selector)
244
  if element:
245
  if element.name == 'meta':
246
  return element.get('content', '').strip()
247
- elif element.name == 'time':
248
- return element.get('datetime', '').strip()
249
  else:
250
  return element.get_text().strip()
251
 
252
- return None
253
-
254
- def _extract_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
255
- """Extract meta description"""
256
- meta_desc = soup.find('meta', attrs={'name': 'description'})
257
- if meta_desc:
258
- return meta_desc.get('content', '').strip()
259
-
260
- og_desc = soup.find('meta', property='og:description')
261
- if og_desc:
262
- return og_desc.get('content', '').strip()
263
-
264
- return None
265
 
266
- def _extract_main_content(self, soup: BeautifulSoup) -> str:
267
- """Extract main content with multiple strategies"""
268
  # Remove unwanted elements
269
- for element in soup(['script', 'style', 'nav', 'header', 'footer',
270
- 'aside', 'advertisement', '.ads', '.sidebar']):
271
  element.decompose()
272
 
273
- # Try content-specific selectors first
274
  content_selectors = [
275
  'article',
276
  'main',
@@ -278,424 +232,1155 @@ class ContentExtractor:
278
  '.post-content',
279
  '.entry-content',
280
  '.article-body',
281
- '#content',
282
- '.story-body'
283
  ]
284
 
285
  for selector in content_selectors:
286
  element = soup.select_one(selector)
287
  if element:
288
  text = element.get_text(separator=' ', strip=True)
289
- if len(text) > 200: # Minimum content threshold
290
  return self._clean_text(text)
291
 
292
- # Fallback: extract from body
293
  body = soup.find('body')
294
  if body:
295
- text = body.get_text(separator=' ', strip=True)
296
- return self._clean_text(text)
297
 
298
- # Last resort: all text
299
  return self._clean_text(soup.get_text(separator=' ', strip=True))
301
  def _clean_text(self, text: str) -> str:
302
  """Clean extracted text"""
303
  # Remove extra whitespace
304
  text = re.sub(r'\s+', ' ', text)
305
 
306
- # Remove common unwanted patterns
307
- text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
308
- text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
309
- text = re.sub(r'Advertisement', '', text, flags=re.IGNORECASE)
310
 
311
  return text.strip()
312
 
313
- def _extract_keywords(self, content: str) -> List[str]:
314
- """Extract basic keywords from content"""
315
- # Simple keyword extraction (can be enhanced with NLP)
316
- words = re.findall(r'\b[A-Za-z]{4,}\b', content.lower())
317
- word_freq = {}
 
318
 
319
- for word in words:
320
- if word not in ['that', 'this', 'with', 'from', 'they', 'have', 'been', 'were', 'said']:
321
- word_freq[word] = word_freq.get(word, 0) + 1
 
 
 
322
 
323
- # Return top 10 keywords
324
- sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
325
- return [word for word, freq in sorted_words[:10]]
326
 
327
- class AISummarizer:
328
- """AI-powered content summarization"""
329
 
330
  def __init__(self):
331
- self.summarizer = None
332
- self._load_model()
 
 
333
 
334
- def _load_model(self):
335
- """Load summarization model with error handling"""
 
 
 
336
  try:
337
- self.summarizer = pipeline(
338
- "summarization",
339
- model="facebook/bart-large-cnn",
340
- tokenizer="facebook/bart-large-cnn"
 
 
 
 
 
 
 
341
  )
342
- logger.info("Summarization model loaded successfully")
 
343
  except Exception as e:
344
- logger.error(f"Failed to load summarization model: {e}")
345
- # Fallback to a smaller model
346
- try:
347
- self.summarizer = pipeline(
348
- "summarization",
349
- model="sshleifer/distilbart-cnn-12-6"
350
- )
351
- logger.info("Fallback summarization model loaded")
352
- except Exception as e2:
353
- logger.error(f"Failed to load fallback model: {e2}")
354
- self.summarizer = None
355
-
356
- def summarize(self, content: str, max_length: int = 300) -> str:
357
- """Generate AI summary of content"""
358
- if not self.summarizer:
359
- return self._extractive_summary(content)
360
 
 
 
 
 
 
 
 
 
 
361
  try:
362
- # Split content into chunks if too long
363
- max_input_length = 1024
364
- chunks = self._split_content(content, max_input_length)
365
-
366
- summaries = []
367
- for chunk in chunks:
368
- if len(chunk.split()) < 20: # Skip very short chunks
369
- continue
370
-
371
- result = self.summarizer(
372
- chunk,
373
- max_length=min(max_length, len(chunk.split()) // 2),
374
- min_length=30,
375
- do_sample=False
376
- )
377
- summaries.append(result[0]['summary_text'])
378
-
379
- # Combine summaries
380
- combined = ' '.join(summaries)
381
-
382
- # If still too long, summarize again
383
- if len(combined.split()) > max_length:
384
- result = self.summarizer(
385
- combined,
386
- max_length=max_length,
387
- min_length=50,
388
- do_sample=False
389
- )
390
- return result[0]['summary_text']
391
-
392
- return combined
393
 
394
  except Exception as e:
395
- logger.error(f"AI summarization failed: {e}")
396
- return self._extractive_summary(content)
397
-
398
- def _split_content(self, content: str, max_length: int) -> List[str]:
399
- """Split content into manageable chunks"""
400
- sentences = sent_tokenize(content)
401
- chunks = []
402
- current_chunk = []
403
- current_length = 0
404
-
405
- for sentence in sentences:
406
- sentence_length = len(sentence.split())
407
- if current_length + sentence_length > max_length and current_chunk:
408
- chunks.append(' '.join(current_chunk))
409
- current_chunk = [sentence]
410
- current_length = sentence_length
411
- else:
412
- current_chunk.append(sentence)
413
- current_length += sentence_length
414
 
415
- if current_chunk:
416
- chunks.append(' '.join(current_chunk))
 
417
 
418
- return chunks
419
 
420
- def _extractive_summary(self, content: str) -> str:
421
- """Fallback extractive summarization"""
422
- sentences = sent_tokenize(content)
423
- if len(sentences) <= 3:
424
- return content
425
 
426
- # Simple extractive approach: take first, middle, and last sentences
427
- summary_sentences = [
428
- sentences[0],
429
- sentences[len(sentences) // 2],
430
- sentences[-1]
431
- ]
432
 
433
- return ' '.join(summary_sentences)
434
 
435
- class WebScraperApp:
436
- """Main application class"""
437
 
438
  def __init__(self):
439
- self.extractor = ContentExtractor()
440
- self.summarizer = AISummarizer()
441
- self.scraped_data = []
442
 
443
- def process_url(self, url: str, summary_length: int = 300) -> Tuple[str, str, str, str]:
444
- """Process a single URL and return results"""
 
445
  try:
446
- if not url.strip():
447
- return "❌ Error", "Please enter a valid URL", "", ""
448
-
449
- # Add protocol if missing
450
- if not url.startswith(('http://', 'https://')):
451
- url = 'https://' + url
452
-
453
- # Extract content
454
- with gr.update(): # Show progress
455
- scraped_content = self.extractor.extract_content(url)
456
-
457
- # Generate summary
458
- summary = self.summarizer.summarize(scraped_content.content, summary_length)
459
- scraped_content.summary = summary
460
-
461
- # Store result
462
- self.scraped_data.append(scraped_content)
463
-
464
- # Format results
465
- metadata = f"""
466
- **📊 Content Analysis**
467
- - **Title:** {scraped_content.title}
468
- - **Author:** {scraped_content.author or 'Not found'}
469
- - **Published:** {scraped_content.publish_date or 'Not found'}
470
- - **Word Count:** {scraped_content.word_count:,}
471
- - **Reading Time:** {scraped_content.reading_time} minutes
472
- - **Extracted:** {scraped_content.extracted_at}
473
- """
474
 
475
- keywords_text = f"**🏷️ Keywords:** {', '.join(scraped_content.keywords[:10])}" if scraped_content.keywords else ""
476
 
477
- return (
478
- "✅ Success",
479
- metadata,
480
- f"**📝 AI Summary ({len(summary.split())} words):**\n\n{summary}",
481
- keywords_text
482
- )
483
 
484
- except Exception as e:
485
- error_msg = f"Failed to process URL: {str(e)}"
486
- logger.error(error_msg)
487
- return "❌ Error", error_msg, "", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
 
489
- def export_data(self, format_type: str) -> str:
490
- """Export scraped data to file"""
491
- if not self.scraped_data:
492
- return "No data to export"
493
 
494
  try:
 
495
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
496
 
497
- if format_type == "CSV":
498
- filename = f"scraped_data_{timestamp}.csv"
499
- df = pd.DataFrame([
500
- {
501
- 'URL': item.url,
502
- 'Title': item.title,
503
- 'Author': item.author,
504
- 'Published': item.publish_date,
505
- 'Word Count': item.word_count,
506
- 'Reading Time': item.reading_time,
507
- 'Summary': item.summary,
508
- 'Keywords': ', '.join(item.keywords) if item.keywords else '',
509
- 'Extracted At': item.extracted_at
510
- }
511
- for item in self.scraped_data
512
- ])
513
- df.to_csv(filename, index=False)
514
-
515
- elif format_type == "JSON":
516
- filename = f"scraped_data_{timestamp}.json"
517
- data = [
518
- {
519
- 'url': item.url,
520
- 'title': item.title,
521
- 'content': item.content,
522
- 'summary': item.summary,
523
- 'metadata': {
524
- 'author': item.author,
525
- 'publish_date': item.publish_date,
526
- 'word_count': item.word_count,
527
- 'reading_time': item.reading_time,
528
- 'keywords': item.keywords,
529
- 'extracted_at': item.extracted_at
530
- }
531
- }
532
- for item in self.scraped_data
533
- ]
534
- with open(filename, 'w', encoding='utf-8') as f:
535
- json.dump(data, f, indent=2, ensure_ascii=False)
536
-
537
- return filename
538
 
539
  except Exception as e:
540
- logger.error(f"Export failed: {e}")
541
- return f"Export failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
- def clear_data(self) -> str:
544
- """Clear all scraped data"""
545
- self.scraped_data.clear()
546
- return "Data cleared successfully"
 
 
 
 
 
 
547
 
548
- def create_interface():
549
- """Create the Gradio interface"""
550
- app = WebScraperApp()
551
 
552
- # Custom CSS for professional appearance
553
  custom_css = """
554
  .gradio-container {
555
- max-width: 1200px;
556
  margin: auto;
 
557
  }
558
- .main-header {
559
- text-align: center;
560
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
561
  color: white;
562
  padding: 2rem;
563
- border-radius: 10px;
564
  margin-bottom: 2rem;
 
 
565
  }
566
- .feature-box {
567
- background: #f8f9fa;
568
- border: 1px solid #e9ecef;
569
- border-radius: 8px;
 
570
  padding: 1.5rem;
571
  margin: 1rem 0;
 
572
  }
573
- .status-success {
574
- color: #28a745;
575
- font-weight: bold;
 
576
  }
577
- .status-error {
578
- color: #dc3545;
579
  font-weight: bold;
580
  }
581
  """
582
 
583
- with gr.Blocks(css=custom_css, title="AI Web Scraper") as interface:
584
 
585
  # Header
586
  gr.HTML("""
587
- <div class="main-header">
588
- <h1>🤖 AI-Powered Web Scraper</h1>
589
- <p>Professional content extraction and summarization for journalists, analysts, and researchers</p>
 
590
  </div>
591
  """)
592
 
593
- # Main interface
594
- with gr.Row():
595
- with gr.Column(scale=2):
596
- # Input section
597
- gr.HTML("<div class='feature-box'><h3>📡 Content Extraction</h3></div>")
 
598
 
599
- url_input = gr.Textbox(
600
- label="Enter URL to scrape",
601
- placeholder="https://example.com/article",
602
- lines=1
603
- )
604
 
605
  with gr.Row():
606
- summary_length = gr.Slider(
607
- minimum=100,
608
- maximum=500,
609
- value=300,
610
- step=50,
611
- label="Summary Length (words)"
612
- )
613
 
614
- scrape_btn = gr.Button("🚀 Extract & Summarize", variant="primary", size="lg")
615
 
616
- # Results section
617
- gr.HTML("<div class='feature-box'><h3>📊 Results</h3></div>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
 
619
- status_output = gr.Textbox(label="Status", lines=1, interactive=False)
620
- metadata_output = gr.Markdown(label="Metadata")
621
- summary_output = gr.Markdown(label="AI Summary")
622
- keywords_output = gr.Markdown(label="Keywords")
623
-
624
- with gr.Column(scale=1):
625
- # Export section
626
- gr.HTML("<div class='feature-box'><h3>💾 Export Options</h3></div>")
627
 
628
- export_format = gr.Radio(
629
- choices=["CSV", "JSON"],
630
- label="Export Format",
631
- value="CSV"
632
- )
633
 
634
- export_btn = gr.Button("📥 Export Data", variant="secondary")
635
- export_status = gr.Textbox(label="Export Status", lines=2, interactive=False)
636
 
637
- gr.HTML("<div class='feature-box'><h3>🧹 Data Management</h3></div>")
638
- clear_btn = gr.Button("🗑️ Clear All Data", variant="secondary")
639
- clear_status = gr.Textbox(label="Clear Status", lines=1, interactive=False)
640
-
641
- # Usage instructions
642
- with gr.Accordion("📚 Usage Instructions", open=False):
643
- gr.Markdown("""
644
- ### How to Use This Tool
645
-
646
- 1. **Enter URL**: Paste the URL of the article or webpage you want to analyze
647
- 2. **Adjust Settings**: Set your preferred summary length
648
- 3. **Extract Content**: Click "Extract & Summarize" to process the content
649
- 4. **Review Results**: View the extracted metadata, AI summary, and keywords
650
- 5. **Export Data**: Save your results in CSV or JSON format
651
-
652
- ### Features
653
- - 🛡️ **Security**: Built-in URL validation and robots.txt compliance
654
- - 🤖 **AI Summarization**: Advanced BART model for intelligent summarization
655
- - 📊 **Rich Metadata**: Author, publication date, reading time, and more
656
- - 🏷️ **Keyword Extraction**: Automatic identification of key terms
657
- - 💾 **Export Options**: CSV and JSON formats for further analysis
658
- - 🔄 **Batch Processing**: Process multiple URLs and export all results
659
-
660
- ### Supported Content
661
- - News articles and blog posts
662
- - Research papers and reports
663
- - Documentation and guides
664
- - Most HTML-based content
665
-
666
- ### Limitations
667
- - Respects robots.txt restrictions
668
- - Cannot access password-protected content
669
- - Some dynamic content may not be captured
670
- - Processing time varies with content length
671
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
 
673
- # Event handlers
674
  scrape_btn.click(
675
- fn=app.process_url,
676
- inputs=[url_input, summary_length],
677
- outputs=[status_output, metadata_output, summary_output, keywords_output]
678
  )
679
 
680
  export_btn.click(
681
- fn=app.export_data,
682
- inputs=[export_format],
683
- outputs=[export_status]
684
  )
685
 
686
- clear_btn.click(
687
- fn=app.clear_data,
688
- outputs=[clear_status]
 
 
689
  )
690
 
691
  return interface
692
 
693
  # Launch the application
694
  if __name__ == "__main__":
695
- interface = create_interface()
696
- interface.launch(
697
- server_name="0.0.0.0",
698
- server_port=7860,
699
- share=False,
700
- show_error=True
701
- )
1
  """
2
+ AI Dataset Studio - Modern Web Scraping & Dataset Creation Platform
3
+ A mini Scale AI for non-coders and vibe coders
4
+
5
+ Features:
6
+ - Intelligent web scraping with content extraction
7
+ - Automated data cleaning and preprocessing
8
+ - Interactive annotation tools
9
+ - Template-based workflows for common ML tasks
10
+ - High-quality dataset generation
11
+ - Export to HuggingFace Hub and popular ML formats
12
+ - Visual data quality metrics
13
+ - No-code dataset creation workflows
14
  """
15
 
16
  import gradio as gr
17
  import pandas as pd
18
+ import numpy as np
19
  import json
20
  import re
21
+ import requests
22
+ from bs4 import BeautifulSoup
23
+ from urllib.parse import urlparse, urljoin
24
+ from datetime import datetime, timedelta
25
  import logging
26
+ from typing import Dict, List, Tuple, Optional, Any
27
+ from dataclasses import dataclass, asdict
28
  from pathlib import Path
29
+ import uuid
30
  import hashlib
31
+ import time
32
+ from collections import defaultdict
33
+ import io
34
+ import zipfile
35
+
36
+ # Optional imports with fallbacks
37
+ try:
38
+ from transformers import pipeline, AutoTokenizer, AutoModel
39
+ from sentence_transformers import SentenceTransformer
40
+ HAS_TRANSFORMERS = True
41
+ except ImportError:
42
+ HAS_TRANSFORMERS = False
43
 
 
44
  try:
45
+ import nltk
46
+ from nltk.tokenize import sent_tokenize, word_tokenize
47
+ from nltk.corpus import stopwords
48
+ HAS_NLTK = True
49
+ except ImportError:
50
+ HAS_NLTK = False
51
+
52
+ try:
53
+ from datasets import Dataset, DatasetDict
54
+ HAS_DATASETS = True
55
+ except ImportError:
56
+ HAS_DATASETS = False
57
 
58
  # Configure logging
59
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
60
  logger = logging.getLogger(__name__)
61
 
62
+ # Download NLTK data if available
63
+ if HAS_NLTK:
64
+ try:
65
+ nltk.download('punkt', quiet=True)
66
+ nltk.download('stopwords', quiet=True)
67
+ nltk.download('averaged_perceptron_tagger', quiet=True)
68
+ except:
69
+ pass
70
+
71
  @dataclass
72
+ class ScrapedItem:
73
+ """Data class for scraped content"""
74
+ id: str
75
  url: str
76
  title: str
77
  content: str
78
+ metadata: Dict[str, Any]
79
+ scraped_at: str
80
  word_count: int
81
+ language: str = "en"
82
+ quality_score: float = 0.0
83
+ labels: List[str] = None
84
+ annotations: Dict[str, Any] = None
 
 
85
 
86
+ def __post_init__(self):
87
+ if self.labels is None:
88
+ self.labels = []
89
+ if self.annotations is None:
90
+ self.annotations = {}
 
91
 
92
+ @dataclass
93
+ class DatasetTemplate:
94
+ """Template for dataset creation"""
95
+ name: str
96
+ description: str
97
+ task_type: str # classification, ner, qa, summarization, etc.
98
+ required_fields: List[str]
99
+ optional_fields: List[str]
100
+ example_format: Dict[str, Any]
101
+ instructions: str
102
 
103
+ class WebScraperEngine:
104
+ """Advanced web scraping engine with smart content extraction"""
105
 
106
  def __init__(self):
107
  self.session = requests.Session()
108
  self.session.headers.update({
109
+ 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0; Research)',
110
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
111
  'Accept-Language': 'en-US,en;q=0.5',
112
  'Accept-Encoding': 'gzip, deflate',
113
  'Connection': 'keep-alive',
 
114
  })
115
+
116
+ # Initialize AI models if available
117
+ self.content_classifier = None
118
+ self.quality_scorer = None
119
+ self._load_models()
120
 
121
+ def _load_models(self):
122
+ """Load AI models for content analysis"""
123
+ if not HAS_TRANSFORMERS:
124
+ logger.warning("⚠️ Transformers not available, using rule-based methods")
125
+ return
126
+
127
  try:
128
+ # Content quality assessment
129
+ self.quality_scorer = pipeline(
130
+ "text-classification",
131
+ model="martin-ha/toxic-comment-model",
132
+ return_all_scores=True
133
+ )
134
+ logger.info("✅ Quality assessment model loaded")
135
+ except Exception as e:
136
+ logger.warning(f"⚠️ Could not load quality model: {e}")
137
+
138
+ def scrape_url(self, url: str) -> Optional[ScrapedItem]:
139
+ """Scrape a single URL and return structured data"""
140
+ try:
141
+ # Validate URL
142
+ if not self._is_valid_url(url):
143
+ raise ValueError("Invalid URL provided")
144
 
145
  # Fetch content
146
  response = self.session.get(url, timeout=15)
 
149
  # Parse HTML
150
  soup = BeautifulSoup(response.content, 'html.parser')
151
 
152
+ # Extract structured data
153
  title = self._extract_title(soup)
154
+ content = self._extract_content(soup)
155
+ metadata = self._extract_metadata(soup, response)
156
 
157
+ # Create scraped item
158
+ item = ScrapedItem(
159
+ id=str(uuid.uuid4()),
160
  url=url,
161
  title=title,
162
  content=content,
163
+ metadata=metadata,
164
+ scraped_at=datetime.now().isoformat(),
165
+ word_count=len(content.split()),
166
+ quality_score=self._assess_quality(content)
167
  )
168
 
169
+ return item
170
+
171
  except Exception as e:
172
+ logger.error(f"Failed to scrape {url}: {e}")
173
+ return None
174
 
175
+ def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]:
176
+ """Scrape multiple URLs with progress tracking"""
177
+ results = []
178
+ total = len(urls)
179
 
180
+ for i, url in enumerate(urls):
181
+ if progress_callback:
182
+ progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")
183
+
184
+ item = self.scrape_url(url)
185
+ if item:
186
+ results.append(item)
187
+
188
+ # Rate limiting
189
+ time.sleep(1)
190
 
191
+ return results
192
+
193
+ def _is_valid_url(self, url: str) -> bool:
194
+ """Validate URL format and safety"""
195
+ try:
196
+ parsed = urlparse(url)
197
+ return parsed.scheme in ['http', 'https'] and parsed.netloc
198
+ except:
199
+ return False
200
 
201
+ def _extract_title(self, soup: BeautifulSoup) -> str:
202
+ """Extract page title"""
203
+ # Try multiple selectors
204
+ selectors = [
205
+ 'meta[property="og:title"]',
206
+ 'meta[name="twitter:title"]',
207
+ 'title',
208
+ 'h1'
209
  ]
210
 
211
+ for selector in selectors:
212
  element = soup.select_one(selector)
213
  if element:
214
  if element.name == 'meta':
215
  return element.get('content', '').strip()
 
 
216
  else:
217
  return element.get_text().strip()
218
 
219
+ return "Untitled"
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+ def _extract_content(self, soup: BeautifulSoup) -> str:
222
+ """Extract main content using multiple strategies"""
223
  # Remove unwanted elements
224
+ for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
 
225
  element.decompose()
226
 
227
+ # Try content-specific selectors
228
  content_selectors = [
229
  'article',
230
  'main',
 
232
  '.post-content',
233
  '.entry-content',
234
  '.article-body',
235
+ '[role="main"]'
 
236
  ]
237
 
238
  for selector in content_selectors:
239
  element = soup.select_one(selector)
240
  if element:
241
  text = element.get_text(separator=' ', strip=True)
242
+ if len(text) > 200:
243
  return self._clean_text(text)
244
 
245
+ # Fallback to body
246
  body = soup.find('body')
247
  if body:
248
+ return self._clean_text(body.get_text(separator=' ', strip=True))
 
249
 
 
250
  return self._clean_text(soup.get_text(separator=' ', strip=True))
251
 
252
+ def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
253
+ """Extract metadata from page"""
254
+ metadata = {
255
+ 'domain': urlparse(response.url).netloc,
256
+ 'status_code': response.status_code,
257
+ 'content_type': response.headers.get('content-type', ''),
258
+ 'extracted_at': datetime.now().isoformat()
259
+ }
260
+
261
+ # Extract meta tags
262
+ meta_tags = ['description', 'keywords', 'author', 'published_time']
263
+ for tag in meta_tags:
264
+ element = soup.find('meta', attrs={'name': tag}) or soup.find('meta', attrs={'property': f'article:{tag}'})
265
+ if element:
266
+ metadata[tag] = element.get('content', '')
267
+
268
+ return metadata
269
+
270
  def _clean_text(self, text: str) -> str:
271
  """Clean extracted text"""
272
  # Remove extra whitespace
273
  text = re.sub(r'\s+', ' ', text)
274
 
275
+ # Remove common patterns
276
+ patterns = [
277
+ r'Subscribe.*?newsletter',
278
+ r'Click here.*?more',
279
+ r'Advertisement',
280
+ r'Share this.*?social',
281
+ r'Follow us on.*?media'
282
+ ]
283
+
284
+ for pattern in patterns:
285
+ text = re.sub(pattern, '', text, flags=re.IGNORECASE)
286
 
287
  return text.strip()
288
 
289
+ def _assess_quality(self, content: str) -> float:
290
+ """Assess content quality (0-1 score)"""
291
+ if not content:
292
+ return 0.0
293
+
294
+ score = 0.0
295
 
296
+ # Length check
297
+ word_count = len(content.split())
298
+ if word_count >= 50:
299
+ score += 0.3
300
+ elif word_count >= 20:
301
+ score += 0.1
302
 
303
+ # Structure check (sentences)
304
+ sentence_count = len(re.split(r'[.!?]+', content))
305
+ if sentence_count >= 3:
306
+ score += 0.2
307
+
308
+ # Language quality (basic)
309
+ if re.search(r'[A-Z][a-z]+', content): # Proper capitalization
310
+ score += 0.2
311
+
312
+ if not re.search(r'[^\w\s]', content[:100]): # No weird characters at start
313
+ score += 0.1
314
+
315
+ # Readability (simple check)
316
+ avg_word_length = np.mean([len(word) for word in content.split()])
317
+ if 3 <= avg_word_length <= 8:
318
+ score += 0.2
319
+
320
+ return min(score, 1.0)
321
 
322
+ class DataProcessor:
323
+ """Advanced data processing and cleaning pipeline"""
324
 
325
  def __init__(self):
326
+ self.language_detector = None
327
+ self.sentiment_analyzer = None
328
+ self.ner_model = None
329
+ self._load_models()
330
 
331
+ def _load_models(self):
332
+ """Load NLP models for processing"""
333
+ if not HAS_TRANSFORMERS:
334
+ return
335
+
336
  try:
337
+ # Sentiment analysis
338
+ self.sentiment_analyzer = pipeline(
339
+ "sentiment-analysis",
340
+ model="cardiffnlp/twitter-roberta-base-sentiment-latest"
341
+ )
342
+
343
+ # Named Entity Recognition
344
+ self.ner_model = pipeline(
345
+ "ner",
346
+ model="dbmdz/bert-large-cased-finetuned-conll03-english",
347
+ aggregation_strategy="simple"
348
  )
349
+
350
+ logger.info("✅ NLP models loaded successfully")
351
  except Exception as e:
352
+ logger.warning(f"⚠️ Could not load NLP models: {e}")
353
+
354
+ def process_items(self, items: List[ScrapedItem], processing_options: Dict[str, bool]) -> List[ScrapedItem]:
355
+ """Process scraped items with various enhancement options"""
356
+ processed_items = []
357
 
358
+ for item in items:
359
+ processed_item = self._process_single_item(item, processing_options)
360
+ if processed_item:
361
+ processed_items.append(processed_item)
362
+
363
+ return processed_items
364
+
365
+ def _process_single_item(self, item: ScrapedItem, options: Dict[str, bool]) -> Optional[ScrapedItem]:
366
+ """Process a single item"""
367
  try:
368
+ # Clean content
369
+ if options.get('clean_text', True):
370
+ item.content = self._clean_text_advanced(item.content)
371
+
372
+ # Filter by quality
373
+ if options.get('quality_filter', True) and item.quality_score < 0.3:
374
+ return None
375
+
376
+ # Add sentiment analysis
377
+ if options.get('add_sentiment', False) and self.sentiment_analyzer:
378
+ sentiment = self._analyze_sentiment(item.content)
379
+ item.metadata['sentiment'] = sentiment
380
+
381
+ # Add named entities
382
+ if options.get('extract_entities', False) and self.ner_model:
383
+ entities = self._extract_entities(item.content)
384
+ item.metadata['entities'] = entities
385
+
386
+ # Add language detection
387
+ if options.get('detect_language', True):
388
+ item.language = self._detect_language(item.content)
389
+
390
+ return item
391
 
392
  except Exception as e:
393
+ logger.error(f"Error processing item {item.id}: {e}")
394
+ return None
395
+
396
+ def _clean_text_advanced(self, text: str) -> str:
397
+ """Advanced text cleaning"""
398
+ # Remove URLs
399
+ text = re.sub(r'http\S+|www\.\S+', '', text)
400
+
401
+ # Remove email addresses
402
+ text = re.sub(r'\S+@\S+', '', text)
403
+
404
+ # Remove excessive punctuation
405
+ text = re.sub(r'[!?]{2,}', '!', text)
406
+ text = re.sub(r'\.{3,}', '...', text)
407
+
408
+ # Normalize whitespace
409
+ text = re.sub(r'\s+', ' ', text)
 
 
410
 
411
+ # Remove very short paragraphs (likely navigation)
412
+ paragraphs = text.split('\n')
413
+ paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 20]
414
 
415
+ return '\n'.join(paragraphs).strip()
416
+
417
+ def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
418
+ """Analyze sentiment of text"""
419
+ try:
420
+ # Truncate text for model limits
421
+ text_sample = text[:512]
422
+ result = self.sentiment_analyzer(text_sample)[0]
423
+ return {
424
+ 'label': result['label'],
425
+ 'score': result['score']
426
+ }
427
+ except:
428
+ return {'label': 'UNKNOWN', 'score': 0.0}
429
+
430
+ def _extract_entities(self, text: str) -> List[Dict[str, Any]]:
431
+ """Extract named entities"""
432
+ try:
433
+ # Truncate text for model limits
434
+ text_sample = text[:512]
435
+ entities = self.ner_model(text_sample)
436
+ return [
437
+ {
438
+ 'text': ent['word'],
439
+ 'label': ent['entity_group'],
440
+ 'confidence': ent['score']
441
+ }
442
+ for ent in entities
443
+ ]
444
+ except:
445
+ return []
446
+
447
+ def _detect_language(self, text: str) -> str:
448
+ """Simple language detection"""
449
+ # Basic heuristic - could be enhanced with proper language detection
450
+ if re.search(r'[а-яё]', text.lower()):
451
+ return 'ru'
452
+ elif re.search(r'[ñáéíóúü]', text.lower()):
453
+ return 'es'
454
+ elif re.search(r'[àâäçéèêëïîôöùûüÿ]', text.lower()):
455
+ return 'fr'
456
+ else:
457
+ return 'en'
458
+
459
+ class AnnotationEngine:
460
+ """Interactive annotation tools for dataset creation"""
461
+
462
+ def __init__(self):
463
+ self.templates = self._load_templates()
464
 
465
+ def _load_templates(self) -> Dict[str, DatasetTemplate]:
466
+ """Load predefined dataset templates"""
467
+ templates = {
468
+ 'text_classification': DatasetTemplate(
469
+ name="Text Classification",
470
+ description="Classify text into predefined categories",
471
+ task_type="classification",
472
+ required_fields=["text", "label"],
473
+ optional_fields=["confidence", "metadata"],
474
+ example_format={"text": "Sample text", "label": "positive"},
475
+ instructions="Label each text with the appropriate category"
476
+ ),
477
+ 'sentiment_analysis': DatasetTemplate(
478
+ name="Sentiment Analysis",
479
+ description="Analyze emotional tone of text",
480
+ task_type="classification",
481
+ required_fields=["text", "sentiment"],
482
+ optional_fields=["confidence", "aspects"],
483
+ example_format={"text": "I love this!", "sentiment": "positive"},
484
+ instructions="Classify the sentiment as positive, negative, or neutral"
485
+ ),
486
+ 'named_entity_recognition': DatasetTemplate(
487
+ name="Named Entity Recognition",
488
+ description="Identify and classify named entities in text",
489
+ task_type="ner",
490
+ required_fields=["text", "entities"],
491
+ optional_fields=["metadata"],
492
+ example_format={
493
+ "text": "John works at OpenAI in San Francisco",
494
+ "entities": [
495
+ {"text": "John", "label": "PERSON", "start": 0, "end": 4},
496
+ {"text": "OpenAI", "label": "ORG", "start": 14, "end": 20}
497
+ ]
498
+ },
499
+ instructions="Mark all named entities (people, organizations, locations, etc.)"
500
+ ),
501
+ 'question_answering': DatasetTemplate(
502
+ name="Question Answering",
503
+ description="Create question-answer pairs from text",
504
+ task_type="qa",
505
+ required_fields=["context", "question", "answer"],
506
+ optional_fields=["answer_start", "metadata"],
507
+ example_format={
508
+ "context": "The capital of France is Paris.",
509
+ "question": "What is the capital of France?",
510
+ "answer": "Paris"
511
+ },
512
+ instructions="Create meaningful questions and provide accurate answers"
513
+ ),
514
+ 'summarization': DatasetTemplate(
515
+ name="Text Summarization",
516
+ description="Create concise summaries of longer texts",
517
+ task_type="summarization",
518
+ required_fields=["text", "summary"],
519
+ optional_fields=["summary_type", "length"],
520
+ example_format={
521
+ "text": "Long article text...",
522
+ "summary": "Brief summary of the main points"
523
+ },
524
+ instructions="Write clear, concise summaries capturing key information"
525
+ )
526
+ }
527
+ return templates
528
+
529
+ def create_annotation_interface(self, template_name: str, items: List[ScrapedItem]) -> Dict[str, Any]:
530
+ """Create annotation interface for specific template"""
531
+ template = self.templates.get(template_name)
532
+ if not template:
533
+ raise ValueError(f"Unknown template: {template_name}")
534
 
535
+ # Prepare data for annotation
536
+ annotation_data = []
537
+ for item in items:
538
+ annotation_data.append({
539
+ 'id': item.id,
540
+ 'text': item.content[:1000], # Truncate for UI
541
+ 'title': item.title,
542
+ 'url': item.url,
543
+ 'annotations': {}
544
+ })
545
 
546
+ return {
547
+ 'template': template,
548
+ 'data': annotation_data,
549
+ 'progress': 0,
550
+ 'completed': 0
551
+ }
552
 
553
+ class DatasetExporter:
554
+ """Export datasets in various formats for ML frameworks"""
555
 
556
  def __init__(self):
557
+ self.supported_formats = [
558
+ 'huggingface_datasets',
559
+ 'json',
560
+ 'csv',
561
+ 'parquet',
562
+ 'jsonl',
563
+ 'pytorch',
564
+ 'tensorflow'
565
+ ]
566
 
567
+ def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
568
+ export_format: str, annotations: Dict[str, Any] = None) -> str:
569
+ """Export annotated dataset in specified format"""
570
  try:
571
+ # Prepare dataset
572
+ dataset_data = self._prepare_dataset_data(items, template, annotations)
573
 
574
+ # Export based on format
575
+ if export_format == 'huggingface_datasets':
576
+ return self._export_huggingface(dataset_data, template)
577
+ elif export_format == 'json':
578
+ return self._export_json(dataset_data)
579
+ elif export_format == 'csv':
580
+ return self._export_csv(dataset_data)
581
+ elif export_format == 'jsonl':
582
+ return self._export_jsonl(dataset_data)
583
+ else:
584
+ raise ValueError(f"Unsupported format: {export_format}")
585
+
586
+ except Exception as e:
587
+ logger.error(f"Export failed: {e}")
588
+ raise
589
+
590
+ def _prepare_dataset_data(self, items: List[ScrapedItem], template: DatasetTemplate,
591
+ annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
592
+ """Prepare data according to template format"""
593
+ dataset_data = []
594
+
595
+ for item in items:
596
+ # Base data from scraped item
597
+ data_point = {
598
+ 'text': item.content,
599
+ 'title': item.title,
600
+ 'url': item.url,
601
+ 'metadata': item.metadata
602
+ }
603
 
604
+ # Add annotations if available
605
+ if annotations and item.id in annotations:
606
+ item_annotations = annotations[item.id]
607
+ data_point.update(item_annotations)
 
 
608
 
609
+ # Format according to template
610
+ formatted_point = self._format_for_template(data_point, template)
611
+ if formatted_point:
612
+ dataset_data.append(formatted_point)
613
+
614
+ return dataset_data
615
+
616
+ def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]:
617
+ """Format data point according to template requirements"""
618
+ formatted = {}
619
+
620
+ # Ensure required fields are present
621
+ for field in template.required_fields:
622
+ if field in data_point:
623
+ formatted[field] = data_point[field]
624
+ elif field == 'text' and 'content' in data_point:
625
+ formatted[field] = data_point['content']
626
+ else:
627
+ # Skip this data point if required field is missing
628
+ return None
629
+
630
+ # Add optional fields if present
631
+ for field in template.optional_fields:
632
+ if field in data_point:
633
+ formatted[field] = data_point[field]
634
+
635
+ return formatted
636
 
637
+ def _export_huggingface(self, dataset_data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
638
+ """Export as HuggingFace Dataset"""
639
+ if not HAS_DATASETS:
640
+ raise ImportError("datasets library not available")
641
 
642
  try:
643
+ # Create dataset
644
+ dataset = Dataset.from_list(dataset_data)
645
+
646
+ # Create dataset card
647
+ card_content = f"""
648
+ # {template.name} Dataset
649
+
650
+ ## Description
651
+ {template.description}
652
+
653
+ ## Task Type
654
+ {template.task_type}
655
+
656
+ ## Format
657
+ {template.example_format}
658
+
659
+ ## Instructions
660
+ {template.instructions}
661
+
662
+ ## Statistics
663
+ - Total samples: {len(dataset_data)}
664
+ - Created: {datetime.now().isoformat()}
665
+
666
+ ## Usage
667
+ ```python
668
+ from datasets import load_dataset
669
+ dataset = load_dataset('path/to/dataset')
670
+ ```
671
+ """
672
+
673
+ # Save dataset
674
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
675
+ dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
676
 
677
+ # Save locally (would push to Hub in production)
678
+ dataset.save_to_disk(dataset_name)
679
+
680
+ # Create info file
681
+ with open(f"{dataset_name}/README.md", "w") as f:
682
+ f.write(card_content)
683
+
684
+ return dataset_name
685
 
686
  except Exception as e:
687
+ logger.error(f"HuggingFace export failed: {e}")
688
+ raise
689
+
690
+ def _export_json(self, dataset_data: List[Dict[str, Any]]) -> str:
691
+ """Export as JSON file"""
692
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
693
+ filename = f"dataset_{timestamp}.json"
694
+
695
+ with open(filename, 'w', encoding='utf-8') as f:
696
+ json.dump(dataset_data, f, indent=2, ensure_ascii=False)
697
+
698
+ return filename
699
+
700
+ def _export_csv(self, dataset_data: List[Dict[str, Any]]) -> str:
701
+ """Export as CSV file"""
702
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
703
+ filename = f"dataset_{timestamp}.csv"
704
+
705
+ df = pd.DataFrame(dataset_data)
706
+ df.to_csv(filename, index=False)
707
+
708
+ return filename
709
 
710
+ def _export_jsonl(self, dataset_data: List[Dict[str, Any]]) -> str:
711
+ """Export as JSONL file"""
712
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
713
+ filename = f"dataset_{timestamp}.jsonl"
714
+
715
+ with open(filename, 'w', encoding='utf-8') as f:
716
+ for item in dataset_data:
717
+ f.write(json.dumps(item, ensure_ascii=False) + '\n')
718
+
719
+ return filename
720
 
721
+ def create_modern_interface():
722
+ """Create modern, intuitive interface for AI Dataset Studio"""
 
723
 
724
+ # Initialize the studio
725
+ studio = DatasetStudio()
726
+
727
+ # Custom CSS for modern appearance
728
  custom_css = """
729
  .gradio-container {
730
+ max-width: 1400px;
731
  margin: auto;
732
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
733
  }
734
+
735
+ .studio-header {
736
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
737
  color: white;
738
  padding: 2rem;
739
+ border-radius: 15px;
740
  margin-bottom: 2rem;
741
+ text-align: center;
742
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
743
  }
744
+
745
+ .workflow-card {
746
+ background: #f8f9ff;
747
+ border: 2px solid #e1e5ff;
748
+ border-radius: 12px;
749
  padding: 1.5rem;
750
  margin: 1rem 0;
751
+ transition: all 0.3s ease;
752
  }
753
+
754
+ .workflow-card:hover {
755
+ border-color: #667eea;
756
+ box-shadow: 0 4px 20px rgba(102, 126, 234, 0.1);
757
  }
758
+
759
+ .step-header {
760
+ display: flex;
761
+ align-items: center;
762
+ margin-bottom: 1rem;
763
+ font-size: 1.2em;
764
+ font-weight: 600;
765
+ color: #4c51bf;
766
+ }
767
+
768
+ .step-number {
769
+ background: #667eea;
770
+ color: white;
771
+ border-radius: 50%;
772
+ width: 30px;
773
+ height: 30px;
774
+ display: flex;
775
+ align-items: center;
776
+ justify-content: center;
777
+ margin-right: 1rem;
778
  font-weight: bold;
779
  }
780
+
781
+ .feature-grid {
782
+ display: grid;
783
+ grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
784
+ gap: 1rem;
785
+ margin: 1rem 0;
786
+ }
787
+
788
+ .feature-item {
789
+ background: white;
790
+ border: 1px solid #e2e8f0;
791
+ border-radius: 8px;
792
+ padding: 1rem;
793
+ text-align: center;
794
+ }
795
+
796
+ .stat-card {
797
+ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
798
+ color: white;
799
+ padding: 1rem;
800
+ border-radius: 10px;
801
+ text-align: center;
802
+ margin: 0.5rem;
803
+ }
804
+
805
+ .progress-bar {
806
+ background: #e2e8f0;
807
+ border-radius: 10px;
808
+ height: 8px;
809
+ overflow: hidden;
810
+ }
811
+
812
+ .progress-fill {
813
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
814
+ height: 100%;
815
+ transition: width 0.3s ease;
816
+ }
817
+
818
+ .template-card {
819
+ border: 2px solid #e2e8f0;
820
+ border-radius: 10px;
821
+ padding: 1rem;
822
+ margin: 0.5rem;
823
+ cursor: pointer;
824
+ transition: all 0.3s ease;
825
+ }
826
+
827
+ .template-card:hover {
828
+ border-color: #667eea;
829
+ transform: translateY(-2px);
830
+ box-shadow: 0 4px 12px rgba(0,0,0,0.1);
831
+ }
832
+
833
+ .template-selected {
834
+ border-color: #667eea;
835
+ background: #f7fafc;
836
+ }
837
+
838
+ .export-option {
839
+ background: #f7fafc;
840
+ border: 1px solid #e2e8f0;
841
+ border-radius: 8px;
842
+ padding: 1rem;
843
+ margin: 0.5rem 0;
844
+ cursor: pointer;
845
+ }
846
+
847
+ .export-option:hover {
848
+ background: #edf2f7;
849
+ border-color: #cbd5e0;
850
+ }
851
+
852
+ .success-message {
853
+ background: #f0fff4;
854
+ border: 1px solid #9ae6b4;
855
+ color: #276749;
856
+ padding: 1rem;
857
+ border-radius: 8px;
858
+ margin: 1rem 0;
859
+ }
860
+
861
+ .error-message {
862
+ background: #fed7d7;
863
+ border: 1px solid #feb2b2;
864
+ color: #c53030;
865
+ padding: 1rem;
866
+ border-radius: 8px;
867
+ margin: 1rem 0;
868
+ }
869
  """
870
 
871
+ # Project state for UI
872
+ project_state = gr.State({})
873
+
874
+ with gr.Blocks(css=custom_css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
875
 
876
  # Header
877
  gr.HTML("""
878
+ <div class="studio-header">
879
+ <h1>🚀 AI Dataset Studio</h1>
880
+ <p>Create high-quality training datasets without coding - Your personal Scale AI</p>
881
+ <p style="opacity: 0.9; font-size: 0.9em;">Web Scraping → Data Processing → Annotation → ML-Ready Datasets</p>
882
  </div>
883
  """)
884
 
885
+ # Main workflow tabs
886
+ with gr.Tabs() as main_tabs:
887
+
888
+ # Tab 1: Project Setup
889
+ with gr.Tab("🎯 Project Setup", id="setup"):
890
+ gr.HTML('<div class="step-header"><div class="step-number">1</div>Start Your Dataset Project</div>')
891
 
892
+ with gr.Row():
893
+ with gr.Column(scale=2):
894
+ gr.HTML("""
895
+ <div class="workflow-card">
896
+ <h3>📋 Project Configuration</h3>
897
+ <p>Define your dataset project and choose the type of AI task you're building for.</p>
898
+ </div>
899
+ """)
900
+
901
+ project_name = gr.Textbox(
902
+ label="Project Name",
903
+ placeholder="e.g., 'News Sentiment Analysis' or 'Product Review Classification'",
904
+ value="My Dataset Project"
905
+ )
906
+
907
+ # Template selection with visual cards
908
+ gr.HTML("<h4>🎨 Choose Your Dataset Template</h4>")
909
+
910
+ template_choice = gr.Radio(
911
+ choices=[
912
+ ("📊 Text Classification", "text_classification"),
913
+ ("😊 Sentiment Analysis", "sentiment_analysis"),
914
+ ("👥 Named Entity Recognition", "named_entity_recognition"),
915
+ ("❓ Question Answering", "question_answering"),
916
+ ("📝 Text Summarization", "summarization")
917
+ ],
918
+ label="Dataset Type",
919
+ value="text_classification",
920
+ interactive=True
921
+ )
922
+
923
+ create_project_btn = gr.Button(
924
+ "🚀 Create Project",
925
+ variant="primary",
926
+ size="lg"
927
+ )
928
+
929
+ project_status = gr.Markdown("")
930
+
931
+ with gr.Column(scale=1):
932
+ gr.HTML("""
933
+ <div class="workflow-card">
934
+ <h3>💡 Template Guide</h3>
935
+ <div class="feature-grid">
936
+ <div class="feature-item">
937
+ <h4>📊 Text Classification</h4>
938
+ <p>Categorize text into predefined labels</p>
939
+ <small>Great for: Spam detection, topic classification</small>
940
+ </div>
941
+ <div class="feature-item">
942
+ <h4>😊 Sentiment Analysis</h4>
943
+ <p>Analyze emotional tone and opinions</p>
944
+ <small>Great for: Review analysis, social media monitoring</small>
945
+ </div>
946
+ <div class="feature-item">
947
+ <h4>👥 Named Entity Recognition</h4>
948
+ <p>Identify people, places, organizations</p>
949
+ <small>Great for: Information extraction, content tagging</small>
950
+ </div>
951
+ </div>
952
+ </div>
953
+ """)
954
+
955
+ # Tab 2: Data Collection
956
+ with gr.Tab("🕷️ Data Collection", id="collection"):
957
+ gr.HTML('<div class="step-header"><div class="step-number">2</div>Collect Your Data</div>')
958
+
959
+ with gr.Row():
960
+ with gr.Column(scale=2):
961
+ gr.HTML("""
962
+ <div class="workflow-card">
963
+ <h3>🌐 Web Scraping</h3>
964
+ <p>Provide URLs to scrape content automatically. Our AI will extract clean, structured text.</p>
965
+ </div>
966
+ """)
967
+
968
+ # URL input methods
969
+ with gr.Tabs():
970
+ with gr.Tab("📝 Manual Input"):
971
+ urls_input = gr.Textbox(
972
+ label="URLs to Scrape",
973
+ placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
974
+ lines=8,
975
+ info="Enter one URL per line"
976
+ )
977
+
978
+ with gr.Tab("📎 File Upload"):
979
+ urls_file = gr.File(
980
+ label="Upload URL List",
981
+ file_types=[".txt", ".csv"],
982
+ info="Upload a text file with URLs (one per line) or CSV with 'url' column"
983
+ )
984
+
985
+ scrape_btn = gr.Button("🚀 Start Scraping", variant="primary", size="lg")
986
+
987
+ # Progress tracking
988
+ scraping_progress = gr.Progress()
989
+ scraping_status = gr.Markdown("")
990
+
991
+ with gr.Column(scale=1):
992
+ gr.HTML("""
993
+ <div class="workflow-card">
994
+ <h3>⚡ Features</h3>
995
+ <ul style="list-style: none; padding: 0;">
996
+ <li>✅ Smart content extraction</li>
997
+ <li>✅ Quality scoring</li>
998
+ <li>✅ Duplicate detection</li>
999
+ <li>✅ Security validation</li>
1000
+ <li>✅ Metadata extraction</li>
1001
+ <li>✅ Rate limiting</li>
1002
+ </ul>
1003
+ </div>
1004
+ """)
1005
+
1006
+ # Quick stats
1007
+ collection_stats = gr.HTML("")
1008
+
1009
+ # Tab 3: Data Processing
1010
+ with gr.Tab("⚙️ Data Processing", id="processing"):
1011
+ gr.HTML('<div class="step-header"><div class="step-number">3</div>Clean & Enhance Your Data</div>')
1012
 
1013
  with gr.Row():
1014
+ with gr.Column(scale=2):
1015
+ gr.HTML("""
1016
+ <div class="workflow-card">
1017
+ <h3>🔧 Processing Options</h3>
1018
+ <p>Configure how to clean and enhance your scraped data with AI-powered analysis.</p>
1019
+ </div>
1020
+ """)
1021
+
1022
+ # Processing options
1023
+ with gr.Row():
1024
+ with gr.Column():
1025
+ clean_text = gr.Checkbox(label="🧹 Advanced Text Cleaning", value=True)
1026
+ quality_filter = gr.Checkbox(label="🎯 Quality Filtering", value=True)
1027
+ detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)
1028
+
1029
+ with gr.Column():
1030
+ add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
1031
+ extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
1032
+ deduplicate = gr.Checkbox(label="🔄 Remove Duplicates", value=True)
1033
+
1034
+ process_btn = gr.Button("⚙️ Process Data", variant="primary", size="lg")
1035
+ processing_status = gr.Markdown("")
1036
+
1037
+ with gr.Column(scale=1):
1038
+ gr.HTML("""
1039
+ <div class="workflow-card">
1040
+ <h3>📊 Processing Stats</h3>
1041
+ <div id="processing-stats"></div>
1042
+ </div>
1043
+ """)
1044
+
1045
+ processing_stats = gr.HTML("")
1046
+
1047
+ # Tab 4: Data Preview
+ with gr.Tab("👀 Data Preview", id="preview"):
+ gr.HTML('<div class="step-header"><div class="step-number">4</div>Review Your Dataset</div>')

+ with gr.Row():
+ with gr.Column(scale=2):
+ gr.HTML("""
+ <div class="workflow-card">
+ <h3>📋 Dataset Preview</h3>
+ <p>Review your processed data before annotation or export.</p>
+ </div>
+ """)
+
+ refresh_preview_btn = gr.Button("🔄 Refresh Preview", variant="secondary")
+
+ # Data preview table
+ data_preview = gr.DataFrame(
+ headers=["Title", "Content Preview", "Word Count", "Quality Score", "URL"],
+ label="Dataset Preview",
+ interactive=False
+ )
+
+ with gr.Column(scale=1):
+ gr.HTML("""
+ <div class="workflow-card">
+ <h3>📈 Dataset Statistics</h3>
+ </div>
+ """)
+
+ dataset_stats = gr.JSON(label="Statistics")
+
+ # Tab 5: Export
+ with gr.Tab("📤 Export Dataset", id="export"):
+ gr.HTML('<div class="step-header"><div class="step-number">5</div>Export Your Dataset</div>')

+ with gr.Row():
+ with gr.Column(scale=2):
+ gr.HTML("""
+ <div class="workflow-card">
+ <h3>💾 Export Options</h3>
+ <p>Export your dataset in various formats for different ML frameworks and platforms.</p>
+ </div>
+ """)
+
+ # Export format selection
+ export_format = gr.Radio(
+ choices=[
+ ("🤗 HuggingFace Datasets", "huggingface_datasets"),
+ ("📄 JSON", "json"),
+ ("📊 CSV", "csv"),
+ ("📋 JSONL", "jsonl"),
+ ("⚡ Parquet", "parquet")
+ ],
+ label="Export Format",
+ value="json"
+ )
+
+ # Template for export
+ export_template = gr.Dropdown(
+ choices=[
+ "text_classification",
+ "sentiment_analysis",
+ "named_entity_recognition",
+ "question_answering",
+ "summarization"
+ ],
+ label="Dataset Template",
+ value="text_classification"
+ )
+
+ export_btn = gr.Button("📤 Export Dataset", variant="primary", size="lg")
+
+ # Export results
+ export_status = gr.Markdown("")
+ export_file = gr.File(label="Download Dataset", visible=False)
+
+ with gr.Column(scale=1):
+ gr.HTML("""
+ <div class="workflow-card">
+ <h3>📋 Export Formats</h3>
+ <div class="feature-item">
+ <h4>🤗 HuggingFace</h4>
+ <p>Ready for transformers library</p>
+ </div>
+ <div class="feature-item">
+ <h4>📄 JSON/JSONL</h4>
+ <p>Universal format for any framework</p>
+ </div>
+ <div class="feature-item">
+ <h4>📊 CSV</h4>
+ <p>Easy analysis in Excel/Pandas</p>
+ </div>
+ </div>
+ """)
+
+ # Event handlers
+ def create_project(name, template):
+ """Create new project"""
+ if not name.strip():
+ return "❌ Please enter a project name", {}
+
+ project = studio.start_new_project(name.strip(), template)
+ status = f"""
+ ✅ **Project Created Successfully!**
+
+ **Project:** {project['name']}
+ **Type:** {template.replace('_', ' ').title()}
+ **ID:** {project['id'][:8]}...
+ **Created:** {project['created_at'][:19]}
+
+ 👉 **Next Step:** Go to the Data Collection tab to start scraping URLs
+ """
+ return status, project
+
+ def scrape_urls_handler(urls_text, urls_file, project, progress=gr.Progress()):
+ """Handle URL scraping"""
+ if not project:
+ return "❌ Please create a project first", ""
+
+ # Process URLs from text input or file
+ urls = []
+ if urls_text:
+ urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
+ elif urls_file:
+ # Handle file upload (simplified)
+ try:
+ content = urls_file.read().decode('utf-8')
+ urls = [url.strip() for url in content.split('\n') if url.strip()]
+ except Exception:
+ return "❌ Error reading uploaded file", ""
+
+ if not urls:
+ return "❌ No URLs provided", ""
+
+ # Progress callback
+ def progress_callback(pct, msg):
+ progress(pct, desc=msg)
+
+ # Scrape URLs
+ success_count, errors = studio.scrape_urls(urls, progress_callback)
+
+ if success_count > 0:
+ stats_html = f"""
+ <div class="stat-card">
+ <h3>✅ Scraping Complete</h3>
+ <p><strong>{success_count}</strong> items collected</p>
+ <p><strong>{len(urls) - success_count}</strong> failed</p>
+ </div>
+ """

+ status = f"""
+ **Scraping Complete!**
+
+ **Successfully scraped:** {success_count} URLs
+ **Failed:** {len(urls) - success_count} URLs
+
+ 👉 **Next Step:** Go to Data Processing tab to clean and enhance your data
+ """
+
+ return status, stats_html
+ else:
+ return f"❌ Scraping failed: {', '.join(errors)}", ""
+
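The file-upload branch in `scrape_urls_handler` is explicitly simplified: depending on the Gradio version, `urls_file` may arrive as a temp-file object or as a plain file path, and the CSV-with-a-`url`-column case promised in the upload tab is never parsed. A minimal, more defensive loader might look like the sketch below (the helper name `load_urls_from_upload` is illustrative, not part of this commit):

```python
from pathlib import Path

import pandas as pd


def load_urls_from_upload(uploaded) -> list:
    """Read URLs from an uploaded .txt (one per line) or .csv (with a 'url' column)."""
    # Gradio may pass a temp-file wrapper (with a .name attribute) or a plain path string.
    path = Path(getattr(uploaded, "name", uploaded))

    if path.suffix.lower() == ".csv":
        df = pd.read_csv(path)
        if "url" not in df.columns:
            raise ValueError("CSV upload must contain a 'url' column")
        raw_urls = df["url"].dropna().astype(str).tolist()
    else:
        raw_urls = path.read_text(encoding="utf-8").splitlines()

    return [u.strip() for u in raw_urls if u.strip()]
```

The `elif urls_file:` branch could then call this helper inside its existing try/except and keep the same error message on failure.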
+ def process_data_handler(clean_text, quality_filter, detect_language,
+ add_sentiment, extract_entities, deduplicate, project):
+ """Handle data processing"""
+ if not project:
+ return "❌ Please create a project first", ""
+
+ if not studio.scraped_items:
+ return "❌ No scraped data to process. Please scrape URLs first.", ""
+
+ # Configure processing options
+ options = {
+ 'clean_text': clean_text,
+ 'quality_filter': quality_filter,
+ 'detect_language': detect_language,
+ 'add_sentiment': add_sentiment,
+ 'extract_entities': extract_entities,
+ 'deduplicate': deduplicate
+ }
+
+ # Process data
+ processed_count = studio.process_data(options)
+
+ if processed_count > 0:
+ stats = studio.get_data_statistics()
+ stats_html = f"""
+ <div class="stat-card">
+ <h3>⚙️ Processing Complete</h3>
+ <p><strong>{processed_count}</strong> items processed</p>
+ <p>Avg Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
+ <p>Avg Words: <strong>{stats.get('avg_word_count', 0)}</strong></p>
+ </div>
+ """
+
+ status = f"""
+ ✅ **Processing Complete!**
+
+ **Processed items:** {processed_count}
+ **Average quality score:** {stats.get('avg_quality_score', 0)}
+ **Average word count:** {stats.get('avg_word_count', 0)}
+
+ 👉 **Next Step:** Check the Data Preview tab to review your dataset
+ """
+
+ return status, stats_html
+ else:
+ return "❌ No items passed processing filters", ""
+
+ def refresh_preview_handler(project):
+ """Refresh data preview"""
+ if not project:
+ return None, {}
+
+ preview_data = studio.get_data_preview()
+ stats = studio.get_data_statistics()
+
+ if preview_data:
+ # Convert to DataFrame format
+ df_data = []
+ for item in preview_data:
+ df_data.append([
+ item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
+ item['content_preview'],
+ item['word_count'],
+ item['quality_score'],
+ item['url'][:50] + "..." if len(item['url']) > 50 else item['url']
+ ])
+
+ return df_data, stats
+
+ return None, {}
+
+ def export_dataset_handler(export_format, export_template, project):
+ """Handle dataset export"""
+ if not project:
+ return "❌ Please create a project first", None
+
+ if not studio.processed_items and not studio.scraped_items:
+ return "❌ No data to export. Please scrape and process data first.", None
+
+ try:
+ # Export dataset
+ filename = studio.export_dataset(export_template, export_format)
+
+ status = f"""
+ ✅ **Export Successful!**
+
+ **Format:** {export_format}
+ **Template:** {export_template.replace('_', ' ').title()}
+ **File:** {filename}
+
+ 📥 **Download your dataset using the link below**
+ """
+
+ return status, filename
+
+ except Exception as e:
+ return f"❌ Export failed: {str(e)}", None
+
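For the HuggingFace-oriented export formats, a JSON/JSONL file produced by `export_dataset_handler` can typically be loaded back with the `datasets` library. A small usage sketch (assuming the export was saved as `dataset.jsonl`):

```python
from datasets import load_dataset

# JSON and JSONL exports load directly as a Dataset split.
ds = load_dataset("json", data_files="dataset.jsonl", split="train")
print(ds)      # column names and row count
print(ds[0])   # first record
```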
+ # Connect event handlers
+ create_project_btn.click(
+ fn=create_project,
+ inputs=[project_name, template_choice],
+ outputs=[project_status, project_state]
+ )

  scrape_btn.click(
+ fn=scrape_urls_handler,
+ inputs=[urls_input, urls_file, project_state],
+ outputs=[scraping_status, collection_stats]
+ )
+
+ process_btn.click(
+ fn=process_data_handler,
+ inputs=[clean_text, quality_filter, detect_language,
+ add_sentiment, extract_entities, deduplicate, project_state],
+ outputs=[processing_status, processing_stats]
+ )
+
+ refresh_preview_btn.click(
+ fn=refresh_preview_handler,
+ inputs=[project_state],
+ outputs=[data_preview, dataset_stats]
  )

  export_btn.click(
+ fn=export_dataset_handler,
+ inputs=[export_format, export_template, project_state],
+ outputs=[export_status, export_file]
  )

+ # Auto-refresh preview when processing completes
+ processing_status.change(
+ fn=refresh_preview_handler,
+ inputs=[project_state],
+ outputs=[data_preview, dataset_stats]
  )

  return interface

  # Launch the application
  if __name__ == "__main__":
+ logger.info("🚀 Starting AI Dataset Studio...")
+
+ # Check available features
+ features = []
+ if HAS_TRANSFORMERS:
+ features.append("✅ AI Models")
+ else:
+ features.append("⚠️ Basic Processing")
+
+ if HAS_NLTK:
+ features.append("✅ Advanced NLP")
+ else:
+ features.append("⚠️ Basic NLP")
+
+ if HAS_DATASETS:
+ features.append("✅ HuggingFace Integration")
+ else:
+ features.append("⚠️ Standard Export Only")
+
+ logger.info(f"📊 Features: {' | '.join(features)}")
+
+ try:
+ interface = create_modern_interface()
+ logger.info("✅ Interface created successfully")
+
+ interface.launch(
+ server_name="0.0.0.0",
+ server_port=7860,
+ share=False,
+ show_error=True,
+ debug=False
+ )
+
+ except Exception as e:
+ logger.error(f"❌ Failed to launch application: {e}")
+ raise
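The `HAS_TRANSFORMERS`, `HAS_NLTK`, and `HAS_DATASETS` flags checked above are presumably set near the top of `app.py` by guarded imports; the usual pattern is roughly the following sketch (not the commit's exact code):

```python
# Optional dependencies: record availability instead of failing at import time.
try:
    from transformers import pipeline  # noqa: F401
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

try:
    import nltk  # noqa: F401
    HAS_NLTK = True
except ImportError:
    HAS_NLTK = False

try:
    from datasets import Dataset  # noqa: F401
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False
```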