MagicMeWizard committed on
Commit
e199fcf · verified · 1 Parent(s): 6d85bb5

Create app_minimal.py

Files changed (1)
  1. app_minimal.py +514 -0
app_minimal.py ADDED
@@ -0,0 +1,514 @@
"""
AI Dataset Studio - Minimal Version
Guaranteed to work with basic dependencies only
"""

import gradio as gr
import pandas as pd
import json
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime
import logging
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
import uuid
import time

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class SimpleScrapedItem:
    """Simplified scraped content structure"""
    id: str
    url: str
    title: str
    content: str
    word_count: int
    scraped_at: str
    quality_score: float = 0.0

class SimpleWebScraper:
    """Simplified web scraper with basic functionality"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        })

    def scrape_url(self, url: str) -> Optional[SimpleScrapedItem]:
        """Scrape a single URL"""
        try:
            if not self._validate_url(url):
                return None

            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract title
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else "Untitled"

            # Extract content
            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'header', 'footer']):
                element.decompose()

            # Try to find main content
            content_element = (soup.find('article') or
                               soup.find('main') or
                               soup.find(class_='content') or
                               soup.find('body'))

            if content_element:
                content = content_element.get_text(separator=' ', strip=True)
            else:
                content = soup.get_text(separator=' ', strip=True)

            # Clean content
            content = re.sub(r'\s+', ' ', content).strip()

            # Calculate basic metrics
            word_count = len(content.split())
            quality_score = min(1.0, word_count / 100) if word_count > 0 else 0.0

            return SimpleScrapedItem(
                id=str(uuid.uuid4()),
                url=url,
                title=title,
                content=content,
                word_count=word_count,
                scraped_at=datetime.now().isoformat(),
                quality_score=quality_score
            )

        except Exception as e:
            logger.error(f"Failed to scrape {url}: {e}")
            return None

    def _validate_url(self, url: str) -> bool:
        """Basic URL validation"""
        try:
            parsed = urlparse(url)
            return parsed.scheme in ['http', 'https'] and bool(parsed.netloc)
        except Exception:
            return False

    def batch_scrape(self, urls: List[str], progress_callback=None) -> List[SimpleScrapedItem]:
        """Scrape multiple URLs"""
        results = []
        total = len(urls)

        for i, url in enumerate(urls):
            if progress_callback:
                progress_callback((i + 1) / total, f"Scraping {i+1}/{total}")

            item = self.scrape_url(url)
            if item:
                results.append(item)

            time.sleep(1)  # Rate limiting

        return results

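# Illustrative standalone use of the scraper (the URL is a placeholder):
#
#     scraper = SimpleWebScraper()
#     item = scraper.scrape_url("https://example.com/article")
#     if item:
#         print(item.title, item.word_count, item.quality_score)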

class SimpleDataProcessor:
    """Basic data processing"""

    def process_items(self, items: List[SimpleScrapedItem], options: Dict[str, bool]) -> List[SimpleScrapedItem]:
        """Process scraped items"""
        processed = []

        for item in items:
            # Apply quality filter
            if options.get('quality_filter', True) and item.quality_score < 0.3:
                continue

            # Clean text if requested
            if options.get('clean_text', True):
                item.content = self._clean_text(item.content)

            processed.append(item)

        return processed

    def _clean_text(self, text: str) -> str:
        """Basic text cleaning"""
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove common navigation text
        text = re.sub(r'(Click here|Read more|Subscribe|Advertisement)', '', text, flags=re.IGNORECASE)
        return text.strip()

class SimpleExporter:
    """Basic export functionality"""

    def export_dataset(self, items: List[SimpleScrapedItem], format_type: str) -> str:
        """Export dataset"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if format_type == "json":
            filename = f"dataset_{timestamp}.json"
            data = [asdict(item) for item in items]
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            return filename

        elif format_type == "csv":
            filename = f"dataset_{timestamp}.csv"
            data = [asdict(item) for item in items]
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False)
            return filename

        else:
            raise ValueError(f"Unsupported format: {format_type}")

class SimpleDatasetStudio:
    """Simplified main application"""

    def __init__(self):
        self.scraper = SimpleWebScraper()
        self.processor = SimpleDataProcessor()
        self.exporter = SimpleExporter()

        self.scraped_items = []
        self.processed_items = []
        self.current_project = None

    def create_project(self, name: str) -> Dict[str, Any]:
        """Create a new project"""
        self.current_project = {
            'name': name,
            'id': str(uuid.uuid4()),
            'created_at': datetime.now().isoformat()
        }
        self.scraped_items = []
        self.processed_items = []
        return self.current_project

    def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]:
        """Scrape URLs"""
        url_list = [url.strip() for url in urls if url.strip()]
        if not url_list:
            return 0, ["No valid URLs provided"]

        self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback)
        success_count = len(self.scraped_items)
        failed_count = len(url_list) - success_count

        errors = []
        if failed_count > 0:
            errors.append(f"{failed_count} URLs failed")

        return success_count, errors

    def process_data(self, options: Dict[str, bool]) -> int:
        """Process scraped data"""
        if not self.scraped_items:
            return 0

        self.processed_items = self.processor.process_items(self.scraped_items, options)
        return len(self.processed_items)

    def get_preview(self) -> List[Dict[str, Any]]:
        """Get data preview"""
        items = self.processed_items or self.scraped_items
        preview = []

        for item in items[:5]:
            preview.append({
                'Title': item.title[:50] + "..." if len(item.title) > 50 else item.title,
                'Content Preview': item.content[:100] + "..." if len(item.content) > 100 else item.content,
                'Word Count': item.word_count,
                'Quality Score': round(item.quality_score, 2),
                'URL': item.url[:50] + "..." if len(item.url) > 50 else item.url
            })

        return preview

    def get_stats(self) -> Dict[str, Any]:
        """Get dataset statistics"""
        items = self.processed_items or self.scraped_items
        if not items:
            return {}

        word_counts = [item.word_count for item in items]
        quality_scores = [item.quality_score for item in items]

        return {
            'total_items': len(items),
            'avg_word_count': round(sum(word_counts) / len(word_counts)),
            'avg_quality': round(sum(quality_scores) / len(quality_scores), 2),
            'min_words': min(word_counts),
            'max_words': max(word_counts)
        }

    def export_data(self, format_type: str) -> str:
        """Export dataset"""
        items = self.processed_items or self.scraped_items
        if not items:
            raise ValueError("No data to export")

        return self.exporter.export_dataset(items, format_type)

def create_simple_interface():
    """Create simplified Gradio interface"""

    studio = SimpleDatasetStudio()

    # Custom CSS
    css = """
    .container { max-width: 1200px; margin: auto; }
    .header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white; padding: 2rem; border-radius: 10px;
        text-align: center; margin-bottom: 2rem;
    }
    .step-box {
        background: #f8f9ff; border: 1px solid #e1e5ff;
        border-radius: 8px; padding: 1.5rem; margin: 1rem 0;
    }
    """

    with gr.Blocks(css=css, title="AI Dataset Studio - Simple") as interface:

        # Header
        gr.HTML("""
        <div class="header">
            <h1>🚀 AI Dataset Studio - Simple Version</h1>
            <p>Create datasets from web content - No complex setup required!</p>
        </div>
        """)

        # Project state
        project_state = gr.State({})

        with gr.Tabs():

            # Project Setup
            with gr.Tab("📋 Project Setup"):
                gr.HTML('<div class="step-box"><h3>Step 1: Create Your Project</h3></div>')

                project_name = gr.Textbox(
                    label="Project Name",
                    placeholder="e.g., News Articles Dataset",
                    value="My Dataset"
                )

                create_btn = gr.Button("Create Project", variant="primary")
                project_status = gr.Markdown("")

                def create_project_handler(name):
                    if not name.strip():
                        return "❌ Please enter a project name", {}

                    project = studio.create_project(name.strip())
                    status = f"""
                    ✅ **Project Created!**

                    **Name:** {project['name']}
                    **ID:** {project['id'][:8]}...
                    **Created:** {project['created_at'][:19]}

                    👉 Next: Go to Data Collection tab
                    """
                    return status, project

                create_btn.click(
                    fn=create_project_handler,
                    inputs=[project_name],
                    outputs=[project_status, project_state]
                )

            # Data Collection
            with gr.Tab("🕷️ Data Collection"):
                gr.HTML('<div class="step-box"><h3>Step 2: Scrape Web Content</h3></div>')

                urls_input = gr.Textbox(
                    label="URLs to Scrape (one per line)",
                    placeholder="https://example.com/article1\nhttps://example.com/article2",
                    lines=6
                )

                scrape_btn = gr.Button("Start Scraping", variant="primary")
                scrape_status = gr.Markdown("")

                def scrape_handler(urls_text, project, progress=gr.Progress()):
                    if not project:
                        return "❌ Create a project first"

                    urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
                    if not urls:
                        return "❌ No URLs provided"

                    def progress_callback(pct, msg):
                        progress(pct, desc=msg)

                    success_count, errors = studio.scrape_urls(urls, progress_callback)

                    if success_count > 0:
                        return f"""
                        ✅ **Scraping Complete!**

                        **Success:** {success_count} URLs
                        **Failed:** {len(urls) - success_count} URLs

                        👉 Next: Go to Data Processing tab
                        """
                    else:
                        return f"❌ Scraping failed: {', '.join(errors)}"

                scrape_btn.click(
                    fn=scrape_handler,
                    inputs=[urls_input, project_state],
                    outputs=[scrape_status]
                )

            # Data Processing
            with gr.Tab("⚙️ Data Processing"):
                gr.HTML('<div class="step-box"><h3>Step 3: Clean and Process Data</h3></div>')

                with gr.Row():
                    clean_text = gr.Checkbox(label="Clean Text", value=True)
                    quality_filter = gr.Checkbox(label="Quality Filter", value=True)

                process_btn = gr.Button("Process Data", variant="primary")
                process_status = gr.Markdown("")

                def process_handler(clean, quality, project):
                    if not project:
                        return "❌ Create a project first"

                    options = {
                        'clean_text': clean,
                        'quality_filter': quality
                    }

                    processed_count = studio.process_data(options)

                    if processed_count > 0:
                        return f"""
                        ✅ **Processing Complete!**

                        **Processed:** {processed_count} items

                        👉 Next: Check Data Preview tab
                        """
                    else:
                        return "❌ No items passed processing filters"

                process_btn.click(
                    fn=process_handler,
                    inputs=[clean_text, quality_filter, project_state],
                    outputs=[process_status]
                )

            # Data Preview
            with gr.Tab("👀 Data Preview"):
                gr.HTML('<div class="step-box"><h3>Step 4: Review Your Dataset</h3></div>')

                refresh_btn = gr.Button("Refresh Preview")
                preview_table = gr.DataFrame(label="Dataset Preview")
                stats_display = gr.JSON(label="Statistics")

                def refresh_handler(project):
                    if not project:
                        return None, {}

                    preview = studio.get_preview()
                    stats = studio.get_stats()
                    # Build a DataFrame so the preview renders with column headers
                    return pd.DataFrame(preview), stats

                refresh_btn.click(
                    fn=refresh_handler,
                    inputs=[project_state],
                    outputs=[preview_table, stats_display]
                )

            # Export
            with gr.Tab("📤 Export Dataset"):
                gr.HTML('<div class="step-box"><h3>Step 5: Export Your Dataset</h3></div>')

                export_format = gr.Radio(
                    choices=["JSON", "CSV"],
                    label="Export Format",
                    value="JSON"
                )

                export_btn = gr.Button("Export Dataset", variant="primary")
                export_status = gr.Markdown("")
                # Download target for the exported file
                export_file = gr.File(label="Download")

                def export_handler(format_type, project):
                    if not project:
                        return "❌ Create a project first", None

                    try:
                        filename = studio.export_data(format_type.lower())
                        return f"✅ Export successful! File: {filename}", filename
                    except Exception as e:
                        return f"❌ Export failed: {str(e)}", None

                export_btn.click(
                    fn=export_handler,
                    inputs=[export_format, project_state],
                    outputs=[export_status, export_file]
                )

        # Instructions
        with gr.Accordion("📚 Quick Guide", open=False):
            gr.Markdown("""
            ## How to Use

            1. **Create Project** - Give your dataset a name
            2. **Add URLs** - Paste URLs of web pages to scrape
            3. **Process Data** - Clean and filter the content
            4. **Review** - Check the quality of your dataset
            5. **Export** - Download in JSON or CSV format

            ## Features
            - ✅ Smart content extraction
            - ✅ Quality filtering
            - ✅ Text cleaning
            - ✅ JSON/CSV export
            - ✅ Preview and statistics

            ## Tips
            - Use high-quality source URLs
            - Enable quality filtering for better results
            - Review your data before exporting
            - Start with 5-10 URLs to test
            """)

    return interface

# Launch application
if __name__ == "__main__":
    logger.info("🚀 Starting AI Dataset Studio (Simple Version)")

    try:
        interface = create_simple_interface()
        logger.info("✅ Simple interface created successfully")

        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True
        )

    except Exception as e:
        logger.error(f"❌ Failed to launch: {e}")
        print("\n💡 If you see import errors, try installing:")
        print("pip install gradio pandas requests beautifulsoup4")
        raise
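
For reference, a minimal sketch of the same five-step workflow from the Quick Guide, driven from Python instead of the UI; the URLs are placeholders, everything else uses the classes defined in app_minimal.py:

    from app_minimal import SimpleDatasetStudio

    studio = SimpleDatasetStudio()
    studio.create_project("News Articles Dataset")
    success_count, errors = studio.scrape_urls([
        "https://example.com/article1",
        "https://example.com/article2",
    ])
    studio.process_data({'clean_text': True, 'quality_filter': True})
    print(studio.get_stats())
    print(studio.export_data("json"))  # writes dataset_<timestamp>.json and returns the filename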