MagicMeWizard committed
Commit 35f9333 · verified · 1 Parent(s): 399a018

Create app.py

Files changed (1)
  1. app.py +701 -0
app.py ADDED
"""
AI-Powered Web Scraper - app.py
Professional-grade web content extraction and AI summarization tool for Hugging Face Spaces
"""

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
from datetime import datetime
import json
import re
import time
from typing import List, Dict, Optional, Tuple
import logging
from pathlib import Path
import os
from dataclasses import dataclass
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
import hashlib

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class ScrapedContent:
    """Data class for scraped content with metadata"""
    url: str
    title: str
    content: str
    summary: str
    word_count: int
    reading_time: int
    extracted_at: str
    author: Optional[str] = None
    publish_date: Optional[str] = None
    meta_description: Optional[str] = None
    keywords: Optional[List[str]] = None

class SecurityValidator:
    """Security validation for URLs and content"""

    ALLOWED_SCHEMES = {'http', 'https'}
    BLOCKED_DOMAINS = {
        'localhost', '127.0.0.1', '0.0.0.0',
        '192.168.', '10.', '172.16.', '172.17.',
        '172.18.', '172.19.', '172.20.', '172.21.',
        '172.22.', '172.23.', '172.24.', '172.25.',
        '172.26.', '172.27.', '172.28.', '172.29.',
        '172.30.', '172.31.'
    }

    @classmethod
    def validate_url(cls, url: str) -> Tuple[bool, str]:
        """Validate URL for security concerns"""
        try:
            parsed = urlparse(url)

            # Check scheme
            if parsed.scheme not in cls.ALLOWED_SCHEMES:
                return False, f"Invalid scheme: {parsed.scheme}. Only HTTP/HTTPS allowed."

            # Check for blocked domains
            hostname = parsed.hostname or ''
            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
                return False, "Access to internal/local networks is not allowed."

            # Basic malformed URL check
            if not parsed.netloc:
                return False, "Invalid URL format."

            return True, "URL is valid."

        except Exception as e:
            return False, f"URL validation error: {str(e)}"

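# Note: the prefix blocklist above covers the common private IPv4 ranges but not every
# internal address (for example 169.254.* link-local or IPv6 loopback). A stricter check
# could resolve the hostname and inspect the address with the standard library. A sketch,
# not wired into SecurityValidator:
#
#   import ipaddress, socket
#   addr = ipaddress.ip_address(socket.gethostbyname(parsed.hostname))
#   if addr.is_private or addr.is_loopback or addr.is_link_local:
#       ...reject the URL...
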
class RobotsTxtChecker:
    """Check robots.txt compliance"""

    @staticmethod
    def can_fetch(url: str, user_agent: str = "*") -> bool:
        """Check whether the URL may be fetched according to robots.txt"""
        try:
            parsed_url = urlparse(url)
            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

            response = requests.get(robots_url, timeout=5)
            if response.status_code == 200:
                # Simple robots.txt parsing (basic implementation)
                lines = response.text.split('\n')
                user_agent_section = False

                for line in lines:
                    line = line.strip()
                    if line.startswith('User-agent:'):
                        agent = line.split(':', 1)[1].strip()
                        user_agent_section = agent == '*' or agent.lower() == user_agent.lower()
                    elif user_agent_section and line.startswith('Disallow:'):
                        disallowed = line.split(':', 1)[1].strip()
                        # Disallow rules are path prefixes, not suffixes
                        if disallowed and parsed_url.path.startswith(disallowed):
                            return False

            return True

        except Exception:
            # If robots.txt can't be fetched, assume allowed
            return True

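# Note: the parser above is intentionally minimal (single Disallow prefix match per rule).
# The standard library offers a fuller implementation; a sketch, not wired in here:
#
#   from urllib.robotparser import RobotFileParser
#   rp = RobotFileParser(robots_url)
#   rp.read()
#   allowed = rp.can_fetch(user_agent, url)
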
class ContentExtractor:
    """Advanced content extraction with multiple strategies"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def extract_content(self, url: str) -> Optional[ScrapedContent]:
        """Extract content from URL with robust error handling"""
        try:
            # Security validation
            is_valid, validation_msg = SecurityValidator.validate_url(url)
            if not is_valid:
                raise ValueError(f"Security validation failed: {validation_msg}")

            # Check robots.txt
            if not RobotsTxtChecker.can_fetch(url):
                raise ValueError("robots.txt disallows scraping this URL")

            # Fetch content
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract metadata
            title = self._extract_title(soup)
            author = self._extract_author(soup)
            publish_date = self._extract_publish_date(soup)
            meta_description = self._extract_meta_description(soup)

            # Extract main content
            content = self._extract_main_content(soup)

            if not content or len(content.strip()) < 100:
                raise ValueError("Insufficient content extracted")

            # Calculate metrics
            word_count = len(content.split())
            reading_time = max(1, word_count // 200)  # Average reading speed

            # Extract keywords
            keywords = self._extract_keywords(content)

            return ScrapedContent(
                url=url,
                title=title,
                content=content,
                summary="",  # Will be filled by AI summarizer
                word_count=word_count,
                reading_time=reading_time,
                extracted_at=datetime.now().isoformat(),
                author=author,
                publish_date=publish_date,
                meta_description=meta_description,
                keywords=keywords
            )

        except Exception as e:
            logger.error(f"Content extraction failed for {url}: {str(e)}")
            raise

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title with fallbacks"""
        # Try meta og:title first
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        # Try regular title tag
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        # Try h1 as fallback
        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text().strip()

        return "No title found"

    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract author information"""
        # Try multiple selectors for author
        author_selectors = [
            'meta[name="author"]',
            'meta[property="article:author"]',
            '.author',
            '.byline',
            '[rel="author"]'
        ]

        for selector in author_selectors:
            element = soup.select_one(selector)
            if element:
                if element.name == 'meta':
                    return element.get('content', '').strip()
                else:
                    return element.get_text().strip()

        return None

    def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract publication date"""
        date_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publishdate"]',
            'time[datetime]',
            '.publish-date',
            '.date'
        ]

        for selector in date_selectors:
            element = soup.select_one(selector)
            if element:
                if element.name == 'meta':
                    return element.get('content', '').strip()
                elif element.name == 'time':
                    return element.get('datetime', '').strip()
                else:
                    return element.get_text().strip()

        return None

    def _extract_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract meta description"""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            return meta_desc.get('content', '').strip()

        og_desc = soup.find('meta', property='og:description')
        if og_desc:
            return og_desc.get('content', '').strip()

        return None

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content with multiple strategies"""
        # Remove unwanted elements: boilerplate tags first, then CSS-selected ad/sidebar blocks
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()
        for element in soup.select('.ads, .sidebar, .advertisement'):
            element.decompose()

        # Try content-specific selectors first
        content_selectors = [
            'article',
            'main',
            '.content',
            '.post-content',
            '.entry-content',
            '.article-body',
            '#content',
            '.story-body'
        ]

        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(separator=' ', strip=True)
                if len(text) > 200:  # Minimum content threshold
                    return self._clean_text(text)

        # Fallback: extract from body
        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
            return self._clean_text(text)

        # Last resort: all text
        return self._clean_text(soup.get_text(separator=' ', strip=True))

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common unwanted patterns
        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Advertisement', '', text, flags=re.IGNORECASE)

        return text.strip()

    def _extract_keywords(self, content: str) -> List[str]:
        """Extract basic keywords from content"""
        # Simple frequency-based keyword extraction (can be enhanced with NLP)
        words = re.findall(r'\b[A-Za-z]{4,}\b', content.lower())
        word_freq = {}

        for word in words:
            if word not in ['that', 'this', 'with', 'from', 'they', 'have', 'been', 'were', 'said']:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Return top 10 keywords
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return [word for word, freq in sorted_words[:10]]

class AISummarizer:
    """AI-powered content summarization"""

    def __init__(self):
        self.summarizer = None
        self._load_model()

    def _load_model(self):
        """Load summarization model with error handling"""
        try:
            self.summarizer = pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                tokenizer="facebook/bart-large-cnn"
            )
            logger.info("Summarization model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load summarization model: {e}")
            # Fall back to a smaller model
            try:
                self.summarizer = pipeline(
                    "summarization",
                    model="sshleifer/distilbart-cnn-12-6"
                )
                logger.info("Fallback summarization model loaded")
            except Exception as e2:
                logger.error(f"Failed to load fallback model: {e2}")
                self.summarizer = None

    def summarize(self, content: str, max_length: int = 300) -> str:
        """Generate an AI summary of the content"""
        if not self.summarizer:
            return self._extractive_summary(content)

        try:
            # Split content into chunks if too long for the model
            max_input_length = 1024
            chunks = self._split_content(content, max_input_length)

            summaries = []
            for chunk in chunks:
                if len(chunk.split()) < 20:  # Skip very short chunks
                    continue

                result = self.summarizer(
                    chunk,
                    max_length=min(max_length, len(chunk.split()) // 2),
                    min_length=30,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])

            # Combine chunk summaries
            combined = ' '.join(summaries)

            # If still too long, summarize again
            if len(combined.split()) > max_length:
                result = self.summarizer(
                    combined,
                    max_length=max_length,
                    min_length=50,
                    do_sample=False
                )
                return result[0]['summary_text']

            return combined

        except Exception as e:
            logger.error(f"AI summarization failed: {e}")
            return self._extractive_summary(content)

    def _split_content(self, content: str, max_length: int) -> List[str]:
        """Split content into manageable chunks"""
        sentences = sent_tokenize(content)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence.split())
            if current_length + sentence_length > max_length and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _extractive_summary(self, content: str) -> str:
        """Fallback extractive summarization"""
        sentences = sent_tokenize(content)
        if len(sentences) <= 3:
            return content

        # Simple extractive approach: take the first, middle, and last sentences
        summary_sentences = [
            sentences[0],
            sentences[len(sentences) // 2],
            sentences[-1]
        ]

        return ' '.join(summary_sentences)

class WebScraperApp:
    """Main application class"""

    def __init__(self):
        self.extractor = ContentExtractor()
        self.summarizer = AISummarizer()
        self.scraped_data = []

    def process_url(self, url: str, summary_length: int = 300) -> Tuple[str, str, str, str]:
        """Process a single URL and return results"""
        try:
            if not url.strip():
                return "❌ Error", "Please enter a valid URL", "", ""

            # Add protocol if missing
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            # Extract content
            scraped_content = self.extractor.extract_content(url)

            # Generate summary
            summary = self.summarizer.summarize(scraped_content.content, summary_length)
            scraped_content.summary = summary

            # Store result
            self.scraped_data.append(scraped_content)

            # Format results
            metadata = f"""
**📊 Content Analysis**
- **Title:** {scraped_content.title}
- **Author:** {scraped_content.author or 'Not found'}
- **Published:** {scraped_content.publish_date or 'Not found'}
- **Word Count:** {scraped_content.word_count:,}
- **Reading Time:** {scraped_content.reading_time} minutes
- **Extracted:** {scraped_content.extracted_at}
"""

            keywords_text = f"**🏷️ Keywords:** {', '.join(scraped_content.keywords[:10])}" if scraped_content.keywords else ""

            return (
                "✅ Success",
                metadata,
                f"**📝 AI Summary ({len(summary.split())} words):**\n\n{summary}",
                keywords_text
            )

        except Exception as e:
            error_msg = f"Failed to process URL: {str(e)}"
            logger.error(error_msg)
            return "❌ Error", error_msg, "", ""

    def export_data(self, format_type: str) -> str:
        """Export scraped data to a file"""
        if not self.scraped_data:
            return "No data to export"

        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            if format_type == "CSV":
                filename = f"scraped_data_{timestamp}.csv"
                df = pd.DataFrame([
                    {
                        'URL': item.url,
                        'Title': item.title,
                        'Author': item.author,
                        'Published': item.publish_date,
                        'Word Count': item.word_count,
                        'Reading Time': item.reading_time,
                        'Summary': item.summary,
                        'Keywords': ', '.join(item.keywords) if item.keywords else '',
                        'Extracted At': item.extracted_at
                    }
                    for item in self.scraped_data
                ])
                df.to_csv(filename, index=False)

            elif format_type == "JSON":
                filename = f"scraped_data_{timestamp}.json"
                data = [
                    {
                        'url': item.url,
                        'title': item.title,
                        'content': item.content,
                        'summary': item.summary,
                        'metadata': {
                            'author': item.author,
                            'publish_date': item.publish_date,
                            'word_count': item.word_count,
                            'reading_time': item.reading_time,
                            'keywords': item.keywords,
                            'extracted_at': item.extracted_at
                        }
                    }
                    for item in self.scraped_data
                ]
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)

            return filename

        except Exception as e:
            logger.error(f"Export failed: {e}")
            return f"Export failed: {str(e)}"

    def clear_data(self) -> str:
        """Clear all scraped data"""
        self.scraped_data.clear()
        return "Data cleared successfully"

def create_interface():
    """Create the Gradio interface"""
    app = WebScraperApp()

    # Custom CSS for professional appearance
    custom_css = """
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    .main-header {
        text-align: center;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 2rem;
        border-radius: 10px;
        margin-bottom: 2rem;
    }
    .feature-box {
        background: #f8f9fa;
        border: 1px solid #e9ecef;
        border-radius: 8px;
        padding: 1.5rem;
        margin: 1rem 0;
    }
    .status-success {
        color: #28a745;
        font-weight: bold;
    }
    .status-error {
        color: #dc3545;
        font-weight: bold;
    }
    """
    with gr.Blocks(css=custom_css, title="AI Web Scraper") as interface:

        # Header
        gr.HTML("""
        <div class="main-header">
            <h1>🤖 AI-Powered Web Scraper</h1>
            <p>Professional content extraction and summarization for journalists, analysts, and researchers</p>
        </div>
        """)

        # Main interface
        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.HTML("<div class='feature-box'><h3>📑 Content Extraction</h3></div>")

                url_input = gr.Textbox(
                    label="Enter URL to scrape",
                    placeholder="https://example.com/article",
                    lines=1
                )

                with gr.Row():
                    summary_length = gr.Slider(
                        minimum=100,
                        maximum=500,
                        value=300,
                        step=50,
                        label="Summary Length (words)"
                    )

                scrape_btn = gr.Button("🚀 Extract & Summarize", variant="primary", size="lg")

                # Results section
                gr.HTML("<div class='feature-box'><h3>📊 Results</h3></div>")

                status_output = gr.Textbox(label="Status", lines=1, interactive=False)
                metadata_output = gr.Markdown(label="Metadata")
                summary_output = gr.Markdown(label="AI Summary")
                keywords_output = gr.Markdown(label="Keywords")

            with gr.Column(scale=1):
                # Export section
                gr.HTML("<div class='feature-box'><h3>💾 Export Options</h3></div>")

                export_format = gr.Radio(
                    choices=["CSV", "JSON"],
                    label="Export Format",
                    value="CSV"
                )

                export_btn = gr.Button("📥 Export Data", variant="secondary")
                export_status = gr.Textbox(label="Export Status", lines=2, interactive=False)

                gr.HTML("<div class='feature-box'><h3>🧹 Data Management</h3></div>")
                clear_btn = gr.Button("🗑️ Clear All Data", variant="secondary")
                clear_status = gr.Textbox(label="Clear Status", lines=1, interactive=False)

        # Usage instructions
        with gr.Accordion("📚 Usage Instructions", open=False):
            gr.Markdown("""
            ### How to Use This Tool

            1. **Enter URL**: Paste the URL of the article or webpage you want to analyze
            2. **Adjust Settings**: Set your preferred summary length
            3. **Extract Content**: Click "Extract & Summarize" to process the content
            4. **Review Results**: View the extracted metadata, AI summary, and keywords
            5. **Export Data**: Save your results in CSV or JSON format

            ### Features
            - 🛡️ **Security**: Built-in URL validation and robots.txt compliance
            - 🤖 **AI Summarization**: BART-based abstractive summarization with a lighter fallback model
            - 📊 **Rich Metadata**: Author, publication date, reading time, and more
            - 🏷️ **Keyword Extraction**: Automatic identification of key terms
            - 💾 **Export Options**: CSV and JSON formats for further analysis
            - 🔄 **Batch Processing**: Process multiple URLs one after another and export all results together

            ### Supported Content
            - News articles and blog posts
            - Research papers and reports
            - Documentation and guides
            - Most HTML-based content

            ### Limitations
            - Respects robots.txt restrictions
            - Cannot access password-protected content
            - Some dynamic content may not be captured
            - Processing time varies with content length
            """)

        # Event handlers
        scrape_btn.click(
            fn=app.process_url,
            inputs=[url_input, summary_length],
            outputs=[status_output, metadata_output, summary_output, keywords_output]
        )

        export_btn.click(
            fn=app.export_data,
            inputs=[export_format],
            outputs=[export_status]
        )

        clear_btn.click(
            fn=app.clear_data,
            outputs=[clear_status]
        )

    return interface

# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
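For reference, a minimal sketch of driving the classes above outside the Gradio UI. It assumes this file is importable as app, the summarization model downloads succeed, and the target page is public; the URL below is purely illustrative:

from app import ContentExtractor, AISummarizer

extractor = ContentExtractor()
summarizer = AISummarizer()

# Extract a single page, then attach an AI summary (illustrative URL).
scraped = extractor.extract_content("https://example.com/article")
scraped.summary = summarizer.summarize(scraped.content, max_length=200)

print(scraped.title)
print(scraped.word_count, "words,", scraped.reading_time, "min read")
print(scraped.summary)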