"""
AI-Powered Web Scraper - app.py

Professional-grade web content extraction and AI summarization tool for Hugging Face Spaces.
"""

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
from datetime import datetime
import json
import re
import time
from typing import List, Dict, Optional, Tuple
import logging
from pathlib import Path
import os
from dataclasses import dataclass
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
import hashlib

# Make sure the NLTK Punkt sentence tokenizer is available.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class ScrapedContent:
    """Data class for scraped content with metadata."""
    url: str
    title: str
    content: str
    summary: str
    word_count: int
    reading_time: int
    extracted_at: str
    author: Optional[str] = None
    publish_date: Optional[str] = None
    meta_description: Optional[str] = None
    keywords: Optional[List[str]] = None


class SecurityValidator:
    """Security validation for URLs and content."""

    ALLOWED_SCHEMES = {'http', 'https'}
    BLOCKED_DOMAINS = {
        'localhost', '127.0.0.1', '0.0.0.0',
        '192.168.', '10.', '172.16.', '172.17.',
        '172.18.', '172.19.', '172.20.', '172.21.',
        '172.22.', '172.23.', '172.24.', '172.25.',
        '172.26.', '172.27.', '172.28.', '172.29.',
        '172.30.', '172.31.'
    }

    @classmethod
    def validate_url(cls, url: str) -> Tuple[bool, str]:
        """Validate URL for security concerns."""
        try:
            parsed = urlparse(url)

            if parsed.scheme not in cls.ALLOWED_SCHEMES:
                return False, f"Invalid scheme: {parsed.scheme}. Only HTTP/HTTPS allowed."

            hostname = parsed.hostname or ''
            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
                return False, "Access to internal/local networks is not allowed."

            if not parsed.netloc:
                return False, "Invalid URL format."

            return True, "URL is valid."

        except Exception as e:
            return False, f"URL validation error: {str(e)}"
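

# Illustrative sketch, not wired into the app: the substring blocklist above can miss
# private addresses written in other notations and can match unrelated public hostnames.
# For IP-literal hosts, the standard-library `ipaddress` module gives a stricter check;
# hostnames that resolve to private addresses would still need a DNS lookup, which this
# sketch deliberately leaves out. The helper name is hypothetical.
def _ip_literal_is_internal(hostname: str) -> bool:
    """Return True if hostname is an IP literal in a private, loopback, or link-local range."""
    import ipaddress
    try:
        address = ipaddress.ip_address(hostname)
    except ValueError:
        # Not an IP literal (e.g. a domain name); defer to SecurityValidator's blocklist.
        return False
    return address.is_private or address.is_loopback or address.is_link_local or address.is_reserved

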
class RobotsTxtChecker:
    """Check robots.txt compliance."""

    @staticmethod
    def can_fetch(url: str, user_agent: str = "*") -> bool:
        """Check if URL can be fetched according to robots.txt."""
        try:
            parsed_url = urlparse(url)
            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

            response = requests.get(robots_url, timeout=5)
            if response.status_code == 200:
                # Minimal robots.txt parsing: track the relevant User-agent section
                # and treat Disallow values as path prefixes.
                lines = response.text.split('\n')
                user_agent_section = False

                for line in lines:
                    line = line.strip()
                    if line.startswith('User-agent:'):
                        agent = line.split(':', 1)[1].strip()
                        user_agent_section = agent == '*' or agent.lower() == user_agent.lower()
                    elif user_agent_section and line.startswith('Disallow:'):
                        disallowed = line.split(':', 1)[1].strip()
                        if disallowed and (parsed_url.path or '/').startswith(disallowed):
                            return False

            return True

        except Exception:
            # If robots.txt cannot be fetched or parsed, err on the side of allowing the request.
            return True
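

# Illustrative alternative, not used by the app: the standard library already ships a
# robots.txt parser, so the manual parsing above could be delegated to urllib.robotparser.
# The helper name is hypothetical; behaviour on an unreadable robots.txt mirrors
# RobotsTxtChecker (allow the request).
def _robots_allows(url: str, user_agent: str = "*") -> bool:
    """Check a URL against robots.txt using urllib.robotparser."""
    from urllib.robotparser import RobotFileParser
    parsed = urlparse(url)
    parser = RobotFileParser(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    try:
        parser.read()
    except Exception:
        return True
    return parser.can_fetch(user_agent, url)

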
class ContentExtractor:
    """Advanced content extraction with multiple strategies."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def extract_content(self, url: str) -> Optional[ScrapedContent]:
        """Extract content from URL with robust error handling."""
        try:
            is_valid, validation_msg = SecurityValidator.validate_url(url)
            if not is_valid:
                raise ValueError(f"Security validation failed: {validation_msg}")

            if not RobotsTxtChecker.can_fetch(url):
                raise ValueError("robots.txt disallows scraping this URL")

            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            title = self._extract_title(soup)
            author = self._extract_author(soup)
            publish_date = self._extract_publish_date(soup)
            meta_description = self._extract_meta_description(soup)

            content = self._extract_main_content(soup)

            if not content or len(content.strip()) < 100:
                raise ValueError("Insufficient content extracted")

            word_count = len(content.split())
            # Reading-time estimate assumes roughly 200 words per minute.
            reading_time = max(1, word_count // 200)

            keywords = self._extract_keywords(content)

            return ScrapedContent(
                url=url,
                title=title,
                content=content,
                summary="",
                word_count=word_count,
                reading_time=reading_time,
                extracted_at=datetime.now().isoformat(),
                author=author,
                publish_date=publish_date,
                meta_description=meta_description,
                keywords=keywords
            )

        except Exception as e:
            logger.error(f"Content extraction failed for {url}: {str(e)}")
            raise

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title with fallbacks."""
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text().strip()

        return "No title found"

    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract author information."""
        author_selectors = [
            'meta[name="author"]',
            'meta[property="article:author"]',
            '.author',
            '.byline',
            '[rel="author"]'
        ]

        for selector in author_selectors:
            element = soup.select_one(selector)
            if element:
                if element.name == 'meta':
                    return element.get('content', '').strip()
                else:
                    return element.get_text().strip()

        return None

    def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract publication date."""
        date_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publishdate"]',
            'time[datetime]',
            '.publish-date',
            '.date'
        ]

        for selector in date_selectors:
            element = soup.select_one(selector)
            if element:
                if element.name == 'meta':
                    return element.get('content', '').strip()
                elif element.name == 'time':
                    return element.get('datetime', '').strip()
                else:
                    return element.get_text().strip()

        return None

    def _extract_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract meta description."""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            return meta_desc.get('content', '').strip()

        og_desc = soup.find('meta', property='og:description')
        if og_desc:
            return og_desc.get('content', '').strip()

        return None

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content with multiple strategies."""
        # Remove non-content elements; tag names and CSS classes need separate passes.
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()
        for element in soup.select('.advertisement, .ads, .sidebar'):
            element.decompose()

        content_selectors = [
            'article',
            'main',
            '.content',
            '.post-content',
            '.entry-content',
            '.article-body',
            '#content',
            '.story-body'
        ]

        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(separator=' ', strip=True)
                if len(text) > 200:
                    return self._clean_text(text)

        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
            return self._clean_text(text)

        return self._clean_text(soup.get_text(separator=' ', strip=True))

    def _clean_text(self, text: str) -> str:
        """Clean extracted text."""
        # Collapse whitespace, then strip common boilerplate phrases.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Advertisement', '', text, flags=re.IGNORECASE)

        return text.strip()

    def _extract_keywords(self, content: str) -> List[str]:
        """Extract basic keywords from content."""
        # Count words of four or more letters, skipping a small stop-word list.
        words = re.findall(r'\b[A-Za-z]{4,}\b', content.lower())
        word_freq = {}

        for word in words:
            if word not in ['that', 'this', 'with', 'from', 'they', 'have', 'been', 'were', 'said']:
                word_freq[word] = word_freq.get(word, 0) + 1

        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return [word for word, freq in sorted_words[:10]]


class AISummarizer:
    """AI-powered content summarization."""

    def __init__(self):
        self.summarizer = None
        self._load_model()

    def _load_model(self):
        """Load summarization model with error handling."""
        try:
            self.summarizer = pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                tokenizer="facebook/bart-large-cnn"
            )
            logger.info("Summarization model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load summarization model: {e}")
            # Fall back to a smaller distilled model before giving up entirely.
            try:
                self.summarizer = pipeline(
                    "summarization",
                    model="sshleifer/distilbart-cnn-12-6"
                )
                logger.info("Fallback summarization model loaded")
            except Exception as e2:
                logger.error(f"Failed to load fallback model: {e2}")
                self.summarizer = None

    def summarize(self, content: str, max_length: int = 300) -> str:
        """Generate AI summary of content."""
        if not self.summarizer:
            return self._extractive_summary(content)

        try:
            # The UI slider may deliver a float; the pipeline expects integer lengths.
            max_length = int(max_length)

            # Rough word-based proxy for the model's 1024-token input limit.
            max_input_length = 1024
            chunks = self._split_content(content, max_input_length)

            summaries = []
            for chunk in chunks:
                if len(chunk.split()) < 20:
                    continue

                # Keep the effective max_length comfortably above min_length for short chunks.
                chunk_max = min(max_length, max(60, len(chunk.split()) // 2))
                result = self.summarizer(
                    chunk,
                    max_length=chunk_max,
                    min_length=30,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])

            if not summaries:
                return self._extractive_summary(content)

            combined = ' '.join(summaries)

            # Re-summarize if the combined chunk summaries are still too long.
            if len(combined.split()) > max_length:
                result = self.summarizer(
                    combined,
                    max_length=max_length,
                    min_length=50,
                    do_sample=False
                )
                return result[0]['summary_text']

            return combined

        except Exception as e:
            logger.error(f"AI summarization failed: {e}")
            return self._extractive_summary(content)

    def _split_content(self, content: str, max_length: int) -> List[str]:
        """Split content into manageable chunks."""
        sentences = sent_tokenize(content)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence.split())
            if current_length + sentence_length > max_length and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _extractive_summary(self, content: str) -> str:
        """Fallback extractive summarization."""
        sentences = sent_tokenize(content)
        if len(sentences) <= 3:
            return content

        # Crude fallback: first, middle, and last sentence.
        summary_sentences = [
            sentences[0],
            sentences[len(sentences) // 2],
            sentences[-1]
        ]

        return ' '.join(summary_sentences)


class WebScraperApp:
    """Main application class."""

    def __init__(self):
        self.extractor = ContentExtractor()
        self.summarizer = AISummarizer()
        self.scraped_data = []

    def process_url(self, url: str, summary_length: int = 300) -> Tuple[str, str, str, str]:
        """Process a single URL and return results."""
        try:
            if not url.strip():
                return "❌ Error", "Please enter a valid URL", "", ""

            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            scraped_content = self.extractor.extract_content(url)

            summary = self.summarizer.summarize(scraped_content.content, summary_length)
            scraped_content.summary = summary

            self.scraped_data.append(scraped_content)

            metadata = f"""
**📊 Content Analysis**
- **Title:** {scraped_content.title}
- **Author:** {scraped_content.author or 'Not found'}
- **Published:** {scraped_content.publish_date or 'Not found'}
- **Word Count:** {scraped_content.word_count:,}
- **Reading Time:** {scraped_content.reading_time} minutes
- **Extracted:** {scraped_content.extracted_at}
"""

            keywords_text = f"**🏷️ Keywords:** {', '.join(scraped_content.keywords[:10])}" if scraped_content.keywords else ""

            return (
                "✅ Success",
                metadata,
                f"**📝 AI Summary ({len(summary.split())} words):**\n\n{summary}",
                keywords_text
            )

        except Exception as e:
            error_msg = f"Failed to process URL: {str(e)}"
            logger.error(error_msg)
            return "❌ Error", error_msg, "", ""

    def export_data(self, format_type: str) -> str:
        """Export scraped data to file."""
        if not self.scraped_data:
            return "No data to export"

        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            if format_type == "CSV":
                filename = f"scraped_data_{timestamp}.csv"
                df = pd.DataFrame([
                    {
                        'URL': item.url,
                        'Title': item.title,
                        'Author': item.author,
                        'Published': item.publish_date,
                        'Word Count': item.word_count,
                        'Reading Time': item.reading_time,
                        'Summary': item.summary,
                        'Keywords': ', '.join(item.keywords) if item.keywords else '',
                        'Extracted At': item.extracted_at
                    }
                    for item in self.scraped_data
                ])
                df.to_csv(filename, index=False)

            elif format_type == "JSON":
                filename = f"scraped_data_{timestamp}.json"
                data = [
                    {
                        'url': item.url,
                        'title': item.title,
                        'content': item.content,
                        'summary': item.summary,
                        'metadata': {
                            'author': item.author,
                            'publish_date': item.publish_date,
                            'word_count': item.word_count,
                            'reading_time': item.reading_time,
                            'keywords': item.keywords,
                            'extracted_at': item.extracted_at
                        }
                    }
                    for item in self.scraped_data
                ]
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)

            return filename

        except Exception as e:
            logger.error(f"Export failed: {e}")
            return f"Export failed: {str(e)}"

    def clear_data(self) -> str:
        """Clear all scraped data."""
        self.scraped_data.clear()
        return "Data cleared successfully"
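

# Illustrative, headless usage sketch (never called by the app): WebScraperApp can be driven
# without the Gradio UI, which is handy for quick local testing. The URL is a placeholder,
# and the first call will download the summarization model, so treat this as a sketch of the
# flow rather than part of the application.
def _example_headless_run(url: str = "https://example.com/article") -> None:
    scraper = WebScraperApp()
    status, metadata_md, summary_md, keywords_md = scraper.process_url(url, summary_length=200)
    print(status)
    print(metadata_md)
    print(summary_md)
    print(scraper.export_data("JSON"))  # writes scraped_data_<timestamp>.json

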
def create_interface():
    """Create the Gradio interface."""
    app = WebScraperApp()

    custom_css = """
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    .main-header {
        text-align: center;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 2rem;
        border-radius: 10px;
        margin-bottom: 2rem;
    }
    .feature-box {
        background: #f8f9fa;
        border: 1px solid #e9ecef;
        border-radius: 8px;
        padding: 1.5rem;
        margin: 1rem 0;
    }
    .status-success {
        color: #28a745;
        font-weight: bold;
    }
    .status-error {
        color: #dc3545;
        font-weight: bold;
    }
    """

    with gr.Blocks(css=custom_css, title="AI Web Scraper") as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>🤖 AI-Powered Web Scraper</h1>
            <p>Professional content extraction and summarization for journalists, analysts, and researchers</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                gr.HTML("<div class='feature-box'><h3>📡 Content Extraction</h3></div>")

                url_input = gr.Textbox(
                    label="Enter URL to scrape",
                    placeholder="https://example.com/article",
                    lines=1
                )

                with gr.Row():
                    summary_length = gr.Slider(
                        minimum=100,
                        maximum=500,
                        value=300,
                        step=50,
                        label="Summary Length (words)"
                    )

                scrape_btn = gr.Button("🚀 Extract & Summarize", variant="primary", size="lg")

                gr.HTML("<div class='feature-box'><h3>📊 Results</h3></div>")

                status_output = gr.Textbox(label="Status", lines=1, interactive=False)
                metadata_output = gr.Markdown(label="Metadata")
                summary_output = gr.Markdown(label="AI Summary")
                keywords_output = gr.Markdown(label="Keywords")

            with gr.Column(scale=1):
                gr.HTML("<div class='feature-box'><h3>💾 Export Options</h3></div>")

                export_format = gr.Radio(
                    choices=["CSV", "JSON"],
                    label="Export Format",
                    value="CSV"
                )

                export_btn = gr.Button("📥 Export Data", variant="secondary")
                export_status = gr.Textbox(label="Export Status", lines=2, interactive=False)

                gr.HTML("<div class='feature-box'><h3>🧹 Data Management</h3></div>")
                clear_btn = gr.Button("🗑️ Clear All Data", variant="secondary")
                clear_status = gr.Textbox(label="Clear Status", lines=1, interactive=False)

        with gr.Accordion("📖 Usage Instructions", open=False):
            gr.Markdown("""
            ### How to Use This Tool

            1. **Enter URL**: Paste the URL of the article or webpage you want to analyze
            2. **Adjust Settings**: Set your preferred summary length
            3. **Extract Content**: Click "Extract & Summarize" to process the content
            4. **Review Results**: View the extracted metadata, AI summary, and keywords
            5. **Export Data**: Save your results in CSV or JSON format

            ### Features
            - 🛡️ **Security**: Built-in URL validation and robots.txt compliance
            - 🤖 **AI Summarization**: Advanced BART model for intelligent summarization
            - 📊 **Rich Metadata**: Author, publication date, reading time, and more
            - 🏷️ **Keyword Extraction**: Automatic identification of key terms
            - 💾 **Export Options**: CSV and JSON formats for further analysis
            - 🔄 **Batch Processing**: Process multiple URLs and export all results

            ### Supported Content
            - News articles and blog posts
            - Research papers and reports
            - Documentation and guides
            - Most HTML-based content

            ### Limitations
            - Respects robots.txt restrictions
            - Cannot access password-protected content
            - Some dynamic content may not be captured
            - Processing time varies with content length
            """)

        scrape_btn.click(
            fn=app.process_url,
            inputs=[url_input, summary_length],
            outputs=[status_output, metadata_output, summary_output, keywords_output]
        )

        export_btn.click(
            fn=app.export_data,
            inputs=[export_format],
            outputs=[export_status]
        )

        clear_btn.click(
            fn=app.clear_data,
            outputs=[clear_status]
        )

    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )