🤖 AI-Powered Web Scraper
Professional content extraction and summarization for journalists, analysts, and researchers
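The Space wraps everything in a Gradio UI, but the extraction and summarization classes defined in the `app.py` source below can also be driven directly from Python. The following is a minimal sketch, assuming the file is saved as `app.py` and importable from the working directory; the article URL is only a placeholder.

```python
# Minimal sketch: using the scraper classes from app.py without the Gradio UI.
# Assumes app.py (shown below) is on the Python path; the URL is a placeholder.
from app import ContentExtractor, AISummarizer

extractor = ContentExtractor()
summarizer = AISummarizer()  # loads the BART summarization model (or its fallback)

scraped = extractor.extract_content("https://example.com/some-article")
scraped.summary = summarizer.summarize(scraped.content, max_length=150)

print(scraped.title)
print(f"{scraped.word_count} words, ~{scraped.reading_time} min read")
print(scraped.summary)
```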
""" AI-Powered Web Scraper - app.py Professional-grade web content extraction and AI summarization tool for Hugging Face Spaces """ import gradio as gr import requests from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse import pandas as pd from datetime import datetime import json import re import time from typing import List, Dict, Optional, Tuple import logging from pathlib import Path import os from dataclasses import dataclass from transformers import pipeline import nltk from nltk.tokenize import sent_tokenize import asyncio import aiohttp from concurrent.futures import ThreadPoolExecutor import hashlib # Download required NLTK data try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt', quiet=True) # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @dataclass class ScrapedContent: """Data class for scraped content with metadata""" url: str title: str content: str summary: str word_count: int reading_time: int extracted_at: str author: Optional[str] = None publish_date: Optional[str] = None meta_description: Optional[str] = None keywords: List[str] = None class SecurityValidator: """Security validation for URLs and content""" ALLOWED_SCHEMES = {'http', 'https'} BLOCKED_DOMAINS = { 'localhost', '127.0.0.1', '0.0.0.0', '192.168.', '10.', '172.16.', '172.17.', '172.18.', '172.19.', '172.20.', '172.21.', '172.22.', '172.23.', '172.24.', '172.25.', '172.26.', '172.27.', '172.28.', '172.29.', '172.30.', '172.31.' } @classmethod def validate_url(cls, url: str) -> Tuple[bool, str]: """Validate URL for security concerns""" try: parsed = urlparse(url) # Check scheme if parsed.scheme not in cls.ALLOWED_SCHEMES: return False, f"Invalid scheme: {parsed.scheme}. Only HTTP/HTTPS allowed." # Check for blocked domains hostname = parsed.hostname or '' if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS): return False, "Access to internal/local networks is not allowed." # Basic malformed URL check if not parsed.netloc: return False, "Invalid URL format." return True, "URL is valid." 
except Exception as e: return False, f"URL validation error: {str(e)}" class RobotsTxtChecker: """Check robots.txt compliance""" @staticmethod def can_fetch(url: str, user_agent: str = "*") -> bool: """Check if URL can be fetched according to robots.txt""" try: parsed_url = urlparse(url) robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" response = requests.get(robots_url, timeout=5) if response.status_code == 200: # Simple robots.txt parsing (basic implementation) lines = response.text.split('\n') user_agent_section = False for line in lines: line = line.strip() if line.startswith('User-agent:'): agent = line.split(':', 1)[1].strip() user_agent_section = agent == '*' or agent.lower() == user_agent.lower() elif user_agent_section and line.startswith('Disallow:'): disallowed = line.split(':', 1)[1].strip() if disallowed and url.endswith(disallowed): return False return True except Exception: # If robots.txt can't be fetched, assume allowed return True class ContentExtractor: """Advanced content extraction with multiple strategies""" def __init__(self): self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', }) def extract_content(self, url: str) -> Optional[ScrapedContent]: """Extract content from URL with robust error handling""" try: # Security validation is_valid, validation_msg = SecurityValidator.validate_url(url) if not is_valid: raise ValueError(f"Security validation failed: {validation_msg}") # Check robots.txt if not RobotsTxtChecker.can_fetch(url): raise ValueError("robots.txt disallows scraping this URL") # Fetch content response = self.session.get(url, timeout=15) response.raise_for_status() # Parse HTML soup = BeautifulSoup(response.content, 'html.parser') # Extract metadata title = self._extract_title(soup) author = self._extract_author(soup) publish_date = self._extract_publish_date(soup) meta_description = self._extract_meta_description(soup) # Extract main content content = self._extract_main_content(soup) if not content or len(content.strip()) < 100: raise ValueError("Insufficient content extracted") # Calculate metrics word_count = len(content.split()) reading_time = max(1, word_count // 200) # Average reading speed # Extract keywords keywords = self._extract_keywords(content) return ScrapedContent( url=url, title=title, content=content, summary="", # Will be filled by AI summarizer word_count=word_count, reading_time=reading_time, extracted_at=datetime.now().isoformat(), author=author, publish_date=publish_date, meta_description=meta_description, keywords=keywords ) except Exception as e: logger.error(f"Content extraction failed for {url}: {str(e)}") raise def _extract_title(self, soup: BeautifulSoup) -> str: """Extract page title with fallbacks""" # Try meta og:title first og_title = soup.find('meta', property='og:title') if og_title and og_title.get('content'): return og_title['content'].strip() # Try regular title tag title_tag = soup.find('title') if title_tag: return title_tag.get_text().strip() # Try h1 as fallback h1_tag = soup.find('h1') if h1_tag: return h1_tag.get_text().strip() return "No title found" def _extract_author(self, soup: BeautifulSoup) -> Optional[str]: """Extract author information""" # Try multiple selectors for author author_selectors = 
[ 'meta[name="author"]', 'meta[property="article:author"]', '.author', '.byline', '[rel="author"]' ] for selector in author_selectors: element = soup.select_one(selector) if element: if element.name == 'meta': return element.get('content', '').strip() else: return element.get_text().strip() return None def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]: """Extract publication date""" date_selectors = [ 'meta[property="article:published_time"]', 'meta[name="publishdate"]', 'time[datetime]', '.publish-date', '.date' ] for selector in date_selectors: element = soup.select_one(selector) if element: if element.name == 'meta': return element.get('content', '').strip() elif element.name == 'time': return element.get('datetime', '').strip() else: return element.get_text().strip() return None def _extract_meta_description(self, soup: BeautifulSoup) -> Optional[str]: """Extract meta description""" meta_desc = soup.find('meta', attrs={'name': 'description'}) if meta_desc: return meta_desc.get('content', '').strip() og_desc = soup.find('meta', property='og:description') if og_desc: return og_desc.get('content', '').strip() return None def _extract_main_content(self, soup: BeautifulSoup) -> str: """Extract main content with multiple strategies""" # Remove unwanted elements for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement', '.ads', '.sidebar']): element.decompose() # Try content-specific selectors first content_selectors = [ 'article', 'main', '.content', '.post-content', '.entry-content', '.article-body', '#content', '.story-body' ] for selector in content_selectors: element = soup.select_one(selector) if element: text = element.get_text(separator=' ', strip=True) if len(text) > 200: # Minimum content threshold return self._clean_text(text) # Fallback: extract from body body = soup.find('body') if body: text = body.get_text(separator=' ', strip=True) return self._clean_text(text) # Last resort: all text return self._clean_text(soup.get_text(separator=' ', strip=True)) def _clean_text(self, text: str) -> str: """Clean extracted text""" # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Remove common unwanted patterns text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE) text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE) text = re.sub(r'Advertisement', '', text, flags=re.IGNORECASE) return text.strip() def _extract_keywords(self, content: str) -> List[str]: """Extract basic keywords from content""" # Simple keyword extraction (can be enhanced with NLP) words = re.findall(r'\b[A-Za-z]{4,}\b', content.lower()) word_freq = {} for word in words: if word not in ['that', 'this', 'with', 'from', 'they', 'have', 'been', 'were', 'said']: word_freq[word] = word_freq.get(word, 0) + 1 # Return top 10 keywords sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) return [word for word, freq in sorted_words[:10]] class AISummarizer: """AI-powered content summarization""" def __init__(self): self.summarizer = None self._load_model() def _load_model(self): """Load summarization model with error handling""" try: self.summarizer = pipeline( "summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn" ) logger.info("Summarization model loaded successfully") except Exception as e: logger.error(f"Failed to load summarization model: {e}") # Fallback to a smaller model try: self.summarizer = pipeline( "summarization", model="sshleifer/distilbart-cnn-12-6" ) logger.info("Fallback 
summarization model loaded") except Exception as e2: logger.error(f"Failed to load fallback model: {e2}") self.summarizer = None def summarize(self, content: str, max_length: int = 300) -> str: """Generate AI summary of content""" if not self.summarizer: return self._extractive_summary(content) try: # Split content into chunks if too long max_input_length = 1024 chunks = self._split_content(content, max_input_length) summaries = [] for chunk in chunks: if len(chunk.split()) < 20: # Skip very short chunks continue result = self.summarizer( chunk, max_length=min(max_length, len(chunk.split()) // 2), min_length=30, do_sample=False ) summaries.append(result[0]['summary_text']) # Combine summaries combined = ' '.join(summaries) # If still too long, summarize again if len(combined.split()) > max_length: result = self.summarizer( combined, max_length=max_length, min_length=50, do_sample=False ) return result[0]['summary_text'] return combined except Exception as e: logger.error(f"AI summarization failed: {e}") return self._extractive_summary(content) def _split_content(self, content: str, max_length: int) -> List[str]: """Split content into manageable chunks""" sentences = sent_tokenize(content) chunks = [] current_chunk = [] current_length = 0 for sentence in sentences: sentence_length = len(sentence.split()) if current_length + sentence_length > max_length and current_chunk: chunks.append(' '.join(current_chunk)) current_chunk = [sentence] current_length = sentence_length else: current_chunk.append(sentence) current_length += sentence_length if current_chunk: chunks.append(' '.join(current_chunk)) return chunks def _extractive_summary(self, content: str) -> str: """Fallback extractive summarization""" sentences = sent_tokenize(content) if len(sentences) <= 3: return content # Simple extractive approach: take first, middle, and last sentences summary_sentences = [ sentences[0], sentences[len(sentences) // 2], sentences[-1] ] return ' '.join(summary_sentences) class WebScraperApp: """Main application class""" def __init__(self): self.extractor = ContentExtractor() self.summarizer = AISummarizer() self.scraped_data = [] def process_url(self, url: str, summary_length: int = 300) -> Tuple[str, str, str, str]: """Process a single URL and return results""" try: if not url.strip(): return "โ Error", "Please enter a valid URL", "", "" # Add protocol if missing if not url.startswith(('http://', 'https://')): url = 'https://' + url # Extract content with gr.update(): # Show progress scraped_content = self.extractor.extract_content(url) # Generate summary summary = self.summarizer.summarize(scraped_content.content, summary_length) scraped_content.summary = summary # Store result self.scraped_data.append(scraped_content) # Format results metadata = f""" **๐ Content Analysis** - **Title:** {scraped_content.title} - **Author:** {scraped_content.author or 'Not found'} - **Published:** {scraped_content.publish_date or 'Not found'} - **Word Count:** {scraped_content.word_count:,} - **Reading Time:** {scraped_content.reading_time} minutes - **Extracted:** {scraped_content.extracted_at} """ keywords_text = f"**๐ท๏ธ Keywords:** {', '.join(scraped_content.keywords[:10])}" if scraped_content.keywords else "" return ( "โ Success", metadata, f"**๐ AI Summary ({len(summary.split())} words):**\n\n{summary}", keywords_text ) except Exception as e: error_msg = f"Failed to process URL: {str(e)}" logger.error(error_msg) return "โ Error", error_msg, "", "" def export_data(self, format_type: str) -> str: """Export scraped data 
to file""" if not self.scraped_data: return "No data to export" try: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if format_type == "CSV": filename = f"scraped_data_{timestamp}.csv" df = pd.DataFrame([ { 'URL': item.url, 'Title': item.title, 'Author': item.author, 'Published': item.publish_date, 'Word Count': item.word_count, 'Reading Time': item.reading_time, 'Summary': item.summary, 'Keywords': ', '.join(item.keywords) if item.keywords else '', 'Extracted At': item.extracted_at } for item in self.scraped_data ]) df.to_csv(filename, index=False) elif format_type == "JSON": filename = f"scraped_data_{timestamp}.json" data = [ { 'url': item.url, 'title': item.title, 'content': item.content, 'summary': item.summary, 'metadata': { 'author': item.author, 'publish_date': item.publish_date, 'word_count': item.word_count, 'reading_time': item.reading_time, 'keywords': item.keywords, 'extracted_at': item.extracted_at } } for item in self.scraped_data ] with open(filename, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) return filename except Exception as e: logger.error(f"Export failed: {e}") return f"Export failed: {str(e)}" def clear_data(self) -> str: """Clear all scraped data""" self.scraped_data.clear() return "Data cleared successfully" def create_interface(): """Create the Gradio interface""" app = WebScraperApp() # Custom CSS for professional appearance custom_css = """ .gradio-container { max-width: 1200px; margin: auto; } .main-header { text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem; } .feature-box { background: #f8f9fa; border: 1px solid #e9ecef; border-radius: 8px; padding: 1.5rem; margin: 1rem 0; } .status-success { color: #28a745; font-weight: bold; } .status-error { color: #dc3545; font-weight: bold; } """ with gr.Blocks(css=custom_css, title="AI Web Scraper") as interface: # Header gr.HTML("""
            <div class="main-header">
                <h1>🤖 AI-Powered Web Scraper</h1>
                <p>Professional content extraction and summarization for journalists, analysts, and researchers</p>