# smart_web_analyzer.py
import logging
from functools import lru_cache
from typing import Dict, List, Optional

import requests
import torch
from bs4 import BeautifulSoup
from transformers import Pipeline, pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebAnalyzer:
    def __init__(self):
        self.device = 0 if torch.cuda.is_available() else -1
        self._models: Dict[str, Optional[Pipeline]] = {
            'summarize': None,
            'sentiment': None,
            'topics': None
        }

    def _load_model(self, model_type: str) -> None:
        """Lazily load a model the first time it is needed."""
        if self._models[model_type] is None:
            logger.info(f"Loading {model_type} model...")
            if model_type == 'summarize':
                self._models[model_type] = pipeline(
                    "summarization",
                    model="facebook/bart-large-cnn",
                    device=self.device
                )
            elif model_type == 'sentiment':
                self._models[model_type] = pipeline(
                    "text-classification",
                    model="nlptown/bert-base-multilingual-uncased-sentiment",
                    device=self.device
                )
            elif model_type == 'topics':
                self._models[model_type] = pipeline(
                    "zero-shot-classification",
                    model="facebook/bart-large-mnli",
                    device=self.device
                )

    # lru_cache keys on (self, url); fine for a single long-lived analyzer instance
    @lru_cache(maxsize=100)
    def fetch_content(self, url: str) -> str:
        """Fetch webpage content with caching and explicit error handling."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language': 'en-US,en;q=0.5'
        }
        try:
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching URL {url}: {e}")
            raise ValueError(f"Failed to fetch content: {e}") from e

    def clean_html(self, html: str) -> str:
        """Extract readable text content from HTML."""
        soup = BeautifulSoup(html, 'html.parser')

        # Remove script, style and other non-content elements
        for element in soup(["script", "style", "meta", "noscript"]):
            element.decompose()

        # Extract text while preserving some structure
        text = soup.get_text(separator='\n', strip=True)

        # Clean up whitespace: strip each line, break on double spaces, drop blanks
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text

    def analyze(self, url: str, modes: List[str]) -> Dict:
        """Fetch a page and run each requested analysis mode, collecting results."""
        results = {}
        try:
            # Fetch and clean content
            html = self.fetch_content(url)
            cleaned_text = self.clean_html(html)
            results['clean_text'] = cleaned_text

            # Validate text length
            if len(cleaned_text.split()) < 10:
                raise ValueError("Insufficient text content found on page")

            # Rough character caps to keep inputs within each model's token limit
            summary_text = cleaned_text[:2048]        # BART accepts ~1024 tokens
            classification_text = cleaned_text[:512]  # BERT accepts 512 tokens

            for mode in modes:
                if mode not in self._models:
                    continue

                self._load_model(mode)

                if mode == 'summarize':
                    summary = self._models[mode](
                        summary_text,
                        max_length=150,
                        min_length=30,
                        do_sample=False
                    )[0]['summary_text']
                    results['summary'] = summary

                elif mode == 'sentiment':
                    sentiment = self._models[mode](classification_text)[0]
                    results['sentiment'] = f"{sentiment['label']} ({sentiment['score']:.2f})"

                elif mode == 'topics':
                    topics = self._models[mode](
                        classification_text,
                        candidate_labels=[
                            "Technology", "Artificial Intelligence", "Business",
                            "Science", "Politics", "Health",
                            "Environment", "Education"
                        ]
                    )
                    results['topics'] = {
                        topic: score
                        for topic, score in zip(topics['labels'], topics['scores'])
                        if score > 0.1  # Filter low-confidence topics
                    }

        except Exception as e:
logger.error(f"Analysis error: {str(e)}") results['error'] = str(e) return results # app.py import gradio as gr from smart_web_analyzer import WebAnalyzer analyzer = WebAnalyzer() def format_results(results: Dict) -> Dict: """Format analysis results for Gradio tabs""" outputs = {} if 'error' in results: return { "📜 Clean Text": f"❌ Error: {results['error']}", "📝 Summary": "", "🎭 Sentiment": "", "📊 Topics": "" } # Clean text tab text_preview = results.get('clean_text', 'No text extracted') if len(text_preview) > 1000: text_preview = text_preview[:1000] + "...(truncated)" outputs["📜 Clean Text"] = text_preview # Summary tab if 'summary' in results: outputs["📝 Summary"] = f"**AI Summary:**\n{results['summary']}" else: outputs["📝 Summary"] = "" # Sentiment tab if 'sentiment' in results: outputs["🎭 Sentiment"] = f"**Sentiment Analysis:**\n{results['sentiment']}" else: outputs["🎭 Sentiment"] = "" # Topics tab if 'topics' in results: topics = "\n".join([ f"- **{k}**: {v:.1%}" for k,v in sorted(results['topics'].items(), key=lambda x: x[1], reverse=True) ]) outputs["📊 Topics"] = f"**Detected Topics:**\n{topics}" else: outputs["📊 Topics"] = "" return outputs with gr.Blocks(title="Smart Web Analyzer Plus") as demo: gr.Markdown("# 🌐 Smart Web Analyzer Plus") gr.Markdown("Analyze web content with AI - extract summaries, sentiment, and topics.") with gr.Row(): with gr.Column(scale=4): url_input = gr.Textbox( label="Enter URL", placeholder="https://example.com", show_label=True ) with gr.Column(scale=2): modes = gr.CheckboxGroup( ["summarize", "sentiment", "topics"], label="Analysis Types", value=["summarize"] # Default selection ) with gr.Column(scale=1): submit_btn = gr.Button("Analyze", variant="primary") with gr.Tabs() as tabs: text_tab = gr.Tab("📜 Clean Text") with text_tab: clean_text = gr.Markdown() summary_tab = gr.Tab("📝 Summary") with summary_tab: summary = gr.Markdown() sentiment_tab = gr.Tab("🎭 Sentiment") with sentiment_tab: sentiment = gr.Markdown() topics_tab = gr.Tab("📊 Topics") with topics_tab: topics = gr.Markdown() # Example URLs examples = gr.Examples( examples=[ ["https://www.bbc.com/news/technology-67881954", ["summarize", "sentiment"]], ["https://arxiv.org/html/2312.17296v1", ["topics", "summarize"]] ], inputs=[url_input, modes] ) # Handle submission submit_btn.click( fn=lambda url, m: format_results(analyzer.analyze(url, m)), inputs=[url_input, modes], outputs=[clean_text, summary, sentiment, topics], api_name="analyze" ) # Error handling for empty URL url_input.change( fn=lambda x: gr.update(interactive=bool(x.strip())), inputs=[url_input], outputs=[submit_btn] ) if __name__ == "__main__": demo.launch()