# smart_web_analyzer.py
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, Pipeline
import torch
from typing import Dict, List, Optional
import logging
from functools import lru_cache

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class WebAnalyzer:
    def __init__(self):
        self.device = 0 if torch.cuda.is_available() else -1  # GPU if available, else CPU
        self._models: Dict[str, Optional[Pipeline]] = {
            'summarize': None,
            'sentiment': None,
            'topics': None
        }

    def _load_model(self, model_type: str) -> None:
        """Lazy-load a model the first time its mode is requested"""
        if self._models[model_type] is None:
            logger.info(f"Loading {model_type} model...")
            if model_type == 'summarize':
                self._models[model_type] = pipeline(
                    "summarization",
                    model="facebook/bart-large-cnn",
                    device=self.device
                )
            elif model_type == 'sentiment':
                self._models[model_type] = pipeline(
                    "text-classification",
                    model="nlptown/bert-base-multilingual-uncased-sentiment",
                    device=self.device
                )
            elif model_type == 'topics':
                self._models[model_type] = pipeline(
                    "zero-shot-classification",
                    model="facebook/bart-large-mnli",
                    device=self.device
                )
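
    # Note: the first call per mode downloads the model weights from the
    # Hugging Face Hub; later calls reuse the already-built pipeline:
    #   self._load_model('sentiment')  # builds the pipeline once
    #   self._load_model('sentiment')  # no-op thereafter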

    @lru_cache(maxsize=32)  # cache fetched pages so repeat analyses of a URL skip the network
    def fetch_content(self, url: str) -> str:
        """Fetch webpage content, caching results and raising ValueError on request failure"""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language': 'en-US,en;q=0.5'
        }
        try:
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching URL {url}: {str(e)}")
            raise ValueError(f"Failed to fetch content: {str(e)}")

    def clean_html(self, html: str) -> str:
        """Extract readable text content from HTML"""
        soup = BeautifulSoup(html, 'html.parser')
        # Remove non-content elements
        for tag in soup(["script", "style", "meta", "noscript"]):
            tag.decompose()
        # Extract text while preserving some structure
        text = soup.get_text(separator='\n', strip=True)
        # Clean up whitespace; split on double spaces so run-together phrases break apart
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text
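
    # e.g. clean_html('<p>Hello <b>world</b></p><script>track()</script>')
    # returns 'Hello\nworld' (script contents dropped, each text run on its own line)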

    def analyze(self, url: str, modes: List[str]) -> Dict:
        """Run the requested analysis modes on a URL, collecting per-mode results"""
        results = {}
        try:
            # Fetch and clean content
            html = self.fetch_content(url)
            cleaned_text = self.clean_html(html)
            results['clean_text'] = cleaned_text
            # Validate text length
            if len(cleaned_text.split()) < 10:
                raise ValueError("Insufficient text content found on page")
            # Crude character budgets to stay under each model's token limit
            summary_text = cleaned_text[:2048]        # BART accepts ~1024 tokens
            classification_text = cleaned_text[:512]  # BERT accepts 512 tokens
            for mode in modes:
                if mode not in self._models:
                    continue
                self._load_model(mode)
                if mode == 'summarize':
                    summary = self._models[mode](summary_text,
                                                 max_length=150,
                                                 min_length=30,
                                                 do_sample=False)[0]['summary_text']
                    results['summary'] = summary
                elif mode == 'sentiment':
                    sentiment = self._models[mode](classification_text)[0]
                    results['sentiment'] = f"{sentiment['label']} ({sentiment['score']:.2f})"
                elif mode == 'topics':
                    topics = self._models[mode](
                        classification_text,
                        candidate_labels=[
                            "Technology", "Artificial Intelligence",
                            "Business", "Science", "Politics",
                            "Health", "Environment", "Education"
                        ]
                    )
                    results['topics'] = {
                        topic: score
                        for topic, score in zip(topics['labels'], topics['scores'])
                        if score > 0.1  # Filter low-confidence topics
                    }
        except Exception as e:
            logger.error(f"Analysis error: {str(e)}")
            results['error'] = str(e)
        return results
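
# Minimal usage sketch (the URL below is illustrative):
#   analyzer = WebAnalyzer()
#   results = analyzer.analyze("https://example.com", ["summarize", "topics"])
#   print(results.get("summary") or results.get("error"))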


# app.py
import gradio as gr
from typing import Dict, Tuple

from smart_web_analyzer import WebAnalyzer

analyzer = WebAnalyzer()

def format_results(results: Dict) -> Tuple[str, str, str, str]:
    """Format analysis results as one string per Gradio tab, in component order"""
    if 'error' in results:
        return (f"❌ Error: {results['error']}", "", "", "")
    # Clean text tab (preview only)
    text_preview = results.get('clean_text', 'No text extracted')
    if len(text_preview) > 1000:
        text_preview = text_preview[:1000] + "... (truncated)"
    # Summary and sentiment tabs
    summary = f"**AI Summary:**\n{results['summary']}" if 'summary' in results else ""
    sentiment = f"**Sentiment Analysis:**\n{results['sentiment']}" if 'sentiment' in results else ""
    # Topics tab, sorted by confidence (highest first)
    if 'topics' in results:
        topic_lines = "\n".join(
            f"- **{k}**: {v:.1%}"
            for k, v in sorted(results['topics'].items(), key=lambda x: x[1], reverse=True)
        )
        topics = f"**Detected Topics:**\n{topic_lines}"
    else:
        topics = ""
    return (text_preview, summary, sentiment, topics)
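
# e.g. format_results({'clean_text': 'Hi there', 'sentiment': '4 stars (0.62)'})
# returns ('Hi there', '', '**Sentiment Analysis:**\n4 stars (0.62)', '')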

with gr.Blocks(title="Smart Web Analyzer Plus") as demo:
    gr.Markdown("# 🌐 Smart Web Analyzer Plus")
    gr.Markdown("Analyze web content with AI: extract summaries, sentiment, and topics.")
    with gr.Row():
        with gr.Column(scale=4):
            url_input = gr.Textbox(
                label="Enter URL",
                placeholder="https://example.com",
                show_label=True
            )
        with gr.Column(scale=2):
            modes = gr.CheckboxGroup(
                ["summarize", "sentiment", "topics"],
                label="Analysis Types",
                value=["summarize"]  # Default selection
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("Analyze", variant="primary")
    with gr.Tabs():
        with gr.Tab("📄 Clean Text"):
            clean_text = gr.Markdown()
        with gr.Tab("📝 Summary"):
            summary = gr.Markdown()
        with gr.Tab("💭 Sentiment"):
            sentiment = gr.Markdown()
        with gr.Tab("📊 Topics"):
            topics = gr.Markdown()
    # Example URLs
    gr.Examples(
        examples=[
            ["https://www.bbc.com/news/technology-67881954", ["summarize", "sentiment"]],
            ["https://arxiv.org/html/2312.17296v1", ["topics", "summarize"]]
        ],
        inputs=[url_input, modes]
    )
    # Handle submission
    submit_btn.click(
        fn=lambda url, m: format_results(analyzer.analyze(url, m)),
        inputs=[url_input, modes],
        outputs=[clean_text, summary, sentiment, topics],
        api_name="analyze"
    )
    # Disable the Analyze button while the URL box is empty
    url_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[url_input],
        outputs=[submit_btn]
    )

if __name__ == "__main__":
    demo.launch()
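
# requirements.txt (a plausible dependency list for this Space, inferred from the
# imports above; unpinned, since exact versions would be assumptions):
#   requests
#   beautifulsoup4
#   transformers
#   torch
#   gradio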