# smart-web-analyzer-plus / smart_web_analyzer.py
# (Hugging Face repo page header: uploaded by MHamdan — "Update files", commit 8cc50db, verified)
# smart_web_analyzer.py
"""
Smart Web Analyzer Plus - Core Functionality
Features:
- Web content fetching with custom User-Agent (to avoid 403 errors)
- Basic HTML cleaning (no removal of script/style)
- Summarization using "facebook/bart-large-cnn"
- Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
- Topic detection via zero-shot classification ("facebook/bart-large-mnli")
- Preview text for display
"""
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
def _load_pipeline(task, model, error_prefix):
    """
    Load a transformers pipeline, returning None (and printing) on failure.

    Parameters:
        task (str): Pipeline task name (e.g. "summarization").
        model (str): Model identifier on the Hugging Face Hub.
        error_prefix (str): Message printed before the exception on failure.

    Returns:
        Pipeline or None: The loaded pipeline, or None if loading failed.
    """
    try:
        return pipeline(task, model=model)
    except Exception as e:
        # Model download/load can fail (no network, missing weights, OOM);
        # degrade gracefully so the module still imports.
        print(error_prefix, e)
        return None

# 1) Summarization Pipeline
summarizer = _load_pipeline(
    "summarization", "facebook/bart-large-cnn",
    "Error loading summarization model:",
)
# 2) Sentiment Analysis Pipeline
sentiment_analyzer = _load_pipeline(
    "sentiment-analysis", "nlptown/bert-base-multilingual-uncased-sentiment",
    "Error loading sentiment analysis model:",
)
# 3) Zero-Shot Topic Detection Pipeline
zero_shot_classifier = _load_pipeline(
    "zero-shot-classification", "facebook/bart-large-mnli",
    "Error loading topic detection model:",
)
def fetch_web_content(url):
    """
    Fetches the HTML content of a given URL, using a spoofed User-Agent.

    Parameters:
        url (str): The URL to fetch. Must start with http:// or https://.

    Returns:
        str: HTML content if successful.

    Raises:
        ValueError: if the URL does not start with http:// or https://.
        Exception: if the request fails (network error, 4xx/5xx, timeout, etc.).
    """
    # Validate input URL (startswith accepts a tuple of allowed prefixes)
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")
    # Spoof a common browser User-Agent to reduce 403 errors from
    # sites that block the default python-requests agent.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
        return response.text
    except requests.exceptions.RequestException as e:
        # Normalize all requests-library failures into one exception type,
        # chaining the cause so the original traceback is preserved.
        raise Exception(f"Error fetching the URL: {e}") from e
def clean_text(html_content):
    """
    Extracts raw text from HTML content.

    Note that <script> and <style> contents are intentionally kept
    (the removal step is left commented out below).

    Parameters:
        html_content (str): The raw HTML content.

    Returns:
        str: Text extracted from the HTML with whitespace collapsed
             to single spaces.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    # NOTE: We are NOT removing <script> or <style> tags here:
    # for script_or_style in parsed(["script", "style"]):
    #     script_or_style.decompose()
    raw = parsed.get_text(separator=" ")
    # split() with no args drops all runs of whitespace, so joining
    # with single spaces collapses and strips in one step.
    return " ".join(raw.split())
def summarize_text(text, max_length=130, min_length=30):
    """
    Summarizes text using the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Maximum length for the summary.
        min_length (int): Minimum length for the summary.

    Returns:
        str: The summarized text or an error message.
    """
    if not summarizer:
        return "Summarization model is not available."
    # Guard against empty input, which would otherwise surface as a
    # cryptic model error string.
    if not text or not text.strip():
        return "No text provided for summarization."
    try:
        # truncation=True: BART-CNN accepts at most 1024 tokens; web pages
        # routinely exceed that, so truncate instead of erroring out.
        summary_list = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return summary_list[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
def analyze_sentiment(text):
    """
    Analyzes sentiment using nlptown/bert-base-multilingual-uncased-sentiment.

    Parameters:
        text (str): Text for sentiment analysis.

    Returns:
        str: A label describing sentiment (e.g., '4 stars') or an error message.
    """
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."
    try:
        # truncation=True: the BERT model accepts at most 512 tokens;
        # full web pages would otherwise raise a length error.
        results = sentiment_analyzer(text, truncation=True)
        # Typically returns a list of results; we grab the first
        label = results[0]["label"]
        return label
    except Exception as e:
        return f"Error during sentiment analysis: {e}"
def detect_topic(text, candidate_labels=None):
    """
    Detects topics in text using zero-shot classification via facebook/bart-large-mnli.

    Parameters:
        text (str): The text to analyze.
        candidate_labels (list[str] | None): Optional custom topic labels.
            Defaults to a fixed set of eight general news/web topics.

    Returns:
        dict: Mapping of topic label -> confidence score on success,
              or {"error": <message>} on failure.
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}
    # Default candidate labels; callers may override with their own set.
    if candidate_labels is None:
        candidate_labels = [
            "Politics", "Technology", "Business", "Entertainment",
            "Science", "Health", "Sports", "Education",
        ]
    try:
        result = zero_shot_classifier(text, candidate_labels)
        # result['labels'] are sorted by confidence;
        # map each label to its corresponding score.
        topics = {
            label: score for label, score
            in zip(result["labels"], result["scores"])
        }
        return topics
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}
def preview_clean_text(text, max_chars=500):
    """
    Returns a preview slice of the cleaned text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters in the preview.

    Returns:
        str: The text unchanged if it fits within max_chars, otherwise
             the first max_chars characters followed by an ellipsis.
    """
    needs_truncation = len(text) > max_chars
    return f"{text[:max_chars]}..." if needs_truncation else text
# End of smart_web_analyzer.py