# smart_web_analyzer.py
"""
Smart Web Analyzer Plus - Core Functionality

Features:
- Web content fetching with custom User-Agent (to avoid 403 errors)
- Basic HTML cleaning (no removal of script/style)
- Summarization using "facebook/bart-large-cnn"
- Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
- Topic detection via zero-shot classification ("facebook/bart-large-mnli")
- Preview text for display
"""

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# 1) Summarization Pipeline
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception as e:
    summarizer = None
    print("Error loading summarization model:", e)

# 2) Sentiment Analysis Pipeline
try:
    sentiment_analyzer = pipeline(
        "sentiment-analysis",
        model="nlptown/bert-base-multilingual-uncased-sentiment",
    )
except Exception as e:
    sentiment_analyzer = None
    print("Error loading sentiment analysis model:", e)

# 3) Zero-Shot Topic Detection Pipeline
try:
    zero_shot_classifier = pipeline(
        "zero-shot-classification", model="facebook/bart-large-mnli"
    )
except Exception as e:
    zero_shot_classifier = None
    print("Error loading topic detection model:", e)

def fetch_web_content(url):
    """
    Fetches the HTML content of a given URL, using a spoofed User-Agent.

    Parameters:
        url (str): The URL to fetch.

    Returns:
        str: HTML content if successful.

    Raises:
        ValueError: if the URL is invalid.
        Exception: if the request fails (network error, 4xx/5xx, etc.).
    """
    # Validate input URL
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")

    # Spoof a common browser User-Agent to reduce 403 errors
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4xx or 5xx
        return response.text
    except requests.exceptions.RequestException as e:
        # Chain the original requests exception for easier debugging
        raise Exception(f"Error fetching the URL: {e}") from e
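
# Usage sketch for fetch_web_content (assumes network access; the URL below
# is a placeholder, not one from the original project):
#     html = fetch_web_content("https://example.com")
#     print(html[:200])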

def clean_text(html_content):
    """
    Cleans HTML content to extract raw text (keeps <script> and <style>).

    Parameters:
        html_content (str): The raw HTML content.

    Returns:
        str: Cleaned text extracted from the HTML.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # NOTE: We are NOT removing <script> or <style> tags here:
    # for script_or_style in soup(["script", "style"]):
    #     script_or_style.decompose()
    text = soup.get_text(separator=" ")
    # Collapse runs of whitespace into single spaces
    cleaned_text = " ".join(text.split())
    return cleaned_text
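
# Illustration of clean_text (verifiable offline). Because <script>/<style>
# are intentionally kept, CSS/JS text survives cleaning:
#     clean_text("<p>Hello   <b>world</b></p>")           -> "Hello world"
#     clean_text("<style>p{color:red}</style><p>Hi</p>")  -> "p{color:red} Hi"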

def summarize_text(text, max_length=130, min_length=30):
    """
    Summarizes text using the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Maximum length for the summary.
        min_length (int): Minimum length for the summary.

    Returns:
        str: The summarized text or an error message.
    """
    if not summarizer:
        return "Summarization model is not available."
    try:
        # truncation=True keeps inputs within BART's 1024-token limit;
        # long web pages would otherwise raise an error
        summary_list = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return summary_list[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"

def analyze_sentiment(text):
    """
    Analyzes sentiment using nlptown/bert-base-multilingual-uncased-sentiment.

    Parameters:
        text (str): Text for sentiment analysis.

    Returns:
        str: A label describing sentiment (e.g., '4 stars') or an error message.
    """
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."
    try:
        # truncation=True keeps inputs within the model's 512-token limit
        results = sentiment_analyzer(text, truncation=True)
        # The pipeline returns a list of results; we grab the first
        label = results[0]["label"]
        return label
    except Exception as e:
        return f"Error during sentiment analysis: {e}"

def detect_topic(text):
    """
    Detects topics in text using zero-shot classification via facebook/bart-large-mnli.

    Parameters:
        text (str): The text to analyze.

    Returns:
        dict: Topic labels mapped to confidence scores, or an
        {"error": ...} dict if classification is unavailable or fails.
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}
    # Example candidate labels
    candidate_labels = [
        "Politics", "Technology", "Business", "Entertainment",
        "Science", "Health", "Sports", "Education",
    ]
    try:
        result = zero_shot_classifier(text, candidate_labels)
        # result['labels'] is sorted by confidence;
        # map each label to its corresponding score
        topics = dict(zip(result["labels"], result["scores"]))
        return topics
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}

def preview_clean_text(text, max_chars=500):
    """
    Returns a preview slice of the cleaned text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters in the preview.

    Returns:
        str: The truncated text plus ellipsis if it's longer than max_chars.
    """
    if len(text) > max_chars:
        return text[:max_chars] + "..."
    return text
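
# Minimal end-to-end sketch. Assumes network access, that the three pipelines
# above loaded successfully, and that the placeholder URL is reachable; swap
# in any article URL to try it.
if __name__ == "__main__":
    demo_url = "https://example.com"  # placeholder, not from the original project
    try:
        html = fetch_web_content(demo_url)
        text = clean_text(html)
        print("Preview:", preview_clean_text(text, max_chars=300))
        print("Summary:", summarize_text(text))
        print("Sentiment:", analyze_sentiment(text))
        print("Topics:", detect_topic(text))
    except Exception as exc:
        print("Demo failed:", exc)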
# End of smart_web_analyzer.py