# smart_web_analyzer.py
"""
Smart Web Analyzer Plus - Core Functionality
Features:
- Web content fetching with custom User-Agent (to avoid 403 errors)
- Basic HTML cleaning (no removal of script/style)
- Summarization using "facebook/bart-large-cnn"
- Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
- Topic detection via zero-shot classification ("facebook/bart-large-mnli")
- Preview text for display
"""
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
def _load_pipeline(task, model_name, error_prefix):
    """Build a transformers pipeline for *task*/*model_name*.

    Returns the pipeline, or None (after logging *error_prefix* and the
    exception) so the app can degrade gracefully instead of crashing at
    import time when a model fails to download or load.
    """
    try:
        return pipeline(task, model=model_name)
    except Exception as exc:
        print(error_prefix, exc)
        return None


# 1) Summarization Pipeline
summarizer = _load_pipeline(
    "summarization", "facebook/bart-large-cnn",
    "Error loading summarization model:",
)
# 2) Sentiment Analysis Pipeline
sentiment_analyzer = _load_pipeline(
    "sentiment-analysis", "nlptown/bert-base-multilingual-uncased-sentiment",
    "Error loading sentiment analysis model:",
)
# 3) Zero-Shot Topic Detection Pipeline
zero_shot_classifier = _load_pipeline(
    "zero-shot-classification", "facebook/bart-large-mnli",
    "Error loading topic detection model:",
)
def fetch_web_content(url):
    """
    Fetch the HTML content of a URL using a spoofed browser User-Agent.

    Parameters:
        url (str): The URL to fetch. Must start with http:// or https://.

    Returns:
        str: The raw HTML content of the page.

    Raises:
        ValueError: If the URL does not use an http(s) scheme.
        Exception: If the request fails (network error, timeout, 4xx/5xx).
    """
    # Validate the scheme up front so we fail fast with a clear message.
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")
    # Spoof a common desktop-browser User-Agent to reduce 403 rejections
    # from sites that block the default requests UA string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
        return response.text
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause stays visible
        # in tracebacks instead of being swallowed by the generic wrapper.
        raise Exception(f"Error fetching the URL: {e}") from e
def clean_text(html_content):
    """
    Extract raw text from HTML content.

    Note: <script> and <style> contents are intentionally NOT stripped,
    so their text appears in the output as well.

    Parameters:
        html_content (str): The raw HTML content.

    Returns:
        str: Whitespace-normalized text extracted from the HTML.
    """
    # Parse and pull out all text; scripts/styles are kept on purpose.
    raw_text = BeautifulSoup(html_content, "html.parser").get_text(separator=" ")
    # Normalize runs of whitespace (tabs, newlines, repeats) to single spaces.
    return " ".join(raw_text.split())
def summarize_text(text, max_length=130, min_length=30):
    """
    Summarize text with the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Maximum length for the summary.
        min_length (int): Minimum length for the summary.

    Returns:
        str: The summarized text, or an error message if the model is
             unavailable or the summarization call fails.
    """
    if not summarizer:
        return "Summarization model is not available."
    try:
        # do_sample=False keeps the output deterministic (greedy/beam search).
        output = summarizer(
            text, max_length=max_length, min_length=min_length, do_sample=False
        )
        return output[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
def analyze_sentiment(text):
    """
    Run sentiment analysis on text.

    Uses nlptown/bert-base-multilingual-uncased-sentiment, whose labels
    are star ratings (e.g. '4 stars').

    Parameters:
        text (str): Text for sentiment analysis.

    Returns:
        str: The sentiment label, or an error message on failure.
    """
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."
    try:
        # The pipeline returns a list of result dicts; take the first label.
        return sentiment_analyzer(text)[0]["label"]
    except Exception as e:
        return f"Error during sentiment analysis: {e}"
def detect_topic(text):
    """
    Detect likely topics via zero-shot classification (facebook/bart-large-mnli).

    Parameters:
        text (str): The text to analyze.

    Returns:
        dict: Mapping of candidate topic -> confidence score, or a dict
              containing an "error" key on failure.
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}
    # Fixed candidate set; zero-shot scoring needs no retraining to extend it.
    candidate_labels = [
        "Politics", "Technology", "Business", "Entertainment",
        "Science", "Health", "Sports", "Education",
    ]
    try:
        result = zero_shot_classifier(text, candidate_labels)
        # 'labels' come back sorted by confidence; pair each with its score.
        return dict(zip(result["labels"], result["scores"]))
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}
def preview_clean_text(text, max_chars=500):
    """
    Return a short preview of text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters before truncation.

    Returns:
        str: The text unchanged if it fits within max_chars, otherwise the
             first max_chars characters followed by "...".
    """
    fits = len(text) <= max_chars
    return text if fits else text[:max_chars] + "..."
# End of smart_web_analyzer.py