File size: 5,816 Bytes
8cc50db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# smart_web_analyzer.py
"""
Smart Web Analyzer Plus - Core Functionality

Features:
- Web content fetching with custom User-Agent (to avoid 403 errors)
- Basic HTML cleaning (no removal of script/style)
- Summarization using "facebook/bart-large-cnn"
- Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
- Topic detection via zero-shot classification ("facebook/bart-large-mnli")
- Preview text for display
"""

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

def _load_pipeline(task, model, label):
    """
    Load a transformers pipeline, returning None (and reporting) on failure.

    Keeps the module importable even when a model cannot be downloaded or
    initialized; callers check each pipeline for None before use.

    Parameters:
        task (str): Pipeline task name (e.g. "summarization").
        model (str): Hugging Face model identifier.
        label (str): Human-readable name used in the error message.

    Returns:
        The loaded pipeline object, or None if loading failed.
    """
    try:
        return pipeline(task, model=model)
    except Exception as e:
        print(f"Error loading {label} model:", e)
        return None


# 1) Summarization Pipeline
summarizer = _load_pipeline(
    "summarization", "facebook/bart-large-cnn", "summarization"
)

# 2) Sentiment Analysis Pipeline
sentiment_analyzer = _load_pipeline(
    "sentiment-analysis",
    "nlptown/bert-base-multilingual-uncased-sentiment",
    "sentiment analysis",
)

# 3) Zero-Shot Topic Detection Pipeline
zero_shot_classifier = _load_pipeline(
    "zero-shot-classification", "facebook/bart-large-mnli", "topic detection"
)


def fetch_web_content(url):
    """
    Fetch the HTML content of a URL using a spoofed browser User-Agent.

    Parameters:
        url (str): The URL to fetch; must start with http:// or https://.

    Returns:
        str: The response body (HTML) if the request succeeds.

    Raises:
        ValueError: If the URL does not use an http(s) scheme.
        Exception: If the request fails (network error, timeout, 4xx/5xx).
    """
    # Validate the scheme up front so callers get a clear error for bad input.
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")

    # Spoof a common desktop-browser User-Agent: some servers reject the
    # default library UA with 403.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
        return response.text
    except requests.exceptions.RequestException as e:
        # Chain the original requests exception (raise ... from e) so the
        # root cause is preserved in tracebacks instead of being discarded.
        raise Exception(f"Error fetching the URL: {e}") from e

def clean_text(html_content):
    """
    Extract raw text from HTML content.

    Note: <script> and <style> tags are intentionally NOT removed, matching
    the module's documented behavior — their text ends up in the output.

    Parameters:
        html_content (str): The raw HTML markup.

    Returns:
        str: The extracted text with runs of whitespace collapsed to
             single spaces.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    raw = soup.get_text(separator=" ")
    # split() with no args drops all whitespace runs; rejoin with one space.
    return " ".join(raw.split())


def summarize_text(text, max_length=130, min_length=30):
    """
    Summarize text with the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Upper bound on the summary length.
        min_length (int): Lower bound on the summary length.

    Returns:
        str: The generated summary, or a human-readable error message if
             the model is unavailable or summarization fails.
    """
    if not summarizer:
        return "Summarization model is not available."

    try:
        # The pipeline returns a list of dicts; the summary is in the first.
        output = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
        return output[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"


def analyze_sentiment(text):
    """
    Run sentiment analysis via nlptown/bert-base-multilingual-uncased-sentiment.

    Parameters:
        text (str): The text to analyze.

    Returns:
        str: The predicted sentiment label (e.g. '4 stars'), or a
             human-readable error message on failure.
    """
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."

    try:
        # The pipeline returns a list of predictions; only the first matters.
        predictions = sentiment_analyzer(text)
        return predictions[0]["label"]
    except Exception as e:
        return f"Error during sentiment analysis: {e}"


def detect_topic(text):
    """
    Detect likely topics via zero-shot classification (facebook/bart-large-mnli).

    Parameters:
        text (str): The text to analyze.

    Returns:
        dict: Mapping of candidate topic label -> confidence score on
              success; on failure a dict of the form {"error": <message>}.
              (Always a dict — never a bare string.)
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}

    # Fixed candidate set; the classifier scores each label against the text.
    candidate_labels = [
        "Politics", "Technology", "Business", "Entertainment",
        "Science", "Health", "Sports", "Education",
    ]

    try:
        result = zero_shot_classifier(text, candidate_labels)
        # result["labels"] is sorted by descending confidence and aligned
        # index-for-index with result["scores"].
        return dict(zip(result["labels"], result["scores"]))
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}


def preview_clean_text(text, max_chars=500):
    """
    Return a truncated preview of text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters before truncation.

    Returns:
        str: The text unchanged if it fits, otherwise the first max_chars
             characters followed by an ellipsis ("...").
    """
    return text if len(text) <= max_chars else f"{text[:max_chars]}..."

# End of smart_web_analyzer.py