File size: 5,816 Bytes
8cc50db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# smart_web_analyzer.py
"""
Smart Web Analyzer Plus - Core Functionality

Features:
- Web content fetching with custom User-Agent (to avoid 403 errors)
- Basic HTML cleaning (no removal of script/style)
- Summarization using "facebook/bart-large-cnn"
- Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
- Topic detection via zero-shot classification ("facebook/bart-large-mnli")
- Preview text for display
"""

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

def _load_pipeline(task, model, label):
    """
    Load a transformers pipeline, returning None (and reporting) on failure.

    Keeps the module importable even when a model cannot be downloaded or
    initialized; callers check each pipeline for None before use.

    Parameters:
        task (str): Pipeline task name (e.g. "summarization").
        model (str): Hugging Face model identifier.
        label (str): Human-readable name used in the error message.

    Returns:
        The loaded pipeline object, or None if loading failed.
    """
    try:
        return pipeline(task, model=model)
    except Exception as e:
        print(f"Error loading {label} model:", e)
        return None


# 1) Summarization Pipeline
summarizer = _load_pipeline(
    "summarization", "facebook/bart-large-cnn", "summarization"
)

# 2) Sentiment Analysis Pipeline
sentiment_analyzer = _load_pipeline(
    "sentiment-analysis",
    "nlptown/bert-base-multilingual-uncased-sentiment",
    "sentiment analysis",
)

# 3) Zero-Shot Topic Detection Pipeline
zero_shot_classifier = _load_pipeline(
    "zero-shot-classification", "facebook/bart-large-mnli", "topic detection"
)


def fetch_web_content(url):
    """
    Fetch the HTML content of a URL using a spoofed browser User-Agent.

    Parameters:
        url (str): The URL to fetch; must start with http:// or https://.

    Returns:
        str: The response body (HTML) if the request succeeds.

    Raises:
        ValueError: If the URL does not use an http(s) scheme.
        Exception: If the request fails (network error, timeout, 4xx/5xx).
    """
    # Validate the scheme up front so callers get a clear error for bad input.
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")

    # Spoof a common desktop-browser User-Agent: some servers reject the
    # default library UA with 403.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
        return response.text
    except requests.exceptions.RequestException as e:
        # Chain the original requests exception (raise ... from e) so the
        # root cause is preserved in tracebacks instead of being discarded.
        raise Exception(f"Error fetching the URL: {e}") from e

def clean_text(html_content):
    """
    Extract raw text from HTML content.

    Note: <script> and <style> tags are intentionally NOT removed, matching
    the module's documented behavior — their text ends up in the output.

    Parameters:
        html_content (str): The raw HTML markup.

    Returns:
        str: The extracted text with runs of whitespace collapsed to
             single spaces.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    raw = soup.get_text(separator=" ")
    # split() with no args drops all whitespace runs; rejoin with one space.
    return " ".join(raw.split())


def summarize_text(text, max_length=130, min_length=30):
    """
    Summarize text with the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Upper bound on the summary length.
        min_length (int): Lower bound on the summary length.

    Returns:
        str: The generated summary, or a human-readable error message if
             the model is unavailable or summarization fails.
    """
    if not summarizer:
        return "Summarization model is not available."

    try:
        # The pipeline returns a list of dicts; the summary is in the first.
        output = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
        return output[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"


def analyze_sentiment(text):
    """
    Run sentiment analysis via nlptown/bert-base-multilingual-uncased-sentiment.

    Parameters:
        text (str): The text to analyze.

    Returns:
        str: The predicted sentiment label (e.g. '4 stars'), or a
             human-readable error message on failure.
    """
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."

    try:
        # The pipeline returns a list of predictions; only the first matters.
        predictions = sentiment_analyzer(text)
        return predictions[0]["label"]
    except Exception as e:
        return f"Error during sentiment analysis: {e}"


def detect_topic(text):
    """
    Detect likely topics via zero-shot classification (facebook/bart-large-mnli).

    Parameters:
        text (str): The text to analyze.

    Returns:
        dict: Mapping of candidate topic label -> confidence score on
              success; on failure a dict of the form {"error": <message>}.
              (Always a dict — never a bare string.)
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}

    # Fixed candidate set; the classifier scores each label against the text.
    candidate_labels = [
        "Politics", "Technology", "Business", "Entertainment",
        "Science", "Health", "Sports", "Education",
    ]

    try:
        result = zero_shot_classifier(text, candidate_labels)
        # result["labels"] is sorted by descending confidence and aligned
        # index-for-index with result["scores"].
        return dict(zip(result["labels"], result["scores"]))
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}


def preview_clean_text(text, max_chars=500):
    """
    Return a truncated preview of text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters before truncation.

    Returns:
        str: The text unchanged if it fits, otherwise the first max_chars
             characters followed by an ellipsis ("...").
    """
    return text if len(text) <= max_chars else f"{text[:max_chars]}..."

# End of smart_web_analyzer.py