# smart_web_analyzer.py
"""
Smart Web Analyzer Plus - Core Functionality

Features:
- Web content fetching with custom User-Agent (to avoid 403 errors)
- Basic HTML cleaning (no removal of script/style)
- Summarization using "facebook/bart-large-cnn"
- Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
- Topic detection via zero-shot classification ("facebook/bart-large-mnli")
- Preview text for display
"""

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# 1) Summarization Pipeline
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception as e:
    summarizer = None
    print("Error loading summarization model:", e)

# 2) Sentiment Analysis Pipeline
try:
    sentiment_analyzer = pipeline(
        "sentiment-analysis",
        model="nlptown/bert-base-multilingual-uncased-sentiment"
    )
except Exception as e:
    sentiment_analyzer = None
    print("Error loading sentiment analysis model:", e)

# 3) Zero-Shot Topic Detection Pipeline
try:
    zero_shot_classifier = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli"
    )
except Exception as e:
    zero_shot_classifier = None
    print("Error loading topic detection model:", e)


def fetch_web_content(url):
    """
    Fetches the HTML content of a given URL, using a spoofed User-Agent.

    Parameters:
        url (str): The URL to fetch.

    Returns:
        str: HTML content if successful.

    Raises:
        ValueError: if the URL is invalid.
        Exception: if the request fails (network error, 4xx/5xx, etc.).
    """
    # Validate input URL
    if not url.startswith("http://") and not url.startswith("https://"):
        raise ValueError("Invalid URL. URL must start with http:// or https://")

    # Spoof a common browser User-Agent to reduce 403 errors
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4xx or 5xx
        return response.text
    except requests.exceptions.RequestException as e:
        # Catch all exceptions from the requests library
        raise Exception(f"Error fetching the URL: {e}")


def clean_text(html_content):
    """
    Cleans HTML content to extract raw text (keeps