# Spaces: MHamdan / ContentAnalyzer (Running)
# File size: 9,142 Bytes

# app.py

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import PyPDF2
import docx
import os
import time
from typing import List, Tuple, Optional

class _ContentError(Exception):
    """Internal signal that input content could not be obtained.

    ``str(exc)`` is the complete, user-facing error message.
    """


class ContentAnalyzer:
    """Run summarization, sentiment and topic analysis on text.

    The text can be supplied directly, fetched from a URL, or extracted from
    an uploaded ``.txt``, ``.pdf`` or ``.docx`` file.
    """

    # File extensions that ``read_file`` knows how to extract text from.
    SUPPORTED_EXTENSIONS = ('.txt', '.pdf', '.docx')

    # Labels offered to the zero-shot classifier for topic detection.
    TOPIC_LABELS = [
        "technology", "science", "business", "politics",
        "entertainment", "education", "health", "sports",
    ]

    def __init__(self):
        """Load the three transformers pipelines used by ``analyze_content``."""
        print("[DEBUG] Initializing pipelines...")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.sentiment_analyzer = pipeline("sentiment-analysis")
        self.zero_shot = pipeline("zero-shot-classification")
        print("[DEBUG] Pipelines initialized.")

    def _extract_file_text(self, file_obj) -> str:
        """Return the text of ``file_obj`` or raise ``_ContentError``.

        ``file_obj`` may be a file-like object exposing ``.name`` or a plain
        path string.
        """
        # NOTE(review): depending on the Gradio version/config the upload may
        # arrive as an open handle or as a path-like string -- handle both.
        path = getattr(file_obj, "name", file_obj)
        file_ext = os.path.splitext(str(path))[1].lower()
        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")

        if file_ext not in self.SUPPORTED_EXTENSIONS:
            msg = f"Unsupported file type: {file_ext}"
            print("[DEBUG]", msg)
            raise _ContentError(msg)

        try:
            if file_ext == '.txt':
                if hasattr(file_obj, "read"):
                    raw = file_obj.read()
                else:
                    with open(path, "rb") as handle:
                        raw = handle.read()
                # A handle opened in text mode already yields ``str``.
                content = raw.decode('utf-8') if isinstance(raw, bytes) else raw
                print("[DEBUG] Successfully read .txt file.")
                return content

            if file_ext == '.pdf':
                # Note: For PyPDF2 >= 3.0.0, this usage is valid
                pdf_reader = PyPDF2.PdfReader(file_obj)
                # ``or ""`` guards against a page that yields no text object.
                text = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
                print("[DEBUG] Successfully read .pdf file.")
                return text

            # Only '.docx' remains after the extension check above.
            doc = docx.Document(file_obj)
            paragraphs = [paragraph.text for paragraph in doc.paragraphs]
            print("[DEBUG] Successfully read .docx file.")
            return "\n".join(paragraphs)

        except Exception as e:
            error_msg = f"Error reading file: {str(e)}"
            print("[DEBUG]", error_msg)
            raise _ContentError(error_msg) from e

    def read_file(self, file_obj) -> str:
        """Read content from different file types.

        Returns the extracted text, ``""`` when ``file_obj`` is ``None``, or a
        human-readable message ("Unsupported file type: ..." /
        "Error reading file: ...") when extraction fails. Never raises for
        extraction failures.
        """
        if file_obj is None:
            print("[DEBUG] No file uploaded.")
            return ""

        try:
            return self._extract_file_text(file_obj)
        except _ContentError as e:
            return str(e)

    def _download_page_text(self, url: str) -> str:
        """Return the visible text of the page at ``url`` or raise ``_ContentError``."""
        print(f"[DEBUG] Attempting to fetch URL: {url}")
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()

            text = soup.get_text(separator='\n')
            # Drop blank lines and surrounding whitespace left by the markup.
            lines = (line.strip() for line in text.splitlines())
            final_text = "\n".join(line for line in lines if line)
            print("[DEBUG] Successfully fetched and cleaned web content.")
            return final_text

        except Exception as e:
            error_msg = f"Error fetching URL: {str(e)}"
            print("[DEBUG]", error_msg)
            raise _ContentError(error_msg) from e

    def fetch_web_content(self, url: str) -> str:
        """Fetch content from URL.

        Returns the cleaned page text, or an "Error fetching URL: ..." message
        when the request or parsing fails. Never raises for fetch failures.
        """
        try:
            return self._download_page_text(url)
        except _ContentError as e:
            return str(e)

    def analyze_content(
        self,
        text: Optional[str] = None,
        url: Optional[str] = None,
        file: Optional[object] = None,
        analysis_types: Optional[List[str]] = None,
        progress_callback=None
    ) -> dict:
        """
        Analyze content from text, URL, or file.

        The source is chosen in priority order: ``url``, then ``file``, then
        ``text``. ``analysis_types`` may contain "summarize", "sentiment"
        and/or "topics"; ``None`` means ``["summarize"]``.
        progress_callback is a function for updating progress steps; it is
        called as ``progress_callback(step_number, description)``.

        Returns a dict with "original_text" (truncated preview) plus one key
        per requested analysis, or ``{"error": message}`` on failure.
        """
        if analysis_types is None:
            # Avoid a shared mutable default list.
            analysis_types = ["summarize"]

        try:
            # Step 1: Retrieve content
            if progress_callback:
                progress_callback(1, "Reading input...")

            # Failures are signalled by exception rather than by inspecting
            # the returned string, so genuine content that happens to start
            # with "Error" is not mistaken for a failure.
            try:
                if url:
                    content = self._download_page_text(url)
                elif file:
                    content = self._extract_file_text(file)
                else:
                    content = text or ""
            except _ContentError as e:
                return {"error": str(e)}

            if not content.strip():
                return {"error": "No content provided"}

            # Truncate for debug
            truncated = content[:1000] + "..." if len(content) > 1000 else content
            results = {"original_text": truncated}

            # Step 2: Summarize
            if "summarize" in analysis_types:
                if progress_callback:
                    progress_callback(2, "Summarizing content...")
                summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                results["summary"] = summary[0]['summary_text']

            # Step 3: Sentiment
            if "sentiment" in analysis_types:
                if progress_callback:
                    progress_callback(3, "Performing sentiment analysis...")
                sentiment = self.sentiment_analyzer(content[:512])
                results["sentiment"] = {
                    "label": sentiment[0]['label'],
                    "score": round(sentiment[0]['score'], 3)
                }

            # Step 4: Topics
            if "topics" in analysis_types:
                if progress_callback:
                    progress_callback(4, "Identifying topics...")
                topics = self.zero_shot(
                    content[:512],
                    candidate_labels=list(self.TOPIC_LABELS)
                )
                # Keep only labels the classifier scored above 0.1.
                results["topics"] = [
                    {"label": label, "score": round(score, 3)}
                    for label, score in zip(topics['labels'], topics['scores'])
                    if score > 0.1
                ]

            return results

        except Exception as e:
            error_msg = f"Analysis error: {str(e)}"
            print("[DEBUG]", error_msg)
            return {"error": error_msg}


def create_interface():
    """Build and return the Gradio Blocks UI for the content analyzer.

    The UI offers three input tabs (text, URL, file upload), a checkbox group
    selecting which analyses to run, and four output tabs. A single
    ``ContentAnalyzer`` instance is created here and shared by every click.
    """
    analyzer = ContentAnalyzer()

    with gr.Blocks(title="Content Analyzer") as demo:
        gr.Markdown("# 📑 Content Analyzer")
        gr.Markdown("Analyze text content from various sources using AI.")

        with gr.Tabs():
            # Text Input Tab
            with gr.Tab("Text Input"):
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Paste your text here...",
                    lines=5
                )

            # URL Input Tab
            with gr.Tab("Web URL"):
                url_input = gr.Textbox(
                    label="Enter URL",
                    placeholder="https://example.com"
                )

            # File Upload Tab
            with gr.Tab("File Upload"):
                file_input = gr.File(
                    label="Upload File",
                    file_types=[".txt", ".pdf", ".docx"]
                )

        # Analysis Options
        analysis_types = gr.CheckboxGroup(
            choices=["summarize", "sentiment", "topics"],
            value=["summarize"],
            label="Analysis Types"
        )

        analyze_btn = gr.Button("Analyze", variant="primary")

        # Output Sections
        with gr.Tabs():
            with gr.Tab("Original Text"):
                original_text = gr.Markdown()
            with gr.Tab("Summary"):
                summary_output = gr.Markdown()
            with gr.Tab("Sentiment"):
                sentiment_output = gr.Markdown()
            with gr.Tab("Topics"):
                topics_output = gr.Markdown()

        def process_analysis(text, url, file, types, progress=gr.Progress()):
            """
            This function is wrapped by gradio to handle user inputs.
            We use progress to show step-by-step updates.

            All three inputs are passed regardless of which tab is visible;
            the analyzer picks URL first, then file, then text.
            Returns the four Markdown strings for the output tabs.
            """
            steps_total = 4  # We have up to 4 possible steps

            def progress_callback(step, desc):
                # gr.Progress takes (completed, total) as the tuple and the
                # human-readable label as the separate ``desc`` argument.
                progress((step, steps_total), desc=desc)

            # A URL box containing only whitespace must not shadow the
            # text/file inputs.
            url = (url or "").strip()

            results = analyzer.analyze_content(
                text=text,
                url=url,
                file=file,
                analysis_types=types,
                progress_callback=progress_callback
            )

            # If there's an error, show it in "Original Text" tab for clarity
            if "error" in results:
                return results["error"], "", "", ""

            # Format outputs
            original = results.get("original_text", "")
            summary = results.get("summary", "")

            sentiment = ""
            if "sentiment" in results:
                sent = results["sentiment"]
                sentiment = f"**Sentiment:** {sent['label']} (Confidence: {sent['score']})"

            topics = ""
            if "topics" in results:
                topics_list = "\n".join(
                    f"- {t['label']}: {t['score']}"
                    for t in results["topics"]
                )
                topics = "**Detected Topics:**\n" + topics_list

            return original, summary, sentiment, topics

        analyze_btn.click(
            fn=process_analysis,
            inputs=[text_input, url_input, file_input, analysis_types],
            outputs=[original_text, summary_output, sentiment_output, topics_output],
            show_progress=True  # Enable the progress bar in Gradio
        )

    return demo

# Script entry point: build the UI and start the Gradio server only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()