Spaces:
Running
Running
app
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
|
|
3 |
from bs4 import BeautifulSoup
|
4 |
from transformers import pipeline
|
5 |
import PyPDF2
|
@@ -9,15 +10,18 @@ from typing import List, Optional
|
|
9 |
|
10 |
class ContentAnalyzer:
|
11 |
def __init__(self):
|
|
|
12 |
self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
13 |
self.sentiment_analyzer = pipeline("sentiment-analysis")
|
14 |
self.zero_shot = pipeline("zero-shot-classification")
|
|
|
15 |
|
16 |
def read_file(self, file_obj) -> str:
|
17 |
"""Read content from different file types."""
|
18 |
if file_obj is None:
|
19 |
return ""
|
20 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
|
|
21 |
try:
|
22 |
if file_ext == '.txt':
|
23 |
return file_obj.read().decode('utf-8')
|
@@ -37,82 +41,60 @@ class ContentAnalyzer:
|
|
37 |
|
38 |
def fetch_web_content(self, url: str) -> str:
|
39 |
"""Fetch content from URL."""
|
|
|
40 |
try:
|
41 |
response = requests.get(url, timeout=10)
|
42 |
response.raise_for_status()
|
43 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
44 |
for script in soup(["script", "style"]):
|
45 |
script.decompose()
|
46 |
text = soup.get_text(separator='\n')
|
47 |
lines = (line.strip() for line in text.splitlines())
|
48 |
-
|
|
|
49 |
except Exception as e:
|
50 |
return f"Error fetching URL: {str(e)}"
|
51 |
|
52 |
def analyze_content(
|
53 |
self,
|
54 |
-
|
55 |
-
|
56 |
-
file: Optional[object] = None,
|
57 |
-
analysis_types: List[str] = ["summarize"],
|
58 |
-
progress_callback=None
|
59 |
) -> dict:
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
# STEP 3: Sentiment
|
86 |
-
if "sentiment" in analysis_types:
|
87 |
-
if progress_callback:
|
88 |
-
progress_callback(3, "Performing sentiment analysis")
|
89 |
-
sentiment = self.sentiment_analyzer(content[:512])
|
90 |
-
results["sentiment"] = {
|
91 |
-
"label": sentiment[0]['label'],
|
92 |
-
"score": round(sentiment[0]['score'], 3)
|
93 |
-
}
|
94 |
-
|
95 |
-
# STEP 4: Topics
|
96 |
-
if "topics" in analysis_types:
|
97 |
-
if progress_callback:
|
98 |
-
progress_callback(4, "Identifying topics")
|
99 |
-
topics = self.zero_shot(
|
100 |
-
content[:512],
|
101 |
-
candidate_labels=[
|
102 |
-
"technology", "science", "business", "politics",
|
103 |
-
"entertainment", "education", "health", "sports"
|
104 |
-
]
|
105 |
-
)
|
106 |
-
results["topics"] = [
|
107 |
-
{"label": label, "score": round(score, 3)}
|
108 |
-
for label, score in zip(topics['labels'], topics['scores'])
|
109 |
-
if score > 0.1
|
110 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
except Exception as e:
|
115 |
-
return {"error": f"Analysis error: {str(e)}"}
|
116 |
|
117 |
|
118 |
def create_interface():
|
@@ -120,49 +102,52 @@ def create_interface():
|
|
120 |
|
121 |
with gr.Blocks(title="Content Analyzer") as demo:
|
122 |
gr.Markdown("# 📑 Content Analyzer")
|
123 |
-
gr.Markdown(
|
|
|
|
|
|
|
124 |
|
125 |
-
# Dropdown
|
126 |
input_choice = gr.Dropdown(
|
127 |
choices=["Text", "URL", "File"],
|
128 |
value="Text",
|
129 |
label="Select Input Type"
|
130 |
)
|
131 |
|
132 |
-
#
|
133 |
with gr.Column(visible=True) as text_col:
|
134 |
text_input = gr.Textbox(
|
135 |
label="Enter Text",
|
136 |
placeholder="Paste your text here...",
|
137 |
lines=5
|
138 |
)
|
|
|
139 |
with gr.Column(visible=False) as url_col:
|
140 |
url_input = gr.Textbox(
|
141 |
label="Enter URL",
|
142 |
placeholder="https://example.com"
|
143 |
)
|
|
|
144 |
with gr.Column(visible=False) as file_col:
|
145 |
file_input = gr.File(
|
146 |
label="Upload File",
|
147 |
file_types=[".txt", ".pdf", ".docx"]
|
148 |
)
|
149 |
|
150 |
-
# Callback function to show/hide input columns
|
151 |
def show_inputs(choice):
|
|
|
152 |
return {
|
153 |
text_col: choice == "Text",
|
154 |
url_col: choice == "URL",
|
155 |
file_col: choice == "File"
|
156 |
}
|
157 |
|
158 |
-
# Trigger showing/hiding based on the dropdown choice
|
159 |
input_choice.change(
|
160 |
fn=show_inputs,
|
161 |
inputs=[input_choice],
|
162 |
outputs=[text_col, url_col, file_col]
|
163 |
)
|
164 |
|
165 |
-
# Analysis Options
|
166 |
analysis_types = gr.CheckboxGroup(
|
167 |
choices=["summarize", "sentiment", "topics"],
|
168 |
value=["summarize"],
|
@@ -171,7 +156,7 @@ def create_interface():
|
|
171 |
|
172 |
analyze_btn = gr.Button("Analyze", variant="primary")
|
173 |
|
174 |
-
# Output
|
175 |
with gr.Tabs():
|
176 |
with gr.Tab("Original Text"):
|
177 |
original_text = gr.Markdown()
|
@@ -182,40 +167,46 @@ def create_interface():
|
|
182 |
with gr.Tab("Topics"):
|
183 |
topics_output = gr.Markdown()
|
184 |
|
185 |
-
def process_analysis(choice,
|
186 |
-
"""
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
215 |
if "error" in results:
|
216 |
return results["error"], "", "", ""
|
217 |
|
218 |
-
# Format outputs
|
219 |
original = results.get("original_text", "")
|
220 |
summary = results.get("summary", "")
|
221 |
sentiment = ""
|
@@ -225,7 +216,10 @@ def create_interface():
|
|
225 |
|
226 |
topics = ""
|
227 |
if "topics" in results:
|
228 |
-
t_list = "\n".join([
|
|
|
|
|
|
|
229 |
topics = "**Detected Topics:**\n" + t_list
|
230 |
|
231 |
return original, summary, sentiment, topics
|
@@ -234,7 +228,7 @@ def create_interface():
|
|
234 |
fn=process_analysis,
|
235 |
inputs=[input_choice, text_input, url_input, file_input, analysis_types],
|
236 |
outputs=[original_text, summary_output, sentiment_output, topics_output],
|
237 |
-
show_progress=True
|
238 |
)
|
239 |
|
240 |
return demo
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
+
import time
|
4 |
from bs4 import BeautifulSoup
|
5 |
from transformers import pipeline
|
6 |
import PyPDF2
|
|
|
10 |
|
11 |
class ContentAnalyzer:
|
12 |
def __init__(self):
    """Load the three Hugging Face pipelines used for analysis.

    Note: this downloads/loads models and is slow on first run, hence
    the debug prints bracketing the work.
    """
    print("[DEBUG] Initializing pipelines...")
    # Summarization pins an explicit model; sentiment and zero-shot
    # fall back to the transformers default checkpoints.
    self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    self.sentiment_analyzer = pipeline("sentiment-analysis")
    self.zero_shot = pipeline("zero-shot-classification")
    print("[DEBUG] Pipelines initialized.")
|
18 |
|
19 |
def read_file(self, file_obj) -> str:
|
20 |
"""Read content from different file types."""
|
21 |
if file_obj is None:
|
22 |
return ""
|
23 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
24 |
+
print(f"[DEBUG] File extension: {file_ext}")
|
25 |
try:
|
26 |
if file_ext == '.txt':
|
27 |
return file_obj.read().decode('utf-8')
|
|
|
41 |
|
42 |
def fetch_web_content(self, url: str) -> str:
    """Download `url` and return its visible text, one non-empty line per row.

    Script/style elements are stripped before text extraction. On any
    failure a human-readable "Error fetching URL: ..." string is returned
    instead of raising, matching the callers' string-based error handling.
    """
    print(f"[DEBUG] Attempting to fetch URL: {url}")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles
        for script in soup(["script", "style"]):
            script.decompose()
        # Collapse the extracted text: strip each line, drop blanks.
        stripped = (line.strip() for line in soup.get_text(separator='\n').splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        return f"Error fetching URL: {str(e)}"
|
58 |
|
59 |
def analyze_content(
|
60 |
self,
|
61 |
+
content: str,
|
62 |
+
analysis_types: List[str],
|
|
|
|
|
|
|
63 |
) -> dict:
|
64 |
+
"""Perform summarization, sentiment analysis, and topic detection on `content`."""
|
65 |
+
results = {}
|
66 |
+
truncated = content[:1000] + "..." if len(content) > 1000 else content
|
67 |
+
results["original_text"] = truncated
|
68 |
+
|
69 |
+
# Summarize
|
70 |
+
if "summarize" in analysis_types:
|
71 |
+
summary = self.summarizer(content[:1024], max_length=130, min_length=30)
|
72 |
+
results["summary"] = summary[0]['summary_text']
|
73 |
+
|
74 |
+
# Sentiment
|
75 |
+
if "sentiment" in analysis_types:
|
76 |
+
sentiment = self.sentiment_analyzer(content[:512])
|
77 |
+
results["sentiment"] = {
|
78 |
+
"label": sentiment[0]['label'],
|
79 |
+
"score": round(sentiment[0]['score'], 3)
|
80 |
+
}
|
81 |
+
|
82 |
+
# Topics
|
83 |
+
if "topics" in analysis_types:
|
84 |
+
topics = self.zero_shot(
|
85 |
+
content[:512],
|
86 |
+
candidate_labels=[
|
87 |
+
"technology", "science", "business", "politics",
|
88 |
+
"entertainment", "education", "health", "sports"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
]
|
90 |
+
)
|
91 |
+
results["topics"] = [
|
92 |
+
{"label": label, "score": round(score, 3)}
|
93 |
+
for label, score in zip(topics['labels'], topics['scores'])
|
94 |
+
if score > 0.1
|
95 |
+
]
|
96 |
|
97 |
+
return results
|
|
|
|
|
|
|
98 |
|
99 |
|
100 |
def create_interface():
|
|
|
102 |
|
103 |
with gr.Blocks(title="Content Analyzer") as demo:
|
104 |
gr.Markdown("# 📑 Content Analyzer")
|
105 |
+
gr.Markdown(
|
106 |
+
"Analyze text from **Text**, **URL**, or **File** with summarization, "
|
107 |
+
"sentiment, and topic detection. A progress bar will appear during processing."
|
108 |
+
)
|
109 |
|
110 |
+
# Dropdown for input type
|
111 |
input_choice = gr.Dropdown(
|
112 |
choices=["Text", "URL", "File"],
|
113 |
value="Text",
|
114 |
label="Select Input Type"
|
115 |
)
|
116 |
|
117 |
+
# We use three separate columns to conditionally display
|
118 |
with gr.Column(visible=True) as text_col:
|
119 |
text_input = gr.Textbox(
|
120 |
label="Enter Text",
|
121 |
placeholder="Paste your text here...",
|
122 |
lines=5
|
123 |
)
|
124 |
+
|
125 |
with gr.Column(visible=False) as url_col:
|
126 |
url_input = gr.Textbox(
|
127 |
label="Enter URL",
|
128 |
placeholder="https://example.com"
|
129 |
)
|
130 |
+
|
131 |
with gr.Column(visible=False) as file_col:
|
132 |
file_input = gr.File(
|
133 |
label="Upload File",
|
134 |
file_types=[".txt", ".pdf", ".docx"]
|
135 |
)
|
136 |
|
|
|
137 |
def show_inputs(choice):
    """Show only the input column matching `choice`.

    Returns gr.update(visible=...) per column: Gradio treats a plain
    value returned for a component as its new *value*, so raw booleans
    would not toggle visibility.
    """
    return {
        text_col: gr.update(visible=choice == "Text"),
        url_col: gr.update(visible=choice == "URL"),
        file_col: gr.update(visible=choice == "File"),
    }
|
144 |
|
|
|
145 |
input_choice.change(
|
146 |
fn=show_inputs,
|
147 |
inputs=[input_choice],
|
148 |
outputs=[text_col, url_col, file_col]
|
149 |
)
|
150 |
|
|
|
151 |
analysis_types = gr.CheckboxGroup(
|
152 |
choices=["summarize", "sentiment", "topics"],
|
153 |
value=["summarize"],
|
|
|
156 |
|
157 |
analyze_btn = gr.Button("Analyze", variant="primary")
|
158 |
|
159 |
+
# Output tabs
|
160 |
with gr.Tabs():
|
161 |
with gr.Tab("Original Text"):
|
162 |
original_text = gr.Markdown()
|
|
|
167 |
with gr.Tab("Topics"):
|
168 |
topics_output = gr.Markdown()
|
169 |
|
170 |
+
def process_analysis(choice, text_val, url_val, file_val, types):
|
171 |
+
"""
|
172 |
+
This function does everything in one place using a 'with gr.Progress() as p:' block,
|
173 |
+
so we can show each step of the process. We add time.sleep(1) just to demonstrate
|
174 |
+
the progress bar (otherwise it may appear/disappear too quickly).
|
175 |
+
"""
|
176 |
+
with gr.Progress() as p:
|
177 |
+
# STEP 1: Retrieve content
|
178 |
+
p(0, total=4, desc="Reading input")
|
179 |
+
time.sleep(1) # For demonstration
|
180 |
+
if choice == "Text":
|
181 |
+
content = text_val or ""
|
182 |
+
elif choice == "URL":
|
183 |
+
content = analyzer.fetch_web_content(url_val or "")
|
184 |
+
else: # File
|
185 |
+
content = analyzer.read_file(file_val)
|
186 |
+
|
187 |
+
if not content or content.startswith("Error"):
|
188 |
+
return content or "No content provided", "", "", ""
|
189 |
+
|
190 |
+
# STEP 2: Summarize
|
191 |
+
p(1, total=4, desc="Summarizing content")
|
192 |
+
time.sleep(1) # For demonstration
|
193 |
+
|
194 |
+
# STEP 3: Sentiment
|
195 |
+
p(2, total=4, desc="Performing sentiment analysis")
|
196 |
+
time.sleep(1) # For demonstration
|
197 |
+
|
198 |
+
# STEP 4: Topics
|
199 |
+
p(3, total=4, desc="Identifying topics")
|
200 |
+
time.sleep(1) # For demonstration
|
201 |
+
|
202 |
+
# After the progress steps, do the actual analysis in one shot
|
203 |
+
# (You could interleave the calls to pipeline with each progress step
|
204 |
+
# if you want real-time progress. This is a simplified approach.)
|
205 |
+
results = analyzer.analyze_content(content, types)
|
206 |
|
207 |
if "error" in results:
|
208 |
return results["error"], "", "", ""
|
209 |
|
|
|
210 |
original = results.get("original_text", "")
|
211 |
summary = results.get("summary", "")
|
212 |
sentiment = ""
|
|
|
216 |
|
217 |
topics = ""
|
218 |
if "topics" in results:
|
219 |
+
t_list = "\n".join([
|
220 |
+
f"- {t['label']}: {t['score']}"
|
221 |
+
for t in results["topics"]
|
222 |
+
])
|
223 |
topics = "**Detected Topics:**\n" + t_list
|
224 |
|
225 |
return original, summary, sentiment, topics
|
|
|
228 |
fn=process_analysis,
|
229 |
inputs=[input_choice, text_input, url_input, file_input, analysis_types],
|
230 |
outputs=[original_text, summary_output, sentiment_output, topics_output],
|
231 |
+
show_progress=True # This ensures the Gradio progress bar is enabled
|
232 |
)
|
233 |
|
234 |
return demo
|