Spaces:

MHamdan
/

ContentAnalyzer

Running

App Files Community

MHamdan committed on Feb 15

Commit

18d6761

verified ·

1 Parent(s): 5f0fd24

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -55

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # app.py
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
@@ -6,70 +7,97 @@ from transformers import pipeline
 import PyPDF2
 import docx
 import os
 from typing import List, Tuple, Optional
-from smolagents import CodeAgent, HfApiModel, Tool
 class ContentAnalyzer:
     def __init__(self):
-        # Initialize models
         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         self.sentiment_analyzer = pipeline("sentiment-analysis")
         self.zero_shot = pipeline("zero-shot-classification")
     def read_file(self, file_obj) -> str:
         """Read content from different file types."""
         if file_obj is None:
             return ""
         file_ext = os.path.splitext(file_obj.name)[1].lower()
         try:
             if file_ext == '.txt':
-                return file_obj.read().decode('utf-8')
             elif file_ext == '.pdf':
                 pdf_reader = PyPDF2.PdfReader(file_obj)
                 text = ""
                 for page in pdf_reader.pages:
                     text += page.extract_text() + "\n"
                 return text
             elif file_ext == '.docx':
                 doc = docx.Document(file_obj)
-                return "\n".join([paragraph.text for paragraph in doc.paragraphs])
             else:
-                return f"Unsupported file type: {file_ext}"
         except Exception as e:
-            return f"Error reading file: {str(e)}"
     def fetch_web_content(self, url: str) -> str:
         """Fetch content from URL."""
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
             # Remove scripts and styles
             for script in soup(["script", "style"]):
                 script.decompose()
             text = soup.get_text(separator='\n')
             lines = (line.strip() for line in text.splitlines())
-            return "\n".join(line for line in lines if line)
         except Exception as e:
-            return f"Error fetching URL: {str(e)}"
-    def analyze_content(self,
-                       text: Optional[str] = None,
-                       url: Optional[str] = None,
-                       file: Optional[object] = None,
-                       analysis_types: List[str] = ["summarize"]) -> dict:
-        """Analyze content from text, URL, or file."""
         try:
-            # Get content from appropriate source
             if url:
                 content = self.fetch_web_content(url)
             elif file:
@@ -80,28 +108,37 @@ class ContentAnalyzer:
             if not content or content.startswith("Error"):
                 return {"error": content or "No content provided"}
-            results = {
-                "original_text": content[:1000] + "..." if len(content) > 1000 else content
-            }
-            # Perform requested analyses
             if "summarize" in analysis_types:
                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                 results["summary"] = summary[0]['summary_text']
             if "sentiment" in analysis_types:
                 sentiment = self.sentiment_analyzer(content[:512])
                 results["sentiment"] = {
                     "label": sentiment[0]['label'],
                     "score": round(sentiment[0]['score'], 3)
                 }
             if "topics" in analysis_types:
                 topics = self.zero_shot(
                     content[:512],
-                    candidate_labels=["technology", "science", "business",
-                                    "politics", "entertainment", "education",
-                                    "health", "sports"]
                 )
                 results["topics"] = [
                     {"label": label, "score": round(score, 3)}
@@ -112,15 +149,18 @@ class ContentAnalyzer:
             return results
         except Exception as e:
-            return {"error": f"Analysis error: {str(e)}"}
 def create_interface():
     analyzer = ContentAnalyzer()
     with gr.Blocks(title="Content Analyzer") as demo:
         gr.Markdown("# 📑 Content Analyzer")
         gr.Markdown("Analyze text content from various sources using AI.")
         with gr.Tabs():
             # Text Input Tab
             with gr.Tab("Text Input"):
@@ -129,30 +169,30 @@ def create_interface():
                     placeholder="Paste your text here...",
                     lines=5
                 )
             # URL Input Tab
             with gr.Tab("Web URL"):
                 url_input = gr.Textbox(
                     label="Enter URL",
                     placeholder="https://example.com"
                 )
             # File Upload Tab
             with gr.Tab("File Upload"):
                 file_input = gr.File(
                     label="Upload File",
                     file_types=[".txt", ".pdf", ".docx"]
                 )
         # Analysis Options
         analysis_types = gr.CheckboxGroup(
             choices=["summarize", "sentiment", "topics"],
             value=["summarize"],
             label="Analysis Types"
         )
         analyze_btn = gr.Button("Analyze", variant="primary")
         # Output Sections
         with gr.Tabs():
             with gr.Tab("Original Text"):
@@ -163,14 +203,29 @@ def create_interface():
                 sentiment_output = gr.Markdown()
             with gr.Tab("Topics"):
                 topics_output = gr.Markdown()
-        def process_analysis(text, url, file, types):
-            # Get analysis results
-            results = analyzer.analyze_content(text, url, file, types)
             if "error" in results:
                 return results["error"], "", "", ""
             # Format outputs
             original = results.get("original_text", "")
             summary = results.get("summary", "")
@@ -179,26 +234,26 @@ def create_interface():
             if "sentiment" in results:
                 sent = results["sentiment"]
                 sentiment = f"**Sentiment:** {sent['label']} (Confidence: {sent['score']})"
             topics = ""
             if "topics" in results:
-                topics = "**Detected Topics:**\n" + "\n".join([
                     f"- {t['label']}: {t['score']}"
                     for t in results["topics"]
                 ])
             return original, summary, sentiment, topics
-        # Connect the interface
         analyze_btn.click(
             fn=process_analysis,
             inputs=[text_input, url_input, file_input, analysis_types],
-            outputs=[original_text, summary_output, sentiment_output, topics_output]
         )
     return demo
-# Launch the app
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()

 # app.py
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import PyPDF2
 import docx
 import os
+import time
 from typing import List, Tuple, Optional
 class ContentAnalyzer:
     def __init__(self):
+        print("[DEBUG] Initializing pipelines...")
         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         self.sentiment_analyzer = pipeline("sentiment-analysis")
         self.zero_shot = pipeline("zero-shot-classification")
+        print("[DEBUG] Pipelines initialized.")
     def read_file(self, file_obj) -> str:
         """Read content from different file types."""
         if file_obj is None:
+            print("[DEBUG] No file uploaded.")
             return ""
         file_ext = os.path.splitext(file_obj.name)[1].lower()
+        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")
         try:
             if file_ext == '.txt':
+                content = file_obj.read().decode('utf-8')
+                print("[DEBUG] Successfully read .txt file.")
+                return content
             elif file_ext == '.pdf':
+                # Note: For PyPDF2 >= 3.0.0, this usage is valid
                 pdf_reader = PyPDF2.PdfReader(file_obj)
                 text = ""
                 for page in pdf_reader.pages:
                     text += page.extract_text() + "\n"
+                print("[DEBUG] Successfully read .pdf file.")
                 return text
             elif file_ext == '.docx':
                 doc = docx.Document(file_obj)
+                paragraphs = [paragraph.text for paragraph in doc.paragraphs]
+                print("[DEBUG] Successfully read .docx file.")
+                return "\n".join(paragraphs)
             else:
+                msg = f"Unsupported file type: {file_ext}"
+                print("[DEBUG]", msg)
+                return msg
         except Exception as e:
+            error_msg = f"Error reading file: {str(e)}"
+            print("[DEBUG]", error_msg)
+            return error_msg
     def fetch_web_content(self, url: str) -> str:
         """Fetch content from URL."""
+        print(f"[DEBUG] Attempting to fetch URL: {url}")
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
             # Remove scripts and styles
             for script in soup(["script", "style"]):
                 script.decompose()
             text = soup.get_text(separator='\n')
             lines = (line.strip() for line in text.splitlines())
+            final_text = "\n".join(line for line in lines if line)
+            print("[DEBUG] Successfully fetched and cleaned web content.")
+            return final_text
         except Exception as e:
+            error_msg = f"Error fetching URL: {str(e)}"
+            print("[DEBUG]", error_msg)
+            return error_msg
+    def analyze_content(
+        self,
+        text: Optional[str] = None,
+        url: Optional[str] = None,
+        file: Optional[object] = None,
+        analysis_types: List[str] = ["summarize"],
+        progress_callback=None
+    ) -> dict:
+        """
+        Analyze content from text, URL, or file.
+        progress_callback is a function for updating progress steps.
+        """
         try:
+            # Step 1: Retrieve content
+            if progress_callback:
+                progress_callback(1, "Reading input...")
             if url:
                 content = self.fetch_web_content(url)
             elif file:
             if not content or content.startswith("Error"):
                 return {"error": content or "No content provided"}
+            # Truncate for debug
+            truncated = content[:1000] + "..." if len(content) > 1000 else content
+            results = {"original_text": truncated}
+            # Step 2: Summarize
             if "summarize" in analysis_types:
+                if progress_callback:
+                    progress_callback(2, "Summarizing content...")
                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                 results["summary"] = summary[0]['summary_text']
+            # Step 3: Sentiment
             if "sentiment" in analysis_types:
+                if progress_callback:
+                    progress_callback(3, "Performing sentiment analysis...")
                 sentiment = self.sentiment_analyzer(content[:512])
                 results["sentiment"] = {
                     "label": sentiment[0]['label'],
                     "score": round(sentiment[0]['score'], 3)
                 }
+            # Step 4: Topics
             if "topics" in analysis_types:
+                if progress_callback:
+                    progress_callback(4, "Identifying topics...")
                 topics = self.zero_shot(
                     content[:512],
+                    candidate_labels=[
+                        "technology", "science", "business", "politics",
+                        "entertainment", "education", "health", "sports"
+                    ]
                 )
                 results["topics"] = [
                     {"label": label, "score": round(score, 3)}
             return results
         except Exception as e:
+            error_msg = f"Analysis error: {str(e)}"
+            print("[DEBUG]", error_msg)
+            return {"error": error_msg}
 def create_interface():
     analyzer = ContentAnalyzer()
     with gr.Blocks(title="Content Analyzer") as demo:
         gr.Markdown("# 📑 Content Analyzer")
         gr.Markdown("Analyze text content from various sources using AI.")
         with gr.Tabs():
             # Text Input Tab
             with gr.Tab("Text Input"):
                     placeholder="Paste your text here...",
                     lines=5
                 )
             # URL Input Tab
             with gr.Tab("Web URL"):
                 url_input = gr.Textbox(
                     label="Enter URL",
                     placeholder="https://example.com"
                 )
             # File Upload Tab
             with gr.Tab("File Upload"):
                 file_input = gr.File(
                     label="Upload File",
                     file_types=[".txt", ".pdf", ".docx"]
                 )
         # Analysis Options
         analysis_types = gr.CheckboxGroup(
             choices=["summarize", "sentiment", "topics"],
             value=["summarize"],
             label="Analysis Types"
         )
         analyze_btn = gr.Button("Analyze", variant="primary")
         # Output Sections
         with gr.Tabs():
             with gr.Tab("Original Text"):
                 sentiment_output = gr.Markdown()
             with gr.Tab("Topics"):
                 topics_output = gr.Markdown()
+        def process_analysis(text, url, file, types, progress=gr.Progress()):
+            """
+            This function is wrapped by gradio to handle user inputs.
+            We use progress to show step-by-step updates.
+            """
+            steps_total = 4  # We have up to 4 possible steps
+            def progress_callback(step, desc):
+                progress((step, desc), total=steps_total)
+            results = analyzer.analyze_content(
+                text=text,
+                url=url,
+                file=file,
+                analysis_types=types,
+                progress_callback=progress_callback
+            )
+            # If there's an error, show it in "Original Text" tab for clarity
             if "error" in results:
                 return results["error"], "", "", ""
             # Format outputs
             original = results.get("original_text", "")
             summary = results.get("summary", "")
             if "sentiment" in results:
                 sent = results["sentiment"]
                 sentiment = f"**Sentiment:** {sent['label']} (Confidence: {sent['score']})"
             topics = ""
             if "topics" in results:
+                topics_list = "\n".join([
                     f"- {t['label']}: {t['score']}"
                     for t in results["topics"]
                 ])
+                topics = "**Detected Topics:**\n" + topics_list
             return original, summary, sentiment, topics
         analyze_btn.click(
             fn=process_analysis,
             inputs=[text_input, url_input, file_input, analysis_types],
+            outputs=[original_text, summary_output, sentiment_output, topics_output],
+            show_progress=True  # Enable the progress bar in Gradio
         )
     return demo
 if __name__ == "__main__":
     demo = create_interface()
+    demo.launch()