# app.py
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import PyPDF2
import docx
import os
from typing import List, Optional


class ContentAnalyzer:
    def __init__(self):
        print("[DEBUG] Initializing pipelines...")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.sentiment_analyzer = pipeline("sentiment-analysis")
        self.zero_shot = pipeline("zero-shot-classification")
        print("[DEBUG] Pipelines initialized.")

    def read_file(self, file_obj) -> str:
        """Read content from different file types."""
        if file_obj is None:
            print("[DEBUG] No file uploaded.")
            return ""

        # Depending on the Gradio version, gr.File passes either a file path
        # string or a temp-file wrapper with a .name attribute; handle both.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        file_ext = os.path.splitext(file_path)[1].lower()
        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")

        try:
            if file_ext == '.txt':
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
                print("[DEBUG] Successfully read .txt file.")
                return content

            elif file_ext == '.pdf':
                # Note: For PyPDF2 >= 3.0.0, this usage is valid
                pdf_reader = PyPDF2.PdfReader(file_path)
                text = ""
                for page in pdf_reader.pages:
                    # extract_text() can return None for image-only pages
                    text += (page.extract_text() or "") + "\n"
                print("[DEBUG] Successfully read .pdf file.")
                return text

            elif file_ext == '.docx':
                doc = docx.Document(file_path)
                paragraphs = [paragraph.text for paragraph in doc.paragraphs]
                print("[DEBUG] Successfully read .docx file.")
                return "\n".join(paragraphs)

            else:
                msg = f"Unsupported file type: {file_ext}"
                print("[DEBUG]", msg)
                return msg

        except Exception as e:
            error_msg = f"Error reading file: {str(e)}"
            print("[DEBUG]", error_msg)
            return error_msg

    def fetch_web_content(self, url: str) -> str:
        """Fetch content from a URL."""
        print(f"[DEBUG] Attempting to fetch URL: {url}")
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()

            text = soup.get_text(separator='\n')
            lines = (line.strip() for line in text.splitlines())
            final_text = "\n".join(line for line in lines if line)
            print("[DEBUG] Successfully fetched and cleaned web content.")
            return final_text

        except Exception as e:
            error_msg = f"Error fetching URL: {str(e)}"
            print("[DEBUG]", error_msg)
            return error_msg

    def analyze_content(
        self,
        text: Optional[str] = None,
        url: Optional[str] = None,
        file: Optional[object] = None,
        analysis_types: Optional[List[str]] = None,
        progress_callback=None
    ) -> dict:
        """
        Analyze content from text, URL, or file.
        progress_callback is a function for updating progress steps.
        """
        if analysis_types is None:
            analysis_types = ["summarize"]

        try:
            # Step 1: Retrieve content
            if progress_callback:
                progress_callback(1, "Reading input...")

            if url:
                content = self.fetch_web_content(url)
            elif file:
                content = self.read_file(file)
            else:
                content = text or ""

            if not content or content.startswith("Error"):
                return {"error": content or "No content provided"}

            # Truncate the preview shown in the "Original Text" tab
            truncated = content[:1000] + "..." if len(content) > 1000 else content
            results = {"original_text": truncated}

            # Step 2: Summarize
            if "summarize" in analysis_types:
                if progress_callback:
                    progress_callback(2, "Summarizing content...")
                summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                results["summary"] = summary[0]['summary_text']

            # Step 3: Sentiment
            if "sentiment" in analysis_types:
                if progress_callback:
                    progress_callback(3, "Performing sentiment analysis...")
                sentiment = self.sentiment_analyzer(content[:512])
                results["sentiment"] = {
                    "label": sentiment[0]['label'],
                    "score": round(sentiment[0]['score'], 3)
                }

            # Step 4: Topics
            if "topics" in analysis_types:
                if progress_callback:
                    progress_callback(4, "Identifying topics...")
                topics = self.zero_shot(
                    content[:512],
                    candidate_labels=[
                        "technology", "science", "business", "politics",
                        "entertainment", "education", "health", "sports"
                    ]
                )
                results["topics"] = [
                    {"label": label, "score": round(score, 3)}
                    for label, score in zip(topics['labels'], topics['scores'])
                    if score > 0.1
                ]

            return results

        except Exception as e:
            error_msg = f"Analysis error: {str(e)}"
            print("[DEBUG]", error_msg)
            return {"error": error_msg}


def create_interface():
    analyzer = ContentAnalyzer()

    with gr.Blocks(title="Content Analyzer") as demo:
        gr.Markdown("# 📑 Content Analyzer")
        gr.Markdown("Analyze text content from various sources using AI.")

        with gr.Tabs():
            # Text Input Tab
            with gr.Tab("Text Input"):
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Paste your text here...",
                    lines=5
                )

            # URL Input Tab
            with gr.Tab("Web URL"):
                url_input = gr.Textbox(
                    label="Enter URL",
                    placeholder="https://example.com"
                )

            # File Upload Tab
            with gr.Tab("File Upload"):
                file_input = gr.File(
                    label="Upload File",
                    file_types=[".txt", ".pdf", ".docx"]
                )

        # Analysis Options
        analysis_types = gr.CheckboxGroup(
            choices=["summarize", "sentiment", "topics"],
            value=["summarize"],
            label="Analysis Types"
        )

        analyze_btn = gr.Button("Analyze", variant="primary")

        # Output Sections
        with gr.Tabs():
            with gr.Tab("Original Text"):
                original_text = gr.Markdown()
            with gr.Tab("Summary"):
                summary_output = gr.Markdown()
            with gr.Tab("Sentiment"):
                sentiment_output = gr.Markdown()
            with gr.Tab("Topics"):
                topics_output = gr.Markdown()

        def process_analysis(text, url, file, types, progress=gr.Progress()):
            """
            This function is wrapped by Gradio to handle user inputs.
            We use progress to show step-by-step updates.
            """
            steps_total = 4  # We have up to 4 possible steps

            def progress_callback(step, desc):
                # gr.Progress expects a completion fraction plus a description
                progress(step / steps_total, desc=desc)

            results = analyzer.analyze_content(
                text=text,
                url=url,
                file=file,
                analysis_types=types,
                progress_callback=progress_callback
            )

            # If there's an error, show it in the "Original Text" tab for clarity
            if "error" in results:
                return results["error"], "", "", ""

            # Format outputs
            original = results.get("original_text", "")
            summary = results.get("summary", "")

            sentiment = ""
            if "sentiment" in results:
                sent = results["sentiment"]
                sentiment = f"**Sentiment:** {sent['label']} (Confidence: {sent['score']})"

            topics = ""
            if "topics" in results:
                topics_list = "\n".join([
                    f"- {t['label']}: {t['score']}"
                    for t in results["topics"]
                ])
                topics = "**Detected Topics:**\n" + topics_list

            return original, summary, sentiment, topics

        analyze_btn.click(
            fn=process_analysis,
            inputs=[text_input, url_input, file_input, analysis_types],
            outputs=[original_text, summary_output, sentiment_output, topics_output],
            show_progress=True  # Enable the progress bar in Gradio
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
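

# ---------------------------------------------------------------------------
# Minimal sketch (commented out) of exercising ContentAnalyzer without the UI,
# assuming the dependencies implied by the imports are installed: gradio,
# requests, beautifulsoup4, transformers plus a backend such as torch, PyPDF2,
# and python-docx. The sample text below is made up for illustration; uncomment
# to use it as a quick smoke test.
#
# analyzer = ContentAnalyzer()
# result = analyzer.analyze_content(
#     text="Gradio makes it easy to build and share machine learning demos.",
#     analysis_types=["summarize", "sentiment", "topics"],
# )
# print(result)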