# NOTE: "Spaces: Sleeping" page-header text removed — it was Hugging Face
# Spaces viewer residue from scraping, not part of the program.
import gradio as gr
import requests
import time
from bs4 import BeautifulSoup
from transformers import pipeline
import PyPDF2
import docx
import os
from typing import List, Optional
class ContentAnalyzer:
    """Text analysis backend: summarization, sentiment, and zero-shot topic
    detection via Hugging Face pipelines, plus helpers that extract text
    from uploaded files and web pages.
    """

    def __init__(self):
        # Pipelines download/load models on first use, so this is slow once.
        print("[DEBUG] Initializing pipelines...")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.sentiment_analyzer = pipeline("sentiment-analysis")
        self.zero_shot = pipeline("zero-shot-classification")
        print("[DEBUG] Pipelines initialized.")

    def read_file(self, file_obj) -> str:
        """Extract text from an uploaded .txt, .pdf, or .docx file.

        Args:
            file_obj: File object supplied by gr.File (has a ``.name`` path
                attribute), or None when nothing was uploaded.

        Returns:
            The extracted text, "" for None input, or an error/unsupported
            message string (callers detect the "Error" prefix).
        """
        if file_obj is None:
            return ""
        file_ext = os.path.splitext(file_obj.name)[1].lower()
        print(f"[DEBUG] File extension: {file_ext}")
        try:
            if file_ext == '.txt':
                data = file_obj.read()
                # Gradio may hand us a binary OR a text handle; the original
                # unconditional .decode() crashed on str input.
                return data.decode('utf-8') if isinstance(data, bytes) else data
            elif file_ext == '.pdf':
                pdf_reader = PyPDF2.PdfReader(file_obj)
                # extract_text() returns None for image-only pages; the
                # original `None + "\n"` raised TypeError on such PDFs.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
            elif file_ext == '.docx':
                doc = docx.Document(file_obj)
                return "\n".join(paragraph.text for paragraph in doc.paragraphs)
            else:
                return f"Unsupported file type: {file_ext}"
        except Exception as e:
            return f"Error reading file: {str(e)}"

    def fetch_web_content(self, url: str) -> str:
        """Download a URL and return its visible text, one line per block.

        Script/style elements are stripped; blank lines are dropped. On any
        failure an "Error fetching URL: ..." message string is returned.
        """
        print(f"[DEBUG] Attempting to fetch URL: {url}")
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Remove non-visible content before extracting text.
            for script in soup(["script", "style"]):
                script.decompose()
            text = soup.get_text(separator='\n')
            lines = (line.strip() for line in text.splitlines())
            return "\n".join(line for line in lines if line)
        except Exception as e:
            return f"Error fetching URL: {str(e)}"

    def analyze_content(
        self,
        content: str,
        analysis_types: List[str],
    ) -> dict:
        """Run the requested analyses over ``content``.

        Args:
            content: Raw text to analyze.
            analysis_types: Any of "summarize", "sentiment", "topics".

        Returns:
            Dict with "original_text" (truncated to 1000 chars) plus one key
            per requested analysis; on pipeline failure an "error" key is set
            instead (the UI layer checks for it).
        """
        results = {}
        results["original_text"] = (
            content[:1000] + "..." if len(content) > 1000 else content
        )
        try:
            if "summarize" in analysis_types:
                # BART's input window is limited, so only the head is summarized.
                summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                results["summary"] = summary[0]['summary_text']
            if "sentiment" in analysis_types:
                sentiment = self.sentiment_analyzer(content[:512])
                results["sentiment"] = {
                    "label": sentiment[0]['label'],
                    "score": round(sentiment[0]['score'], 3),
                }
            if "topics" in analysis_types:
                topics = self.zero_shot(
                    content[:512],
                    candidate_labels=[
                        "technology", "science", "business", "politics",
                        "entertainment", "education", "health", "sports"
                    ]
                )
                # Keep only labels with non-trivial confidence.
                results["topics"] = [
                    {"label": label, "score": round(score, 3)}
                    for label, score in zip(topics['labels'], topics['scores'])
                    if score > 0.1
                ]
        except Exception as e:
            # CONSISTENCY FIX: process_analysis() checks results["error"],
            # but the original never set it — any pipeline exception
            # propagated and crashed the handler instead.
            results["error"] = f"Analysis error: {str(e)}"
        return results
def create_interface():
    """Build and return the (unlaunched) Gradio Blocks UI.

    Layout: an input-type dropdown toggles between Text/URL/File inputs,
    a checkbox group selects analyses, and four tabs show the results.
    """
    analyzer = ContentAnalyzer()

    with gr.Blocks(title="Content Analyzer") as demo:
        gr.Markdown("# π Content Analyzer")
        gr.Markdown(
            "Analyze text from **Text**, **URL**, or **File** with summarization, "
            "sentiment, and topic detection. A progress bar will appear during processing."
        )

        # Dropdown picks which of the three input columns is shown.
        input_choice = gr.Dropdown(
            choices=["Text", "URL", "File"],
            value="Text",
            label="Select Input Type"
        )

        # Three separate columns, exactly one visible at a time.
        with gr.Column(visible=True) as text_col:
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Paste your text here...",
                lines=5
            )
        with gr.Column(visible=False) as url_col:
            url_input = gr.Textbox(
                label="Enter URL",
                placeholder="https://example.com"
            )
        with gr.Column(visible=False) as file_col:
            file_input = gr.File(
                label="Upload File",
                file_types=[".txt", ".pdf", ".docx"]
            )

        def show_inputs(choice):
            """Toggle column visibility to match the selected input type.

            BUG FIX: the original returned plain booleans, which Gradio
            interprets as the component's *value*, not its visibility;
            gr.update(visible=...) is required to show/hide a component.
            """
            return (
                gr.update(visible=choice == "Text"),
                gr.update(visible=choice == "URL"),
                gr.update(visible=choice == "File"),
            )

        input_choice.change(
            fn=show_inputs,
            inputs=[input_choice],
            outputs=[text_col, url_col, file_col]
        )

        analysis_types = gr.CheckboxGroup(
            choices=["summarize", "sentiment", "topics"],
            value=["summarize"],
            label="Analysis Types"
        )
        analyze_btn = gr.Button("Analyze", variant="primary")

        # One output tab per result kind.
        with gr.Tabs():
            with gr.Tab("Original Text"):
                original_text = gr.Markdown()
            with gr.Tab("Summary"):
                summary_output = gr.Markdown()
            with gr.Tab("Sentiment"):
                sentiment_output = gr.Markdown()
            with gr.Tab("Topics"):
                topics_output = gr.Markdown()

        def process_analysis(choice, text_val, url_val, file_val, types,
                             progress=gr.Progress()):
            """Fetch the selected input, run the analyses, and return
            markdown strings for the four output tabs.

            BUG FIX: gr.Progress is not a context manager, so the original
            `with gr.Progress() as p:` raised at runtime. The supported
            pattern is a `progress=gr.Progress()` default argument called
            with a 0..1 fraction. The artificial time.sleep(1) demo delays
            (4 s per click) were removed with it.
            """
            # STEP 1: retrieve content from whichever input is active.
            progress(0.0, desc="Reading input")
            if choice == "Text":
                content = text_val or ""
            elif choice == "URL":
                content = analyzer.fetch_web_content(url_val or "")
            else:  # File
                content = analyzer.read_file(file_val)
            # Helper methods signal failure with an "Error..." message string.
            if not content or content.startswith("Error"):
                return content or "No content provided", "", "", ""

            # STEP 2: run all requested analyses in one call.
            progress(0.4, desc="Analyzing content")
            results = analyzer.analyze_content(content, types)
            if "error" in results:
                return results["error"], "", "", ""

            # STEP 3: format the results as markdown for the tabs.
            progress(0.9, desc="Formatting results")
            original = results.get("original_text", "")
            summary = results.get("summary", "")
            sentiment = ""
            if "sentiment" in results:
                s = results["sentiment"]
                sentiment = f"**Sentiment:** {s['label']} (Confidence: {s['score']})"
            topics = ""
            if "topics" in results:
                t_list = "\n".join(
                    f"- {t['label']}: {t['score']}"
                    for t in results["topics"]
                )
                topics = "**Detected Topics:**\n" + t_list
            return original, summary, sentiment, topics

        analyze_btn.click(
            fn=process_analysis,
            inputs=[input_choice, text_input, url_input, file_input, analysis_types],
            outputs=[original_text, summary_output, sentiment_output, topics_output],
            show_progress=True  # Enable Gradio's built-in progress display
        )

    return demo
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    create_interface().launch()