Spaces:
Running
Running
app
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
|
|
3 |
from bs4 import BeautifulSoup
|
4 |
from transformers import pipeline
|
5 |
import PyPDF2
|
@@ -9,15 +10,18 @@ from typing import List, Optional
|
|
9 |
|
10 |
class ContentAnalyzer:
|
11 |
def __init__(self):
|
|
|
12 |
self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
13 |
self.sentiment_analyzer = pipeline("sentiment-analysis")
|
14 |
self.zero_shot = pipeline("zero-shot-classification")
|
|
|
15 |
|
16 |
def read_file(self, file_obj) -> str:
|
17 |
"""Read content from different file types."""
|
18 |
if file_obj is None:
|
19 |
return ""
|
20 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
|
|
21 |
try:
|
22 |
if file_ext == '.txt':
|
23 |
return file_obj.read().decode('utf-8')
|
@@ -37,82 +41,60 @@ class ContentAnalyzer:
|
|
37 |
|
38 |
def fetch_web_content(self, url: str) -> str:
|
39 |
"""Fetch content from URL."""
|
|
|
40 |
try:
|
41 |
response = requests.get(url, timeout=10)
|
42 |
response.raise_for_status()
|
43 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
44 |
for script in soup(["script", "style"]):
|
45 |
script.decompose()
|
46 |
text = soup.get_text(separator='\n')
|
47 |
lines = (line.strip() for line in text.splitlines())
|
48 |
-
|
|
|
49 |
except Exception as e:
|
50 |
return f"Error fetching URL: {str(e)}"
|
51 |
|
52 |
def analyze_content(
|
53 |
self,
|
54 |
-
|
55 |
-
|
56 |
-
file: Optional[object] = None,
|
57 |
-
analysis_types: List[str] = ["summarize"],
|
58 |
-
progress_callback=None
|
59 |
) -> dict:
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
# STEP 3: Sentiment
|
86 |
-
if "sentiment" in analysis_types:
|
87 |
-
if progress_callback:
|
88 |
-
progress_callback(3, "Performing sentiment analysis")
|
89 |
-
sentiment = self.sentiment_analyzer(content[:512])
|
90 |
-
results["sentiment"] = {
|
91 |
-
"label": sentiment[0]['label'],
|
92 |
-
"score": round(sentiment[0]['score'], 3)
|
93 |
-
}
|
94 |
-
|
95 |
-
# STEP 4: Topics
|
96 |
-
if "topics" in analysis_types:
|
97 |
-
if progress_callback:
|
98 |
-
progress_callback(4, "Identifying topics")
|
99 |
-
topics = self.zero_shot(
|
100 |
-
content[:512],
|
101 |
-
candidate_labels=[
|
102 |
-
"technology", "science", "business", "politics",
|
103 |
-
"entertainment", "education", "health", "sports"
|
104 |
-
]
|
105 |
-
)
|
106 |
-
results["topics"] = [
|
107 |
-
{"label": label, "score": round(score, 3)}
|
108 |
-
for label, score in zip(topics['labels'], topics['scores'])
|
109 |
-
if score > 0.1
|
110 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
except Exception as e:
|
115 |
-
return {"error": f"Analysis error: {str(e)}"}
|
116 |
|
117 |
|
118 |
def create_interface():
|
@@ -120,49 +102,52 @@ def create_interface():
|
|
120 |
|
121 |
with gr.Blocks(title="Content Analyzer") as demo:
|
122 |
gr.Markdown("# 📑 Content Analyzer")
|
123 |
-
gr.Markdown(
|
|
|
|
|
|
|
124 |
|
125 |
-
# Dropdown
|
126 |
input_choice = gr.Dropdown(
|
127 |
choices=["Text", "URL", "File"],
|
128 |
value="Text",
|
129 |
label="Select Input Type"
|
130 |
)
|
131 |
|
132 |
-
#
|
133 |
with gr.Column(visible=True) as text_col:
|
134 |
text_input = gr.Textbox(
|
135 |
label="Enter Text",
|
136 |
placeholder="Paste your text here...",
|
137 |
lines=5
|
138 |
)
|
|
|
139 |
with gr.Column(visible=False) as url_col:
|
140 |
url_input = gr.Textbox(
|
141 |
label="Enter URL",
|
142 |
placeholder="https://example.com"
|
143 |
)
|
|
|
144 |
with gr.Column(visible=False) as file_col:
|
145 |
file_input = gr.File(
|
146 |
label="Upload File",
|
147 |
file_types=[".txt", ".pdf", ".docx"]
|
148 |
)
|
149 |
|
150 |
-
# Callback function to show/hide input columns
|
151 |
def show_inputs(choice):
|
|
|
152 |
return {
|
153 |
text_col: choice == "Text",
|
154 |
url_col: choice == "URL",
|
155 |
file_col: choice == "File"
|
156 |
}
|
157 |
|
158 |
-
# Trigger showing/hiding based on the dropdown choice
|
159 |
input_choice.change(
|
160 |
fn=show_inputs,
|
161 |
inputs=[input_choice],
|
162 |
outputs=[text_col, url_col, file_col]
|
163 |
)
|
164 |
|
165 |
-
# Analysis Options
|
166 |
analysis_types = gr.CheckboxGroup(
|
167 |
choices=["summarize", "sentiment", "topics"],
|
168 |
value=["summarize"],
|
@@ -171,7 +156,7 @@ def create_interface():
|
|
171 |
|
172 |
analyze_btn = gr.Button("Analyze", variant="primary")
|
173 |
|
174 |
-
# Output
|
175 |
with gr.Tabs():
|
176 |
with gr.Tab("Original Text"):
|
177 |
original_text = gr.Markdown()
|
@@ -182,40 +167,46 @@ def create_interface():
|
|
182 |
with gr.Tab("Topics"):
|
183 |
topics_output = gr.Markdown()
|
184 |
|
185 |
-
def process_analysis(choice,
|
186 |
-
"""
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
215 |
if "error" in results:
|
216 |
return results["error"], "", "", ""
|
217 |
|
218 |
-
# Format outputs
|
219 |
original = results.get("original_text", "")
|
220 |
summary = results.get("summary", "")
|
221 |
sentiment = ""
|
@@ -225,7 +216,10 @@ def create_interface():
|
|
225 |
|
226 |
topics = ""
|
227 |
if "topics" in results:
|
228 |
-
t_list = "\n".join([
|
|
|
|
|
|
|
229 |
topics = "**Detected Topics:**\n" + t_list
|
230 |
|
231 |
return original, summary, sentiment, topics
|
@@ -234,7 +228,7 @@ def create_interface():
|
|
234 |
fn=process_analysis,
|
235 |
inputs=[input_choice, text_input, url_input, file_input, analysis_types],
|
236 |
outputs=[original_text, summary_output, sentiment_output, topics_output],
|
237 |
-
show_progress=True
|
238 |
)
|
239 |
|
240 |
return demo
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
+
import time
|
4 |
from bs4 import BeautifulSoup
|
5 |
from transformers import pipeline
|
6 |
import PyPDF2
|
|
|
10 |
|
11 |
class ContentAnalyzer:
|
12 |
def __init__(self):
    """Load the three Hugging Face pipelines used for analysis.

    Note: this downloads/loads models and is slow on first run, hence
    the debug prints bracketing the work.
    """
    print("[DEBUG] Initializing pipelines...")
    # Summarization pins an explicit model; sentiment and zero-shot
    # fall back to the transformers default checkpoints.
    self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    self.sentiment_analyzer = pipeline("sentiment-analysis")
    self.zero_shot = pipeline("zero-shot-classification")
    print("[DEBUG] Pipelines initialized.")
|
18 |
|
19 |
def read_file(self, file_obj) -> str:
|
20 |
"""Read content from different file types."""
|
21 |
if file_obj is None:
|
22 |
return ""
|
23 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
24 |
+
print(f"[DEBUG] File extension: {file_ext}")
|
25 |
try:
|
26 |
if file_ext == '.txt':
|
27 |
return file_obj.read().decode('utf-8')
|
|
|
41 |
|
42 |
def fetch_web_content(self, url: str) -> str:
    """Download `url` and return its visible text, one non-empty line per row.

    Script/style elements are stripped before text extraction. On any
    failure a human-readable "Error fetching URL: ..." string is returned
    instead of raising, matching the callers' string-based error handling.
    """
    print(f"[DEBUG] Attempting to fetch URL: {url}")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles
        for script in soup(["script", "style"]):
            script.decompose()
        # Collapse the extracted text: strip each line, drop blanks.
        stripped = (line.strip() for line in soup.get_text(separator='\n').splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        return f"Error fetching URL: {str(e)}"
|
58 |
|
59 |
def analyze_content(
|
60 |
self,
|
61 |
+
content: str,
|
62 |
+
analysis_types: List[str],
|
|
|
|
|
|
|
63 |
) -> dict:
|
64 |
+
"""Perform summarization, sentiment analysis, and topic detection on `content`."""
|
65 |
+
results = {}
|
66 |
+
truncated = content[:1000] + "..." if len(content) > 1000 else content
|
67 |
+
results["original_text"] = truncated
|
68 |
+
|
69 |
+
# Summarize
|
70 |
+
if "summarize" in analysis_types:
|
71 |
+
summary = self.summarizer(content[:1024], max_length=130, min_length=30)
|
72 |
+
results["summary"] = summary[0]['summary_text']
|
73 |
+
|
74 |
+
# Sentiment
|
75 |
+
if "sentiment" in analysis_types:
|
76 |
+
sentiment = self.sentiment_analyzer(content[:512])
|
77 |
+
results["sentiment"] = {
|
78 |
+
"label": sentiment[0]['label'],
|
79 |
+
"score": round(sentiment[0]['score'], 3)
|
80 |
+
}
|
81 |
+
|
82 |
+
# Topics
|
83 |
+
if "topics" in analysis_types:
|
84 |
+
topics = self.zero_shot(
|
85 |
+
content[:512],
|
86 |
+
candidate_labels=[
|
87 |
+
"technology", "science", "business", "politics",
|
88 |
+
"entertainment", "education", "health", "sports"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
]
|
90 |
+
)
|
91 |
+
results["topics"] = [
|
92 |
+
{"label": label, "score": round(score, 3)}
|
93 |
+
for label, score in zip(topics['labels'], topics['scores'])
|
94 |
+
if score > 0.1
|
95 |
+
]
|
96 |
|
97 |
+
return results
|
|
|
|
|
|
|
98 |
|
99 |
|
100 |
def create_interface():
|
|
|
102 |
|
103 |
with gr.Blocks(title="Content Analyzer") as demo:
|
104 |
gr.Markdown("# 📑 Content Analyzer")
|
105 |
+
gr.Markdown(
|
106 |
+
"Analyze text from **Text**, **URL**, or **File** with summarization, "
|
107 |
+
"sentiment, and topic detection. A progress bar will appear during processing."
|
108 |
+
)
|
109 |
|
110 |
+
# Dropdown for input type
|
111 |
input_choice = gr.Dropdown(
|
112 |
choices=["Text", "URL", "File"],
|
113 |
value="Text",
|
114 |
label="Select Input Type"
|
115 |
)
|
116 |
|
117 |
+
# We use three separate columns to conditionally display
|
118 |
with gr.Column(visible=True) as text_col:
|
119 |
text_input = gr.Textbox(
|
120 |
label="Enter Text",
|
121 |
placeholder="Paste your text here...",
|
122 |
lines=5
|
123 |
)
|
124 |
+
|
125 |
with gr.Column(visible=False) as url_col:
|
126 |
url_input = gr.Textbox(
|
127 |
label="Enter URL",
|
128 |
placeholder="https://example.com"
|
129 |
)
|
130 |
+
|
131 |
with gr.Column(visible=False) as file_col:
|
132 |
file_input = gr.File(
|
133 |
label="Upload File",
|
134 |
file_types=[".txt", ".pdf", ".docx"]
|
135 |
)
|
136 |
|
|
|
137 |
def show_inputs(choice):
    """Show only the input column matching `choice`.

    Returns gr.update(visible=...) per column: Gradio treats a plain
    value returned for a component as its new *value*, so raw booleans
    would not toggle visibility.
    """
    return {
        text_col: gr.update(visible=choice == "Text"),
        url_col: gr.update(visible=choice == "URL"),
        file_col: gr.update(visible=choice == "File"),
    }
|
144 |
|
|
|
145 |
input_choice.change(
|
146 |
fn=show_inputs,
|
147 |
inputs=[input_choice],
|
148 |
outputs=[text_col, url_col, file_col]
|
149 |
)
|
150 |
|
|
|
151 |
analysis_types = gr.CheckboxGroup(
|
152 |
choices=["summarize", "sentiment", "topics"],
|
153 |
value=["summarize"],
|
|
|
156 |
|
157 |
analyze_btn = gr.Button("Analyze", variant="primary")
|
158 |
|
159 |
+
# Output tabs
|
160 |
with gr.Tabs():
|
161 |
with gr.Tab("Original Text"):
|
162 |
original_text = gr.Markdown()
|
|
|
167 |
with gr.Tab("Topics"):
|
168 |
topics_output = gr.Markdown()
|
169 |
|
170 |
+
def process_analysis(choice, text_val, url_val, file_val, types):
|
171 |
+
"""
|
172 |
+
This function does everything in one place using a 'with gr.Progress() as p:' block,
|
173 |
+
so we can show each step of the process. We add time.sleep(1) just to demonstrate
|
174 |
+
the progress bar (otherwise it may appear/disappear too quickly).
|
175 |
+
"""
|
176 |
+
with gr.Progress() as p:
|
177 |
+
# STEP 1: Retrieve content
|
178 |
+
p(0, total=4, desc="Reading input")
|
179 |
+
time.sleep(1) # For demonstration
|
180 |
+
if choice == "Text":
|
181 |
+
content = text_val or ""
|
182 |
+
elif choice == "URL":
|
183 |
+
content = analyzer.fetch_web_content(url_val or "")
|
184 |
+
else: # File
|
185 |
+
content = analyzer.read_file(file_val)
|
186 |
+
|
187 |
+
if not content or content.startswith("Error"):
|
188 |
+
return content or "No content provided", "", "", ""
|
189 |
+
|
190 |
+
# STEP 2: Summarize
|
191 |
+
p(1, total=4, desc="Summarizing content")
|
192 |
+
time.sleep(1) # For demonstration
|
193 |
+
|
194 |
+
# STEP 3: Sentiment
|
195 |
+
p(2, total=4, desc="Performing sentiment analysis")
|
196 |
+
time.sleep(1) # For demonstration
|
197 |
+
|
198 |
+
# STEP 4: Topics
|
199 |
+
p(3, total=4, desc="Identifying topics")
|
200 |
+
time.sleep(1) # For demonstration
|
201 |
+
|
202 |
+
# After the progress steps, do the actual analysis in one shot
|
203 |
+
# (You could interleave the calls to pipeline with each progress step
|
204 |
+
# if you want real-time progress. This is a simplified approach.)
|
205 |
+
results = analyzer.analyze_content(content, types)
|
206 |
|
207 |
if "error" in results:
|
208 |
return results["error"], "", "", ""
|
209 |
|
|
|
210 |
original = results.get("original_text", "")
|
211 |
summary = results.get("summary", "")
|
212 |
sentiment = ""
|
|
|
216 |
|
217 |
topics = ""
|
218 |
if "topics" in results:
|
219 |
+
t_list = "\n".join([
|
220 |
+
f"- {t['label']}: {t['score']}"
|
221 |
+
for t in results["topics"]
|
222 |
+
])
|
223 |
topics = "**Detected Topics:**\n" + t_list
|
224 |
|
225 |
return original, summary, sentiment, topics
|
|
|
228 |
fn=process_analysis,
|
229 |
inputs=[input_choice, text_input, url_input, file_input, analysis_types],
|
230 |
outputs=[original_text, summary_output, sentiment_output, topics_output],
|
231 |
+
show_progress=True # This ensures the Gradio progress bar is enabled
|
232 |
)
|
233 |
|
234 |
return demo
|