MHamdan committed
Commit 18d6761 · verified · 1 Parent(s): 5f0fd24

Update app.py

Files changed (1): app.py (+110, -55)
app.py CHANGED
@@ -1,4 +1,5 @@
 # app.py
+
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
@@ -6,70 +7,97 @@ from transformers import pipeline
 import PyPDF2
 import docx
 import os
+import time
 from typing import List, Tuple, Optional
-from smolagents import CodeAgent, HfApiModel, Tool
 
 class ContentAnalyzer:
     def __init__(self):
-        # Initialize models
+        print("[DEBUG] Initializing pipelines...")
         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         self.sentiment_analyzer = pipeline("sentiment-analysis")
         self.zero_shot = pipeline("zero-shot-classification")
-
+        print("[DEBUG] Pipelines initialized.")
+
     def read_file(self, file_obj) -> str:
         """Read content from different file types."""
         if file_obj is None:
+            print("[DEBUG] No file uploaded.")
             return ""
-
+
         file_ext = os.path.splitext(file_obj.name)[1].lower()
-
+        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")
+
         try:
             if file_ext == '.txt':
-                return file_obj.read().decode('utf-8')
-
+                content = file_obj.read().decode('utf-8')
+                print("[DEBUG] Successfully read .txt file.")
+                return content
+
             elif file_ext == '.pdf':
+                # Note: For PyPDF2 >= 3.0.0, this usage is valid
                 pdf_reader = PyPDF2.PdfReader(file_obj)
                 text = ""
                 for page in pdf_reader.pages:
                     text += page.extract_text() + "\n"
+                print("[DEBUG] Successfully read .pdf file.")
                 return text
-
+
             elif file_ext == '.docx':
                 doc = docx.Document(file_obj)
-                return "\n".join([paragraph.text for paragraph in doc.paragraphs])
-
+                paragraphs = [paragraph.text for paragraph in doc.paragraphs]
+                print("[DEBUG] Successfully read .docx file.")
+                return "\n".join(paragraphs)
+
             else:
-                return f"Unsupported file type: {file_ext}"
-
+                msg = f"Unsupported file type: {file_ext}"
+                print("[DEBUG]", msg)
+                return msg
+
         except Exception as e:
-            return f"Error reading file: {str(e)}"
+            error_msg = f"Error reading file: {str(e)}"
+            print("[DEBUG]", error_msg)
+            return error_msg
 
     def fetch_web_content(self, url: str) -> str:
         """Fetch content from URL."""
+        print(f"[DEBUG] Attempting to fetch URL: {url}")
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
-
+
             # Remove scripts and styles
             for script in soup(["script", "style"]):
                 script.decompose()
-
+
             text = soup.get_text(separator='\n')
             lines = (line.strip() for line in text.splitlines())
-            return "\n".join(line for line in lines if line)
-
+            final_text = "\n".join(line for line in lines if line)
+            print("[DEBUG] Successfully fetched and cleaned web content.")
+            return final_text
+
         except Exception as e:
-            return f"Error fetching URL: {str(e)}"
-
-    def analyze_content(self,
-                        text: Optional[str] = None,
-                        url: Optional[str] = None,
-                        file: Optional[object] = None,
-                        analysis_types: List[str] = ["summarize"]) -> dict:
-        """Analyze content from text, URL, or file."""
+            error_msg = f"Error fetching URL: {str(e)}"
+            print("[DEBUG]", error_msg)
+            return error_msg
+
+    def analyze_content(
+        self,
+        text: Optional[str] = None,
+        url: Optional[str] = None,
+        file: Optional[object] = None,
+        analysis_types: List[str] = ["summarize"],
+        progress_callback=None
+    ) -> dict:
+        """
+        Analyze content from text, URL, or file.
+        progress_callback is a function for updating progress steps.
+        """
         try:
-            # Get content from appropriate source
+            # Step 1: Retrieve content
+            if progress_callback:
+                progress_callback(1, "Reading input...")
+
             if url:
                 content = self.fetch_web_content(url)
             elif file:
@@ -80,28 +108,37 @@ class ContentAnalyzer:
             if not content or content.startswith("Error"):
                 return {"error": content or "No content provided"}
 
-            results = {
-                "original_text": content[:1000] + "..." if len(content) > 1000 else content
-            }
+            # Truncate for debug
+            truncated = content[:1000] + "..." if len(content) > 1000 else content
+            results = {"original_text": truncated}
 
-            # Perform requested analyses
+            # Step 2: Summarize
             if "summarize" in analysis_types:
+                if progress_callback:
+                    progress_callback(2, "Summarizing content...")
                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                 results["summary"] = summary[0]['summary_text']
 
+            # Step 3: Sentiment
             if "sentiment" in analysis_types:
+                if progress_callback:
+                    progress_callback(3, "Performing sentiment analysis...")
                 sentiment = self.sentiment_analyzer(content[:512])
                 results["sentiment"] = {
                     "label": sentiment[0]['label'],
                     "score": round(sentiment[0]['score'], 3)
                 }
 
+            # Step 4: Topics
             if "topics" in analysis_types:
+                if progress_callback:
+                    progress_callback(4, "Identifying topics...")
                 topics = self.zero_shot(
                     content[:512],
-                    candidate_labels=["technology", "science", "business",
-                                      "politics", "entertainment", "education",
-                                      "health", "sports"]
+                    candidate_labels=[
+                        "technology", "science", "business", "politics",
+                        "entertainment", "education", "health", "sports"
+                    ]
                 )
                 results["topics"] = [
                     {"label": label, "score": round(score, 3)}
@@ -112,15 +149,18 @@ class ContentAnalyzer:
             return results
 
         except Exception as e:
-            return {"error": f"Analysis error: {str(e)}"}
+            error_msg = f"Analysis error: {str(e)}"
+            print("[DEBUG]", error_msg)
+            return {"error": error_msg}
+
 
 def create_interface():
     analyzer = ContentAnalyzer()
-
+
     with gr.Blocks(title="Content Analyzer") as demo:
         gr.Markdown("# 📑 Content Analyzer")
         gr.Markdown("Analyze text content from various sources using AI.")
-
+
         with gr.Tabs():
             # Text Input Tab
             with gr.Tab("Text Input"):
@@ -129,30 +169,30 @@ def create_interface():
                     placeholder="Paste your text here...",
                     lines=5
                 )
-
+
             # URL Input Tab
             with gr.Tab("Web URL"):
                 url_input = gr.Textbox(
                     label="Enter URL",
                     placeholder="https://example.com"
                 )
-
+
            # File Upload Tab
            with gr.Tab("File Upload"):
                file_input = gr.File(
                    label="Upload File",
                    file_types=[".txt", ".pdf", ".docx"]
                )
-
+
        # Analysis Options
        analysis_types = gr.CheckboxGroup(
            choices=["summarize", "sentiment", "topics"],
            value=["summarize"],
            label="Analysis Types"
        )
-
+
        analyze_btn = gr.Button("Analyze", variant="primary")
-
+
        # Output Sections
        with gr.Tabs():
            with gr.Tab("Original Text"):
@@ -163,14 +203,29 @@ def create_interface():
                sentiment_output = gr.Markdown()
            with gr.Tab("Topics"):
                topics_output = gr.Markdown()
-
-        def process_analysis(text, url, file, types):
-            # Get analysis results
-            results = analyzer.analyze_content(text, url, file, types)
+
+        def process_analysis(text, url, file, types, progress=gr.Progress()):
+            """
+            This function is wrapped by gradio to handle user inputs.
+            We use progress to show step-by-step updates.
+            """
+            steps_total = 4  # We have up to 4 possible steps
+
+            def progress_callback(step, desc):
+                progress((step, desc), total=steps_total)
 
+            results = analyzer.analyze_content(
+                text=text,
+                url=url,
+                file=file,
+                analysis_types=types,
+                progress_callback=progress_callback
+            )
+
+            # If there's an error, show it in "Original Text" tab for clarity
             if "error" in results:
                 return results["error"], "", "", ""
-
+
             # Format outputs
             original = results.get("original_text", "")
             summary = results.get("summary", "")
@@ -179,26 +234,26 @@ def create_interface():
             if "sentiment" in results:
                 sent = results["sentiment"]
                 sentiment = f"**Sentiment:** {sent['label']} (Confidence: {sent['score']})"
-
+
             topics = ""
             if "topics" in results:
-                topics = "**Detected Topics:**\n" + "\n".join([
+                topics_list = "\n".join([
                     f"- {t['label']}: {t['score']}"
                     for t in results["topics"]
                 ])
-
+                topics = "**Detected Topics:**\n" + topics_list
+
             return original, summary, sentiment, topics
-
-        # Connect the interface
+
         analyze_btn.click(
             fn=process_analysis,
             inputs=[text_input, url_input, file_input, analysis_types],
-            outputs=[original_text, summary_output, sentiment_output, topics_output]
+            outputs=[original_text, summary_output, sentiment_output, topics_output],
+            show_progress=True  # Enable the progress bar in Gradio
        )
-
+
    return demo
 
-# Launch the app
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()
+    demo.launch()
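
The core of this change is the progress_callback hook that analyze_content now accepts, wired in the UI to Gradio's gr.Progress tracker so each analysis step is reported as it runs. A minimal, self-contained sketch of that pattern, assuming a recent gradio release (slow_task here is hypothetical, standing in for ContentAnalyzer.analyze_content):

import time
import gradio as gr

def slow_task(progress=gr.Progress()):
    # Hypothetical stand-in for ContentAnalyzer.analyze_content: four fixed
    # steps, each reported as a completion fraction plus a description.
    steps = ["Reading input...", "Summarizing content...",
             "Performing sentiment analysis...", "Identifying topics..."]
    for i, desc in enumerate(steps, start=1):
        progress(i / len(steps), desc=desc)  # documented form: float in [0, 1]
        time.sleep(0.5)                      # placeholder for real work
    return "Analysis complete"

demo = gr.Interface(fn=slow_task, inputs=None, outputs="text")

if __name__ == "__main__":
    demo.launch()

One caveat: gr.Progress documents its first argument as a completion fraction (a float in [0, 1]) or an (index, total) tuple, so the committed progress((step, desc), total=steps_total) call may not render desc as intended; progress(step / steps_total, desc=desc) would be the documented equivalent.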