MHamdan committed
Commit 523e9ce · verified · 1 Parent(s): 2af5feb

update app

Files changed (1)
  1. app.py +81 -73
app.py CHANGED
@@ -1,5 +1,3 @@
-# app.py
-
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
@@ -7,96 +5,62 @@ from transformers import pipeline
 import PyPDF2
 import docx
 import os
-import time
-from typing import List, Tuple, Optional
+from typing import List, Optional
 
 class ContentAnalyzer:
     def __init__(self):
-        print("[DEBUG] Initializing pipelines...")
         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         self.sentiment_analyzer = pipeline("sentiment-analysis")
         self.zero_shot = pipeline("zero-shot-classification")
-        print("[DEBUG] Pipelines initialized.")
 
     def read_file(self, file_obj) -> str:
         """Read content from different file types."""
         if file_obj is None:
-            print("[DEBUG] No file uploaded.")
             return ""
-
         file_ext = os.path.splitext(file_obj.name)[1].lower()
-        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")
-
         try:
             if file_ext == '.txt':
-                content = file_obj.read().decode('utf-8')
-                print("[DEBUG] Successfully read .txt file.")
-                return content
-
+                return file_obj.read().decode('utf-8')
             elif file_ext == '.pdf':
-                # Note: For PyPDF2 >= 3.0.0, this usage is valid
                 pdf_reader = PyPDF2.PdfReader(file_obj)
                 text = ""
                 for page in pdf_reader.pages:
                     text += page.extract_text() + "\n"
-                print("[DEBUG] Successfully read .pdf file.")
                 return text
-
             elif file_ext == '.docx':
                 doc = docx.Document(file_obj)
-                paragraphs = [paragraph.text for paragraph in doc.paragraphs]
-                print("[DEBUG] Successfully read .docx file.")
-                return "\n".join(paragraphs)
-
+                return "\n".join([paragraph.text for paragraph in doc.paragraphs])
             else:
-                msg = f"Unsupported file type: {file_ext}"
-                print("[DEBUG]", msg)
-                return msg
-
+                return f"Unsupported file type: {file_ext}"
         except Exception as e:
-            error_msg = f"Error reading file: {str(e)}"
-            print("[DEBUG]", error_msg)
-            return error_msg
+            return f"Error reading file: {str(e)}"
 
     def fetch_web_content(self, url: str) -> str:
         """Fetch content from URL."""
-        print(f"[DEBUG] Attempting to fetch URL: {url}")
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Remove scripts and styles
             for script in soup(["script", "style"]):
                 script.decompose()
-
             text = soup.get_text(separator='\n')
             lines = (line.strip() for line in text.splitlines())
-            final_text = "\n".join(line for line in lines if line)
-            print("[DEBUG] Successfully fetched and cleaned web content.")
-            return final_text
-
+            return "\n".join(line for line in lines if line)
         except Exception as e:
-            error_msg = f"Error fetching URL: {str(e)}"
-            print("[DEBUG]", error_msg)
-            return error_msg
+            return f"Error fetching URL: {str(e)}"
 
     def analyze_content(
-        self,
+        self,
         text: Optional[str] = None,
         url: Optional[str] = None,
         file: Optional[object] = None,
         analysis_types: List[str] = ["summarize"],
         progress_callback=None
     ) -> dict:
-        """
-        Analyze content from text, URL, or file.
-        progress_callback is a function for updating progress steps.
-        """
         try:
-            # Step 1: Retrieve content
+            # STEP 1: Retrieve content
             if progress_callback:
-                progress_callback(1, "Reading input...")
+                progress_callback(1, "Reading input")
 
             if url:
                 content = self.fetch_web_content(url)
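Note on read_file: depending on the Gradio version and how gr.File is configured, the uploaded object can be a temp-file path rather than a live binary handle, in which case file_obj.read() would fail for .txt uploads. A defensive sketch, assuming a plain filesystem path such as gr.File(type="filepath") returns — read_path is a hypothetical helper, not part of this commit:

```python
# Sketch only, not part of the commit: read content given a filesystem path.
import os
import PyPDF2
import docx

def read_path(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext == '.txt':
        with open(path, encoding='utf-8') as f:  # open explicitly
            return f.read()
    elif ext == '.pdf':
        reader = PyPDF2.PdfReader(path)  # PdfReader accepts a path or a stream
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    elif ext == '.docx':
        return "\n".join(p.text for p in docx.Document(path).paragraphs)
    return f"Unsupported file type: {ext}"
```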
@@ -108,31 +72,30 @@ class ContentAnalyzer:
             if not content or content.startswith("Error"):
                 return {"error": content or "No content provided"}
 
-            # Truncate for debug
             truncated = content[:1000] + "..." if len(content) > 1000 else content
             results = {"original_text": truncated}
 
-            # Step 2: Summarize
+            # STEP 2: Summarize
             if "summarize" in analysis_types:
                 if progress_callback:
-                    progress_callback(2, "Summarizing content...")
+                    progress_callback(2, "Summarizing content")
                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                 results["summary"] = summary[0]['summary_text']
 
-            # Step 3: Sentiment
+            # STEP 3: Sentiment
             if "sentiment" in analysis_types:
                 if progress_callback:
-                    progress_callback(3, "Performing sentiment analysis...")
+                    progress_callback(3, "Performing sentiment analysis")
                 sentiment = self.sentiment_analyzer(content[:512])
                 results["sentiment"] = {
                     "label": sentiment[0]['label'],
                     "score": round(sentiment[0]['score'], 3)
                 }
 
-            # Step 4: Topics
+            # STEP 4: Topics
             if "topics" in analysis_types:
                 if progress_callback:
-                    progress_callback(4, "Identifying topics...")
+                    progress_callback(4, "Identifying topics")
                 topics = self.zero_shot(
                     content[:512],
                     candidate_labels=[
@@ -149,9 +112,8 @@ class ContentAnalyzer:
             return results
 
         except Exception as e:
-            error_msg = f"Analysis error: {str(e)}"
-            print("[DEBUG]", error_msg)
-            return {"error": error_msg}
+            return {"error": f"Analysis error: {str(e)}"}
+
 
 def create_interface():
     analyzer = ContentAnalyzer()
@@ -160,12 +122,47 @@ def create_interface():
     gr.Markdown("# 📑 Content Analyzer")
     gr.Markdown("Analyze text content from various sources using AI.")
 
-    with gr.Tabs():
-        # Tabs for Text Input, Web URL, File Upload...
-        text_input = gr.Textbox(label="Enter Text", placeholder="Paste your text here...", lines=5)
-        url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
-        file_input = gr.File(label="Upload File", file_types=[".txt", ".pdf", ".docx"])
+    # Dropdown to choose input type
+    input_choice = gr.Dropdown(
+        choices=["Text", "URL", "File"],
+        value="Text",
+        label="Select Input Type"
+    )
+
+    # Containers for each input type
+    with gr.Column(visible=True) as text_col:
+        text_input = gr.Textbox(
+            label="Enter Text",
+            placeholder="Paste your text here...",
+            lines=5
+        )
+    with gr.Column(visible=False) as url_col:
+        url_input = gr.Textbox(
+            label="Enter URL",
+            placeholder="https://example.com"
+        )
+    with gr.Column(visible=False) as file_col:
+        file_input = gr.File(
+            label="Upload File",
+            file_types=[".txt", ".pdf", ".docx"]
+        )
+
+    # Callback function to show/hide input columns
+    def show_inputs(choice):
+        return {
+            text_col: choice == "Text",
+            url_col: choice == "URL",
+            file_col: choice == "File"
+        }
+
+    # Trigger showing/hiding based on the dropdown choice
+    input_choice.change(
+        fn=show_inputs,
+        inputs=[input_choice],
+        outputs=[text_col, url_col, file_col]
+    )
 
+    # Analysis Options
     analysis_types = gr.CheckboxGroup(
         choices=["summarize", "sentiment", "topics"],
         value=["summarize"],
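Note on show_inputs: returning raw booleans maps each Column to a component value, not to its visibility, so the columns may not actually toggle. The usual pattern in recent Gradio releases is to return visibility updates — a sketch reusing the columns defined above, where gr.update is Gradio's generic property-update helper:

```python
# Sketch: toggle visibility via gr.update instead of raw booleans.
def show_inputs(choice):
    return {
        text_col: gr.update(visible=(choice == "Text")),
        url_col: gr.update(visible=(choice == "URL")),
        file_col: gr.update(visible=(choice == "File")),
    }
```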
@@ -174,6 +171,7 @@ def create_interface():
 
     analyze_btn = gr.Button("Analyze", variant="primary")
 
+    # Output Sections in tabs
     with gr.Tabs():
         with gr.Tab("Original Text"):
             original_text = gr.Markdown()
@@ -184,22 +182,32 @@ def create_interface():
         with gr.Tab("Topics"):
             topics_output = gr.Markdown()
 
-    def process_analysis(text, url, file, types, progress=gr.Progress()):
+    def process_analysis(choice, text, url, file, types, progress=gr.Progress()):
+        """Orchestrates analysis depending on input choice."""
         steps_total = 4
 
         def progress_callback(step: int, desc: str):
-            """
-            step: integer step index (1 to steps_total)
-            desc: a short description of the current step
-            """
-            # Pass the integer 'step' as iteration, and the string 'desc' as desc.
             progress(step, total=steps_total, desc=desc)
 
-        # Call your analyzer
+        # Determine which content to pass based on the input choice
+        if choice == "Text":
+            content_text = text
+            content_url = None
+            content_file = None
+        elif choice == "URL":
+            content_text = None
+            content_url = url
+            content_file = None
+        else:  # choice == "File"
+            content_text = None
+            content_url = None
+            content_file = file
+
+        # Perform analysis
         results = analyzer.analyze_content(
-            text=text,
-            url=url,
-            file=file,
+            text=content_text,
+            url=content_url,
+            file=content_file,
             analysis_types=types,
             progress_callback=progress_callback
        )
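Note on progress_callback: gr.Progress reliably accepts a fraction in [0, 1]; if the integer-plus-total call above is not supported by the installed Gradio version, a fraction is a safe fallback (sketch):

```python
# Sketch: the fraction form is the most portable gr.Progress call.
def progress_callback(step: int, desc: str):
    progress(step / steps_total, desc=desc)
```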
@@ -224,7 +232,7 @@ def create_interface():
 
     analyze_btn.click(
         fn=process_analysis,
-        inputs=[text_input, url_input, file_input, analysis_types],
+        inputs=[input_choice, text_input, url_input, file_input, analysis_types],
         outputs=[original_text, summary_output, sentiment_output, topics_output],
         show_progress=True
     )
@@ -233,4 +241,4 @@ def create_interface():
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()
+    demo.launch()
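To sanity-check the analyzer without launching the UI, a minimal sketch — assuming the file is saved as app.py and the transformers models can be downloaded:

```python
# Minimal smoke test for ContentAnalyzer, bypassing the Gradio interface.
from app import ContentAnalyzer

analyzer = ContentAnalyzer()
results = analyzer.analyze_content(
    text="Gradio makes it simple to wrap a transformers pipeline in a web UI.",
    analysis_types=["summarize", "sentiment"],
)
print(results.get("summary"))
print(results.get("sentiment"))
```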