camparchimedes committed on
Commit
04f2c63
·
verified ·
1 Parent(s): 3a0e2ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -188
app.py CHANGED
@@ -37,17 +37,18 @@ from fpdf import FPDF
37
  import psutil
38
  from gpuinfo import GPUInfo
39
 
40
- import numpy as np
41
  import torch
42
- import torchaudio
43
- import torchaudio.transforms as transforms
44
 
45
- from transformers import pipeline, AutoModel
 
 
 
 
 
46
 
47
- import spacy
48
- import networkx as nx
49
- from sklearn.feature_extraction.text import TfidfVectorizer
50
- from sklearn.metrics.pairwise import cosine_similarity
51
  warnings.filterwarnings("ignore")
52
 
53
  # ------------header section------------
@@ -107,124 +108,6 @@ def transcribe(microphone, file_upload, batch_size=15):
107
 
108
  return warn_output + text.strip(), system_info
109
 
110
- # ------------summary section------------
111
-
112
- # ------------for app integration later------------
113
-
114
- nlp = spacy.blank("nb") # codename 'nb' = Norwegian Bokmål
115
- nlp.add_pipe('sentencizer')
116
- spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
117
-
118
- summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
119
- # pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")
120
-
121
- @spaces.GPU()
122
- def clean_text(text):
123
- text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
124
- text = re.sub(r'[^\w\s]', '', text)
125
- text = re.sub(r'\s+', ' ', text).strip()
126
- return text
127
-
128
- @spaces.GPU()
129
- def preprocess_text(text, file_upload):
130
-
131
- if (text is not None) and (file_upload is None):
132
- doc = nlp(text)
133
-
134
- elif (text is None) and (file_upload is not None):
135
- doc = nlp(file_upload)
136
-
137
- stop_words = spacy_stop_words
138
- words = [token.text for token in doc if token.text.lower() not in stop_words]
139
- return ' '.join(words)
140
-
141
- @spaces.GPU()
142
- def summarize_text(text, file_upload):
143
- #
144
- # ----add same if/elif logic as above here----
145
- #
146
- preprocessed_text = preprocess_text(text, file_upload)
147
- inputs = summarization_model(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
148
- inputs = inputs.to(device)
149
- summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
150
- return summarization_model.decode(summary_ids[0], skip_special_tokens=True)
151
-
152
- @spaces.GPU()
153
- def build_similarity_matrix(sentences):
154
- similarity_matrix = nx.Graph()
155
- for i, tokens_a in enumerate(sentences):
156
- for j, tokens_b in enumerate(sentences):
157
- if i != j:
158
- common_words = set(tokens_a) & set(tokens_b)
159
- similarity_matrix.add_edge(i, j, weight=len(common_words))
160
- return similarity_matrix
161
-
162
- # PageRank
163
- @spaces.GPU()
164
- def graph_based_summary(text, file_upload, num_paragraphs=3):
165
- #
166
- # ----add same if/elif logic as above here----
167
- #
168
- sentences = [sent.text for sent in doc.sents]
169
- if len(sentences) < num_paragraphs:
170
- return ' '.join(sentences)
171
-
172
- sentence_tokens = [nlp(sent) for sent in sentences]
173
- stop_words = spacy_stop_words
174
- filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
175
- similarity_matrix = build_similarity_matrix(filtered_tokens)
176
-
177
- scores = nx.pagerank(similarity_matrix)
178
- ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
179
- return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])
180
-
181
- @spaces.GPU()
182
- def lex_rank_summary(text, file_upload, num_paragraphs=3, threshold=0.1):
183
-
184
- if (text is not None) and (file_upload is None):
185
- doc = nlp(text)
186
-
187
- elif (text is None) and (file_upload is not None):
188
- doc = nlp(file_upload)
189
-
190
- sentences = [sent.text for sent in doc.sents]
191
- if len(sentences) < num_paragraphs:
192
- return ' '.join(sentences)
193
-
194
- stop_words = spacy_stop_words
195
- vectorizer = TfidfVectorizer(stop_words=list(stop_words))
196
- X = vectorizer.fit_transform(sentences)
197
- similarity_matrix = cosine_similarity(X, X)
198
-
199
- # Apply threshold@similarity matrix
200
- similarity_matrix[similarity_matrix < threshold] = 0
201
- nx_graph = nx.from_numpy_array(similarity_matrix)
202
- scores = nx.pagerank(nx_graph)
203
- ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
204
- return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
205
-
206
- @spaces.GPU()
207
- def text_rank_summary(text, file_upload, num_paragraphs=3):
208
-
209
- if (text is not None) and (file_upload is not None):
210
- doc = nlp(text)
211
-
212
- elif (text is None) and (file_upload is not None):
213
- doc = nlp(file_upload)
214
-
215
- sentences = [sent.text for sent in doc.sents]
216
- if len(sentences) < num_paragraphs:
217
- return ' '.join(sentences)
218
-
219
- stop_words = spacy_stop_words
220
- vectorizer = TfidfVectorizer(stop_words=list(stop_words))
221
- X = vectorizer.fit_transform(sentences)
222
- similarity_matrix = cosine_similarity(X, X)
223
-
224
- nx_graph = nx.from_numpy_array(similarity_matrix)
225
- scores = nx.pagerank(nx_graph)
226
- ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
227
- return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
228
 
229
  def save_to_pdf(text, summary):
230
  pdf = FPDF()
@@ -245,6 +128,7 @@ def save_to_pdf(text, summary):
245
  pdf.output(pdf_output_path)
246
  return pdf_output_path
247
 
 
248
  iface = gr.Blocks()
249
 
250
  with iface:
@@ -262,71 +146,12 @@ with iface:
262
  transcribe_btn = gr.Button("Transcribe Interview")
263
  text_output = gr.Textbox()
264
  system_info = gr.Textbox(label="System Info")
265
-
266
- # Corrected the order of arguments here to prevent the SyntaxError
267
  transcribe_btn.click(fn=transcribe, inputs=[microphone, upload], outputs=[text_output, system_info])
268
 
269
  with gr.Tabs():
270
-
271
- with gr.TabItem("Summary | PageRank"):
272
- text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
273
- summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
274
-
275
- gr.Markdown("""
276
- **token-based**: similarity matrix edge weights representing token overlap/
277
- ranked by their centrality in the graph (good with dense inter-sentence relationships)
278
- """)
279
- gr.Markdown("""
280
- *Bjørn*: **gir sammendrag som fanger opp de mest relevante setninger i teksten**
281
- """)
282
-
283
- summarize_transcribed_button_graph = gr.Button("Summary of Transcribed Text, Click Here")
284
- summarize_transcribed_button_graph.click(fn=lambda text: graph_based_summary(text, None), inputs=[text_input_graph], outputs=[summary_output_graph])
285
- summarize_uploaded_button_graph = gr.Button("Upload Text to Summarize, Click Here")
286
- summarize_uploaded_button_graph.click(fn=graph_based_summary, inputs=[None, upload], outputs=[summary_output_graph])
287
-
288
- with gr.TabItem("Summary | LexRank"):
289
- text_output = gr.Textbox(label="Transcription Output")
290
- text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
291
- summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
292
-
293
- gr.Markdown("""
294
- **semantic**: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
295
- (good for sparse graph structures with thresholding)
296
- """)
297
- gr.Markdown("""
298
- *Bjørn*: **gir sammendrag som best fanger opp betydningen av hele teksten**
299
- """)
300
-
301
- summarize_transcribed_button_lex = gr.Button("Summary of Transcribed Text, Click Here")
302
- summarize_transcribed_button_lex.click(fn=lambda text: lex_rank_summary(text, None), inputs=[text_input_lex], outputs=[summary_output_lex])
303
- summarize_uploaded_button_lex = gr.Button("Upload Text to Summarize, Click Here")
304
- summarize_uploaded_button_lex.click(fn=lex_rank_summary, inputs=[None, upload], outputs=[summary_output_lex])
305
-
306
- with gr.TabItem("Summary | TextRank"):
307
- text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
308
- summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
309
-
310
- gr.Markdown("""
311
- **sentence**: graph with weighted edges based on lexical similarity. (i.e" "sentence similarity"word overlap)/sentence similarity
312
- """)
313
- gr.Markdown("""
314
- *Bjørn*: **sammendrag basert på i de setningene som ligner mest på hverandre fra teksten**
315
- """)
316
-
317
- summarize_transcribed_button_text_rank = gr.Button("Summary of Transcribed Text, Click Here")
318
- summarize_transcribed_button_text_rank.click(fn=lambda text: text_rank_summary(text, None), inputs=[text_input_text_rank], outputs=[summary_output_text_rank])
319
- summarize_uploaded_button_text_rank = gr.Button("Upload Text to Summarize, Click Here")
320
- summarize_uploaded_button_text_rank.click(fn=text_rank_summary, inputs=[None, upload], outputs=[summary_output_text_rank])
321
-
322
  with gr.TabItem("Download PDF"):
323
- pdf_text_only = gr.Button("Download PDF with Transcribed Text Only")
324
- pdf_summary_only = gr.Button("Download PDF with Summary-of-Transcribed-Text Only")
325
- pdf_both = gr.Button("Download PDF with Both")
326
-
327
  pdf_output = gr.File(label="Download PDF")
328
 
329
- pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
330
- pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output]) # Includes all summary outputs
331
- pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output_graph], outputs=[pdf_output])
332
-
 
37
  import psutil
38
  from gpuinfo import GPUInfo
39
 
40
+ #import numpy as np
41
  import torch
42
+ #import torchaudio
43
+ #import torchaudio.transforms as transforms
44
 
45
+ from transformers import pipeline #AutoModel
46
+
47
+ #import spacy
48
+ #import networkx as nx
49
+ #from sklearn.feature_extraction.text import TfidfVectorizer
50
+ #from sklearn.metrics.pairwise import cosine_similarity
51
 
 
 
 
 
52
  warnings.filterwarnings("ignore")
53
 
54
  # ------------header section------------
 
108
 
109
  return warn_output + text.strip(), system_info
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  def save_to_pdf(text, summary):
113
  pdf = FPDF()
 
128
  pdf.output(pdf_output_path)
129
  return pdf_output_path
130
 
131
+
132
  iface = gr.Blocks()
133
 
134
  with iface:
 
146
  transcribe_btn = gr.Button("Transcribe Interview")
147
  text_output = gr.Textbox()
148
  system_info = gr.Textbox(label="System Info")
149
+
 
150
  transcribe_btn.click(fn=transcribe, inputs=[microphone, upload], outputs=[text_output, system_info])
151
 
152
  with gr.Tabs():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  with gr.TabItem("Download PDF"):
154
+ pdf_text_only = gr.Button("Download PDF with Transcribed Text")
 
 
 
155
  pdf_output = gr.File(label="Download PDF")
156
 
157
+ pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])