camparchimedes committed · Commit 2fb8a5f · verified · 1 Parent(s): 1bc35a1

Update app.py

Files changed (1):
  1. app.py +25 -18
app.py CHANGED
@@ -58,11 +58,12 @@ SIDEBAR_INFO = f"""
 </div>
 """
 
-# ------------transcribe section------------
+# ------------transcribe section------------
+
 
-@spaces.GPU()
 
 # ============ORIGINAL============[convert m4a audio to wav]
+@spaces.GPU()
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
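
The hunk above moves the @spaces.GPU() decorator (the ZeroGPU scheduling hint) onto convert_to_wav. A minimal sketch of that conversion step, assuming pydub with ffmpeg available; the export and return lines are not visible in the diff and are an assumption:

# Sketch of the m4a -> wav step; pydub delegates decoding to ffmpeg.
from pydub import AudioSegment

def convert_to_wav(audio_file: str) -> str:
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")  # assumed continuation: write PCM wav
    return wav_file

Note that the conversion itself is CPU-only; the decorator matters only for how Spaces schedules the call.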
@@ -79,19 +80,19 @@ def transcribe_audio(audio_file, batch_size=16):
         audio_file = convert_to_wav(audio_file)
 
     with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
-        # Copy the contents of the uploaded audio file to the temporary file
+        # --copy contents of uploaded audio file to temporary file
         temp_audio_file.write(open(audio_file, "rb").read())
         temp_audio_file.flush()
-        # Load the audio file using torchaudio
+        # --use torchaudio to load it
        waveform, sample_rate = torchaudio.load(temp_audio_file.name)
-        # Resample the audio to 16kHz
+        # --resample to 16kHz
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        waveform = resampler(waveform)
 
        # --convert to mono
        if waveform.ndim > 1:
            waveform = waveform[0, :]
-        # Convert PyTorch tensor to NumPy ndarray
+        # --convert tensor to NumPy ndarray
        waveform = waveform.numpy()
 
        start_time = time.time()
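
This hunk only rewords comments, but the pipeline it annotates is the ASR preprocessing path: load with torchaudio, resample to 16 kHz, collapse to mono, hand over a NumPy array. A self-contained sketch of exactly those steps (the function name is mine; the 16000 Hz target and first-channel mono choice match the diff):

# Load -> resample to 16 kHz -> mono -> 1-D ndarray, as in the hunk above.
import torchaudio

def load_waveform_16k(path: str):
    waveform, sample_rate = torchaudio.load(path)  # shape: [channels, frames]
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    waveform = resampler(waveform)                 # now at 16 kHz
    if waveform.ndim > 1:
        waveform = waveform[0, :]                  # keep first channel, per the diff
    return waveform.numpy()                        # HF ASR pipelines accept a 1-D ndarray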
@@ -126,35 +127,36 @@ def transcribe_audio(audio_file, batch_size=16):
     return text.strip(), system_info
 
 
-# ------------summary section------------
+# ------------summaries section------------
 
-
-# ------------for app integration later------------
+# [------------for app integration later------------]
 
 @spaces.GPU()
+# --btw, who is doing this...?
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-nlp = spacy.blank("nb")  # 'nb' ==> codename = Norwegian Bokmål
+# --SpaCy params
+nlp = spacy.blank("nb")  # ---==> codename ("norsk bokmål")
 nlp.add_pipe('sentencizer')
 spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
 
+# --model (has tokenizer?)
 summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
-# pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")
+# pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")  -----hm..
 
+# --process text with SpaCy
 @spaces.GPU()
 def preprocess_text(text):
-    # Process the text with SpaCy
     doc = nlp(text)
-    # SpaCy's stop words, used directly
     stop_words = spacy_stop_words
-    # Filter out stop words
     words = [token.text for token in doc if token.text.lower() not in stop_words]
     return ' '.join(words)
 
+# --model is called to summarize (needs to be placed *after* the three styles and call them)
 @spaces.GPU()
 def summarize_text(text):
     preprocessed_text = preprocess_text(text)
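
Two of the new comments above ask questions worth pinning down. A minimal sketch of the spaCy side ("nb" is spaCy's language code for Norwegian Bokmål); the function body mirrors the diff, the rest is standard spaCy/transformers usage:

import spacy
from spacy.lang.nb.stop_words import STOP_WORDS

nlp = spacy.blank("nb")      # blank pipeline: tokenizer only, no trained components
nlp.add_pipe("sentencizer")  # rule-based sentence boundaries, needed by the summarizers

def preprocess_text(text: str) -> str:
    doc = nlp(text)
    return " ".join(tok.text for tok in doc if tok.text.lower() not in STOP_WORDS)

# Answering the "(has tokenizer?)" comment: AutoModel.from_pretrained loads
# weights only; the tokenizer is loaded separately, e.g.:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")
# Also note AutoModel returns the bare encoder, which cannot generate
# summaries on its own -- consistent with the commented-out pipeline line.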
@@ -172,8 +174,10 @@ def build_similarity_matrix(sentences, stop_words):
             common_words = set(tokens_a) & set(tokens_b)
             similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
+# [------------model needs to be called for these------------]
+
 
-# PageRank
+# --PageRank
 @spaces.GPU()
 def graph_based_summary(text, num_paragraphs=3):
     doc = nlp(text)
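
build_similarity_matrix is only partially visible in this diff; the visible lines show a networkx graph whose edge weights count shared words between sentence pairs. A hedged reconstruction of that graph plus the "--PageRank" ranking that graph_based_summary applies to it (the token-filtering loop and rank_sentences helper are assumptions):

# Edge weight = shared-word count; rank nodes with PageRank.
import networkx as nx

def build_similarity_matrix(sentences, stop_words):
    graph = nx.Graph()
    for i, a in enumerate(sentences):
        for j, b in enumerate(sentences):
            if i == j:
                continue
            tokens_a = [w for w in a.lower().split() if w not in stop_words]
            tokens_b = [w for w in b.lower().split() if w not in stop_words]
            common_words = set(tokens_a) & set(tokens_b)
            graph.add_edge(i, j, weight=len(common_words))
    return graph

def rank_sentences(sentences, stop_words, k=3):
    graph = build_similarity_matrix(sentences, stop_words)
    scores = nx.pagerank(graph, weight="weight")
    ranked = sorted(((scores.get(i, 0.0), s) for i, s in enumerate(sentences)), reverse=True)
    return " ".join(s for _, s in ranked[:k])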
@@ -190,7 +194,7 @@ def graph_based_summary(text, num_paragraphs=3):
     ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
     return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])
 
-# LexRank
+# --LexRank
 @spaces.GPU()
 def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     doc = nlp(text)
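
lex_rank_summary's body falls mostly outside this diff; only its signature (threshold=0.1) and sentence splitting are shown. A common LexRank-style sketch consistent with that signature — TF-IDF cosine similarity, weak edges dropped below the threshold, then PageRank — offered as a plausible reading, not app.py's confirmed implementation (app.py splits text into sentences with the spaCy sentencizer first):

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def lex_rank_summary(sentences, num_paragraphs=3, threshold=0.1):
    tfidf = TfidfVectorizer().fit_transform(sentences)
    sim = cosine_similarity(tfidf)    # dense [n, n] similarity matrix
    sim[sim < threshold] = 0.0        # drop weak links
    graph = nx.from_numpy_array(sim)
    scores = nx.pagerank(graph, weight="weight")
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return " ".join(s for _, s in ranked[:num_paragraphs])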
@@ -210,7 +214,7 @@ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 
-# TextRank
+# --TextRank
 @spaces.GPU()
 def text_rank_summary(text, num_paragraphs=3):
     doc = nlp(text)
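
The earlier comment says summarize_text "needs to be placed *after* the three styles and call them". A hypothetical dispatcher in that spirit — the method parameter and the mapping are assumptions, not shown in the diff, and how nb-bert-large itself participates is left open by the commit:

def summarize_text(text, method="pagerank"):  # hypothetical signature
    preprocessed_text = preprocess_text(text)
    summarizers = {
        "pagerank": graph_based_summary,
        "lexrank": lex_rank_summary,
        "textrank": text_rank_summary,
    }
    return summarizers[method](preprocessed_text)

Defining it below the three functions, as the comment suggests, keeps the name lookups valid at call time.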
@@ -228,8 +232,11 @@ def text_rank_summary(text, num_paragraphs=3):
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 
-iface = gr.Blocks()
+
 
+# ------------interface section------------
+
+iface = gr.Blocks()
 with iface:
     gr.HTML(SIDEBAR_INFO)
     gr.Markdown(HEADER_INFO)
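
Only the first three lines of the new "interface section" are visible. A sketch of how such a gr.Blocks app is typically wired up, reusing app.py's transcribe_audio (which returns a transcript and a system-info string); the component names and the click hookup are assumptions for illustration:

import gradio as gr

iface = gr.Blocks()
with iface:
    gr.HTML(SIDEBAR_INFO)        # globals defined earlier in app.py
    gr.Markdown(HEADER_INFO)
    audio_in = gr.Audio(type="filepath", label="Upload audio")
    text_out = gr.Textbox(label="Transcription")
    info_out = gr.Textbox(label="System info")
    gr.Button("Transcribe").click(
        fn=transcribe_audio, inputs=audio_in, outputs=[text_out, info_out]
    )

iface.launch()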
 