Error Handling + Joe's Suggestion + FileNaming

#3
Files changed (1) hide show
  1. app.py +67 -41
app.py CHANGED
@@ -47,31 +47,24 @@ def load_model_and_predict(
47
  audio_in: str,
48
  model_state: dict,
49
  ):
50
- if audio_in is None:
51
- return (
52
- "",
53
- model_state,
54
- gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
55
- )
56
-
57
- if model_state["model_name"] != model_name:
58
- model_state = {
59
- "loaded_model": pipeline(
60
- task="automatic-speech-recognition", model=model_name
61
- ),
62
- "model_name": model_name,
63
- }
64
-
65
- prediction = model_state["loaded_model"](audio_in)["text"]
66
- return (
67
- prediction,
68
- model_state,
69
- gr.Textbox(
70
- label=TEXTGRID_NAME_INPUT_LABEL,
71
- interactive=True,
72
- value=Path(audio_in).with_suffix(".TextGrid").name,
73
- ),
74
- )
75
 
76
 
77
  def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
@@ -144,6 +137,34 @@ def extract_tier_names(textgrid_file):
144
  return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
145
  except Exception as e:
146
  return gr.update(choices=[], value=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
 
149
  def launch_demo():
@@ -154,10 +175,6 @@ def launch_demo():
154
  "model_name": DEFAULT_MODEL,
155
  }
156
 
157
- # Helper function - enables the interval transcribe button
158
- def enable_interval_transcribe_btn(audio, textgrid):
159
- return gr.update(interactive=(audio is not None and textgrid is not None))
160
-
161
  with gr.Blocks() as demo:
162
  gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
163
  This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
@@ -172,7 +189,7 @@ def launch_demo():
172
 
173
  # Dropdown for transcription type selection
174
  transcription_type = gr.Dropdown(
175
- choices=["Full Audio", "Interval"],
176
  label="Transcription Type",
177
  value=None,
178
  interactive=True,
@@ -187,7 +204,6 @@ def launch_demo():
187
  full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
188
 
189
  full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
190
- full_textgrid_filename = gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False)
191
 
192
  full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
193
  full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
@@ -209,7 +225,7 @@ def launch_demo():
209
  transcription_type.change(
210
  fn=lambda t: (
211
  gr.update(visible=t == "Full Audio"),
212
- gr.update(visible=t == "Interval"),
213
  ),
214
  inputs=transcription_type,
215
  outputs=[full_audio_section, interval_section],
@@ -226,7 +242,7 @@ def launch_demo():
226
  full_transcribe_btn.click(
227
  fn=load_model_and_predict,
228
  inputs=[model_name, full_audio, model_state],
229
- outputs=[full_prediction, model_state, full_textgrid_filename],
230
  )
231
 
232
  full_prediction.change(
@@ -236,25 +252,29 @@ def launch_demo():
236
  )
237
 
238
  full_textgrid_contents.change(
239
- fn=get_interactive_download_button,
240
- inputs=[full_textgrid_contents, full_textgrid_filename],
 
 
 
241
  outputs=[full_download_btn],
242
  )
243
 
 
244
  full_reset_btn.click(
245
  fn=lambda: (None, "", "", "", gr.update(interactive=False)),
246
- outputs=[full_audio, full_prediction, full_textgrid_filename, full_textgrid_contents, full_download_btn],
247
  )
248
 
249
  # Enable interval transcribe button only when both files are uploaded
250
  interval_audio.change(
251
- fn=enable_interval_transcribe_btn,
252
  inputs=[interval_audio, interval_textgrid_file],
253
  outputs=[interval_transcribe_btn],
254
  )
255
 
256
  interval_textgrid_file.change(
257
- fn=enable_interval_transcribe_btn,
258
  inputs=[interval_audio, interval_textgrid_file],
259
  outputs=[interval_transcribe_btn],
260
  )
@@ -273,8 +293,14 @@ def launch_demo():
273
  )
274
 
275
  interval_result.change(
276
- fn=lambda tg_text: gr.update(value=write_textgrid(tg_text, "interval_output.TextGrid"), interactive=True),
277
- inputs=[interval_result],
 
 
 
 
 
 
278
  outputs=[interval_download_btn],
279
  )
280
 
@@ -286,4 +312,4 @@ def launch_demo():
286
  demo.launch(max_file_size="100mb")
287
 
288
  if __name__ == "__main__":
289
- launch_demo()
 
47
  audio_in: str,
48
  model_state: dict,
49
  ):
50
+ try:
51
+ if audio_in is None:
52
+ return (
53
+ "",
54
+ model_state,
55
+ gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
56
+ )
57
+
58
+ if model_state["model_name"] != model_name:
59
+ model_state = {
60
+ "loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
61
+ "model_name": model_name,
62
+ }
63
+
64
+ prediction = model_state["loaded_model"](audio_in)["text"]
65
+ return prediction, model_state
66
+ except Exception as e:
67
+ raise gr.Error(f"Failed to load model: {str(e)}")
 
 
 
 
 
 
 
68
 
69
 
70
  def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
 
137
  return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
138
  except Exception as e:
139
  return gr.update(choices=[], value=None)
140
+
141
+
142
+ def validate_textgrid_for_intervals(audio_path, textgrid_file):
143
+ try:
144
+ if not audio_path or not textgrid_file:
145
+ return gr.update(interactive=False)
146
+
147
+ audio_duration = librosa.get_duration(path=audio_path)
148
+ tg = tgt.io.read_textgrid(textgrid_file.name)
149
+ tg_end_time = max(tier.end_time for tier in tg.tiers)
150
+
151
+ if tg_end_time > audio_duration:
152
+ raise gr.Error(
153
+ f"TextGrid ends at {tg_end_time:.2f}s but audio is only {audio_duration:.2f}s. "
154
+ "Please upload matching files."
155
+ )
156
+
157
+ epsilon = 0.01
158
+ if abs(tg_end_time - audio_duration) > epsilon:
159
+ gr.Warning(
160
+ f"TextGrid ends at {tg_end_time:.2f}s but audio is {audio_duration:.2f}s. "
161
+ "Only the annotated portion will be transcribed."
162
+ )
163
+
164
+ return gr.update(interactive=True)
165
+
166
+ except Exception as e:
167
+ raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}")
168
 
169
 
170
  def launch_demo():
 
175
  "model_name": DEFAULT_MODEL,
176
  }
177
 
 
 
 
 
178
  with gr.Blocks() as demo:
179
  gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
180
  This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
 
189
 
190
  # Dropdown for transcription type selection
191
  transcription_type = gr.Dropdown(
192
+ choices=["Full Audio", "TextGrid Interval"],
193
  label="Transcription Type",
194
  value=None,
195
  interactive=True,
 
204
  full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
205
 
206
  full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
 
207
 
208
  full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
209
  full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
 
225
  transcription_type.change(
226
  fn=lambda t: (
227
  gr.update(visible=t == "Full Audio"),
228
+ gr.update(visible=t == "TextGrid Interval"),
229
  ),
230
  inputs=transcription_type,
231
  outputs=[full_audio_section, interval_section],
 
242
  full_transcribe_btn.click(
243
  fn=load_model_and_predict,
244
  inputs=[model_name, full_audio, model_state],
245
+ outputs=[full_prediction, model_state],
246
  )
247
 
248
  full_prediction.change(
 
252
  )
253
 
254
  full_textgrid_contents.change(
255
+ fn=lambda tg_text, audio_path: get_interactive_download_button(
256
+ tg_text,
257
+ Path(audio_path).with_suffix(".TextGrid").name if audio_path else "output.TextGrid"
258
+ ),
259
+ inputs=[full_textgrid_contents, full_audio],
260
  outputs=[full_download_btn],
261
  )
262
 
263
+
264
  full_reset_btn.click(
265
  fn=lambda: (None, "", "", "", gr.update(interactive=False)),
266
+ outputs=[full_audio, full_prediction, full_textgrid_contents, full_download_btn],
267
  )
268
 
269
  # Enable interval transcribe button only when both files are uploaded
270
  interval_audio.change(
271
+ fn=validate_textgrid_for_intervals,
272
  inputs=[interval_audio, interval_textgrid_file],
273
  outputs=[interval_transcribe_btn],
274
  )
275
 
276
  interval_textgrid_file.change(
277
+ fn=validate_textgrid_for_intervals,
278
  inputs=[interval_audio, interval_textgrid_file],
279
  outputs=[interval_transcribe_btn],
280
  )
 
293
  )
294
 
295
  interval_result.change(
296
+ fn=lambda tg_text, audio_path: gr.update(
297
+ value=write_textgrid(
298
+ tg_text,
299
+ Path(audio_path).with_suffix(".TextGrid").name
300
+ ),
301
+ interactive=True,
302
+ ),
303
+ inputs=[interval_result, interval_audio],
304
  outputs=[interval_download_btn],
305
  )
306
 
 
312
  demo.launch(max_file_size="100mb")
313
 
314
  if __name__ == "__main__":
315
+ launch_demo()