ginic committed on
Commit
716f8aa
·
verified ·
1 Parent(s): ef1761e

TextGrid Interval Support + UI Changes + Dependency Update (#2)

Browse files

- Interval+UI+Dependency (4c2f3f651c086f6dd72cb7ff211c74f7e67b42fc)

Files changed (2) hide show
  1. app.py +150 -36
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,18 +1,21 @@
 
1
  from pathlib import Path
2
  import tempfile
3
-
4
  import gradio as gr
5
  import librosa
6
  import tgt.core
7
  import tgt.io3
 
8
  from transformers import pipeline
9
 
 
10
  TEXTGRID_DIR = tempfile.mkdtemp()
11
  DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
12
  TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
13
  TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"
14
 
15
-
16
  VALID_MODELS = [
17
  "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
18
  "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
@@ -105,6 +108,44 @@ def get_interactive_download_button(textgrid_contents, textgrid_filename):
105
  )
106
 
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def launch_demo():
109
  initial_model = {
110
  "loaded_model": pipeline(
@@ -113,63 +154,136 @@ def launch_demo():
113
  "model_name": DEFAULT_MODEL,
114
  }
115
 
 
 
 
 
116
  with gr.Blocks() as demo:
117
- gr.Markdown(
118
- """# Automatic International Phonetic Alphabet Transcription
119
- This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""",
120
- )
121
  model_name = gr.Dropdown(
122
  VALID_MODELS,
123
  value=DEFAULT_MODEL,
124
  label="IPA transcription ASR model",
125
  info="Select the model to use for prediction.",
126
  )
127
- audio_in = gr.Audio(type="filepath", show_download_button=True)
 
 
 
 
 
 
 
 
128
  model_state = gr.State(value=initial_model)
129
 
130
- prediction = gr.Textbox(label="Predicted IPA transcription")
 
 
 
 
131
 
132
- gr.Markdown("""## TextGrid File Options
133
- Change these inputs if you'd like to customize and download the transcription in [TextGrid format](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) for Praat.
134
- """)
135
- textgrid_tier = gr.Textbox(
136
- label="TextGrid Tier Name", value="transcription", interactive=True
137
- )
138
 
139
- textgrid_filename = gr.Textbox(
140
- label=TEXTGRID_NAME_INPUT_LABEL, interactive=False
141
- )
142
 
143
- textgrid_contents = gr.Textbox(
144
- label="TextGrid Contents",
145
- value=get_textgrid_contents,
146
- inputs=[audio_in, textgrid_tier, prediction],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  )
148
 
149
- download_btn = gr.DownloadButton(
150
- label=TEXTGRID_DOWNLOAD_TEXT,
151
- interactive=False, # Don't allow download button to be active until an upload happened
152
- variant="primary",
 
153
  )
154
 
155
- # Update prediction if model or audio changes
156
- gr.on(
157
- triggers=[audio_in.input, model_name.change],
158
  fn=load_model_and_predict,
159
- inputs=[model_name, audio_in, model_state],
160
- outputs=[prediction, model_state, textgrid_filename],
 
 
 
 
 
 
161
  )
162
 
163
- # Download button becomes interactive if user updates audio or textgrid params
164
- gr.on(
165
- triggers=[textgrid_contents.change, textgrid_filename.change],
166
  fn=get_interactive_download_button,
167
- inputs=[textgrid_contents, textgrid_filename],
168
- outputs=[download_btn],
169
  )
170
 
171
- demo.launch(max_file_size="100mb")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  if __name__ == "__main__":
175
  launch_demo()
 
1
+ # Imports
2
  from pathlib import Path
3
  import tempfile
4
+ import os
5
  import gradio as gr
6
  import librosa
7
  import tgt.core
8
  import tgt.io3
9
+ import soundfile as sf
10
  from transformers import pipeline
11
 
12
+ # Constants
13
  TEXTGRID_DIR = tempfile.mkdtemp()
14
  DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
15
  TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
16
  TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"
17
 
18
+ # Selection of models
19
  VALID_MODELS = [
20
  "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
21
  "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
 
108
  )
109
 
110
 
111
def transcribe_intervals(audio_in, textgrid_path, source_tier, target_tier, model_state):
    """Transcribe each labelled interval of a TextGrid tier into a new tier.

    The TextGrid is read from ``textgrid_path.name``. For every interval of the
    tier named ``source_tier`` whose text is not blank, the matching slice of
    ``audio_in`` is loaded, written to a temporary WAV file and passed to
    ``model_state["loaded_model"]``; the ``"text"`` entry of the result becomes
    the label of an interval with the same start and end time in a new tier
    named ``target_tier``.

    Args:
        audio_in: Path of the audio file, as accepted by ``librosa.load``.
        textgrid_path: Uploaded TextGrid file object exposing a ``.name`` path.
        source_tier: Name of the existing tier whose intervals are transcribed.
        target_tier: Name given to the newly created tier.
        model_state: Mapping whose ``"loaded_model"`` entry is a callable that
            takes an audio file path and returns a mapping with a ``"text"`` key.

    Returns:
        The TextGrid, including the new tier, exported in long TextGrid format
        as a string; or the message ``"Missing audio or TextGrid input file."``
        when either input is ``None``.

    Note:
        Errors from reading the TextGrid or looking up ``source_tier`` are not
        caught here and propagate to the caller. A failure while processing a
        single interval is recorded as an ``"[Error]: ..."`` label on that
        interval so the remaining intervals are still transcribed.
    """
    if audio_in is None or textgrid_path is None:
        return "Missing audio or TextGrid input file."

    tg = tgt.io.read_textgrid(textgrid_path.name)
    tier = tg.get_tier_by_name(source_tier)
    ipa_tier = tgt.core.IntervalTier(name=target_tier)

    for interval in tier.intervals:
        if not interval.text.strip():  # Skip empty text intervals
            continue

        start, end = interval.start_time, interval.end_time
        temp_path = None
        try:
            y, sr = librosa.load(audio_in, sr=None, offset=start, duration=end - start)
            # Only reserve a unique path here; the handle is closed before the
            # file is written by name, because an open NamedTemporaryFile cannot
            # be reopened on every platform.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                temp_path = temp_audio.name
            sf.write(temp_path, y, sr)
            label = model_state["loaded_model"](temp_path)["text"]
        except Exception as e:
            # Best effort per interval: keep going and surface the failure in the tier.
            label = f"[Error]: {str(e)}"
        finally:
            # Always remove the temp file, including when loading, writing or
            # prediction failed part-way through.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    # A failed cleanup must not discard the transcription.
                    pass

        # Exactly one annotation per interval, added outside the try block so a
        # failure here cannot trigger a second, overlapping annotation.
        ipa_tier.add_annotation(tgt.core.Interval(start, end, label))

    tg.add_tier(ipa_tier)
    return tgt.io3.export_to_long_textgrid(tg)
138
+
139
+
140
def extract_tier_names(textgrid_file):
    """Build a dropdown update listing the tier names of an uploaded TextGrid.

    Args:
        textgrid_file: Uploaded file object exposing a ``.name`` path, or
            ``None`` when no file is present.

    Returns:
        A ``gr.update`` whose ``choices`` are the tier names in file order and
        whose ``value`` is the first tier name. When there is no file, the file
        cannot be read, or it has no tiers, ``choices`` is empty and ``value``
        is ``None``.
    """
    if textgrid_file is None:
        # Nothing uploaded (or the upload was cleared): empty the dropdown.
        return gr.update(choices=[], value=None)

    try:
        tg = tgt.io.read_textgrid(textgrid_file.name)
        names = [tier.name for tier in tg.tiers]
    except Exception:
        # Deliberately broad: any unreadable or malformed file just leaves the
        # dropdown empty instead of raising into the UI.
        return gr.update(choices=[], value=None)

    return gr.update(choices=names, value=names[0] if names else None)
147
+
148
+
149
  def launch_demo():
150
  initial_model = {
151
  "loaded_model": pipeline(
 
154
  "model_name": DEFAULT_MODEL,
155
  }
156
 
157
+ # Helper function - enables the interval transcribe button
158
+ def enable_interval_transcribe_btn(audio, textgrid):
159
+ return gr.update(interactive=(audio is not None and textgrid is not None))
160
+
161
  with gr.Blocks() as demo:
162
+ gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
163
+ This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
164
+
165
+ # Dropdown for model selection
166
  model_name = gr.Dropdown(
167
  VALID_MODELS,
168
  value=DEFAULT_MODEL,
169
  label="IPA transcription ASR model",
170
  info="Select the model to use for prediction.",
171
  )
172
+
173
+ # Dropdown for transcription type selection
174
+ transcription_type = gr.Dropdown(
175
+ choices=["Full Audio", "Interval"],
176
+ label="Transcription Type",
177
+ value=None,
178
+ interactive=True,
179
+ )
180
+
181
  model_state = gr.State(value=initial_model)
182
 
183
+ # Full audio transcription section
184
+ with gr.Column(visible=False) as full_audio_section:
185
+ full_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
186
+ full_transcribe_btn = gr.Button("Transcribe Full Audio", interactive=False, variant="primary")
187
+ full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
188
 
189
+ full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
190
+ full_textgrid_filename = gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False)
 
 
 
 
191
 
192
+ full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
193
+ full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
194
+ full_reset_btn = gr.Button("Reset", variant="secondary")
195
 
196
+ # Interval transcription section
197
+ with gr.Column(visible=False) as interval_section:
198
+ interval_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
199
+ interval_textgrid_file = gr.File(file_types=[".TextGrid"], label="Upload TextGrid File")
200
+ tier_names = gr.Dropdown(label="Source Tier (existing)", choices=[], interactive=True)
201
+ target_tier = gr.Textbox(label="Target Tier (new)", value="IPATier", placeholder="e.g. IPATier")
202
+
203
+ interval_transcribe_btn = gr.Button("Transcribe Intervals", interactive=False, variant="primary")
204
+ interval_result = gr.Textbox(label="IPA Interval Transcription", show_copy_button=True, interactive=False)
205
+ interval_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
206
+ interval_reset_btn = gr.Button("Reset", variant="secondary")
207
+
208
+ # Section visibility toggle
209
+ transcription_type.change(
210
+ fn=lambda t: (
211
+ gr.update(visible=t == "Full Audio"),
212
+ gr.update(visible=t == "Interval"),
213
+ ),
214
+ inputs=transcription_type,
215
+ outputs=[full_audio_section, interval_section],
216
  )
217
 
218
+ # Enable full transcribe button after audio uploaded
219
+ full_audio.change(
220
+ fn=lambda audio: gr.update(interactive=audio is not None),
221
+ inputs=full_audio,
222
+ outputs=full_transcribe_btn,
223
  )
224
 
225
+ # Full transcription logic
226
+ full_transcribe_btn.click(
 
227
  fn=load_model_and_predict,
228
+ inputs=[model_name, full_audio, model_state],
229
+ outputs=[full_prediction, model_state, full_textgrid_filename],
230
+ )
231
+
232
+ full_prediction.change(
233
+ fn=get_textgrid_contents,
234
+ inputs=[full_audio, full_textgrid_tier, full_prediction],
235
+ outputs=[full_textgrid_contents],
236
  )
237
 
238
+ full_textgrid_contents.change(
 
 
239
  fn=get_interactive_download_button,
240
+ inputs=[full_textgrid_contents, full_textgrid_filename],
241
+ outputs=[full_download_btn],
242
  )
243
 
244
+ full_reset_btn.click(
245
+ fn=lambda: (None, "", "", "", gr.update(interactive=False)),
246
+ outputs=[full_audio, full_prediction, full_textgrid_filename, full_textgrid_contents, full_download_btn],
247
+ )
248
+
249
+ # Enable interval transcribe button only when both files are uploaded
250
+ interval_audio.change(
251
+ fn=enable_interval_transcribe_btn,
252
+ inputs=[interval_audio, interval_textgrid_file],
253
+ outputs=[interval_transcribe_btn],
254
+ )
255
+
256
+ interval_textgrid_file.change(
257
+ fn=enable_interval_transcribe_btn,
258
+ inputs=[interval_audio, interval_textgrid_file],
259
+ outputs=[interval_transcribe_btn],
260
+ )
261
+
262
+ # Interval logic
263
+ interval_textgrid_file.change(
264
+ fn=extract_tier_names,
265
+ inputs=[interval_textgrid_file],
266
+ outputs=[tier_names],
267
+ )
268
+
269
+ interval_transcribe_btn.click(
270
+ fn=transcribe_intervals,
271
+ inputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, model_state],
272
+ outputs=[interval_result],
273
+ )
274
 
275
+ interval_result.change(
276
+ fn=lambda tg_text: gr.update(value=write_textgrid(tg_text, "interval_output.TextGrid"), interactive=True),
277
+ inputs=[interval_result],
278
+ outputs=[interval_download_btn],
279
+ )
280
+
281
+ interval_reset_btn.click(
282
+ fn=lambda: (None, None, gr.update(choices=[]), "IPATier", "", gr.update(interactive=False)),
283
+ outputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, interval_result, interval_download_btn],
284
+ )
285
+
286
+ demo.launch(max_file_size="100mb")
287
 
288
  if __name__ == "__main__":
289
  launch_demo()
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  ffmpeg
2
  librosa
3
  tgt
4
- transformers[torch]
 
 
1
  ffmpeg
2
  librosa
3
  tgt
4
+ transformers[torch]
5
+ soundfile