Spaces:
Running
Running
Error Handling + Joe's Suggestion
Error Handling For Model Loading
Error handling for audio duration vs. TextGrid length (warning if audio > TextGrid — only the annotated portion will be transcribed; error if TextGrid > audio)
Changed Dropdown Menu Option Name (Joe Emailed About This)
app.py
CHANGED
@@ -47,31 +47,32 @@ def load_model_and_predict(
|
|
47 |
audio_in: str,
|
48 |
model_state: dict,
|
49 |
):
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
return (
|
52 |
-
|
53 |
model_state,
|
54 |
-
gr.Textbox(
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
model_state = {
|
59 |
-
"loaded_model": pipeline(
|
60 |
-
task="automatic-speech-recognition", model=model_name
|
61 |
),
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
prediction = model_state["loaded_model"](audio_in)["text"]
|
66 |
-
return (
|
67 |
-
prediction,
|
68 |
-
model_state,
|
69 |
-
gr.Textbox(
|
70 |
-
label=TEXTGRID_NAME_INPUT_LABEL,
|
71 |
-
interactive=True,
|
72 |
-
value=Path(audio_in).with_suffix(".TextGrid").name,
|
73 |
-
),
|
74 |
-
)
|
75 |
|
76 |
|
77 |
def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
|
@@ -144,6 +145,34 @@ def extract_tier_names(textgrid_file):
|
|
144 |
return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
|
145 |
except Exception as e:
|
146 |
return gr.update(choices=[], value=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
|
149 |
def launch_demo():
|
@@ -154,10 +183,6 @@ def launch_demo():
|
|
154 |
"model_name": DEFAULT_MODEL,
|
155 |
}
|
156 |
|
157 |
-
# Helper function - enables the interval transcribe button
|
158 |
-
def enable_interval_transcribe_btn(audio, textgrid):
|
159 |
-
return gr.update(interactive=(audio is not None and textgrid is not None))
|
160 |
-
|
161 |
with gr.Blocks() as demo:
|
162 |
gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
|
163 |
This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
|
@@ -172,7 +197,7 @@ def launch_demo():
|
|
172 |
|
173 |
# Dropdown for transcription type selection
|
174 |
transcription_type = gr.Dropdown(
|
175 |
-
choices=["Full Audio", "Interval"],
|
176 |
label="Transcription Type",
|
177 |
value=None,
|
178 |
interactive=True,
|
@@ -209,7 +234,7 @@ def launch_demo():
|
|
209 |
transcription_type.change(
|
210 |
fn=lambda t: (
|
211 |
gr.update(visible=t == "Full Audio"),
|
212 |
-
gr.update(visible=t == "Interval"),
|
213 |
),
|
214 |
inputs=transcription_type,
|
215 |
outputs=[full_audio_section, interval_section],
|
@@ -248,13 +273,13 @@ def launch_demo():
|
|
248 |
|
249 |
# Enable interval transcribe button only when both files are uploaded
|
250 |
interval_audio.change(
|
251 |
-
fn=
|
252 |
inputs=[interval_audio, interval_textgrid_file],
|
253 |
outputs=[interval_transcribe_btn],
|
254 |
)
|
255 |
|
256 |
interval_textgrid_file.change(
|
257 |
-
fn=
|
258 |
inputs=[interval_audio, interval_textgrid_file],
|
259 |
outputs=[interval_transcribe_btn],
|
260 |
)
|
@@ -286,4 +311,4 @@ def launch_demo():
|
|
286 |
demo.launch(max_file_size="100mb")
|
287 |
|
288 |
if __name__ == "__main__":
|
289 |
-
launch_demo()
|
|
|
47 |
audio_in: str,
|
48 |
model_state: dict,
|
49 |
):
|
50 |
+
try:
|
51 |
+
if audio_in is None:
|
52 |
+
return (
|
53 |
+
"",
|
54 |
+
model_state,
|
55 |
+
gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
|
56 |
+
)
|
57 |
+
|
58 |
+
if model_state["model_name"] != model_name:
|
59 |
+
model_state = {
|
60 |
+
"loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
|
61 |
+
"model_name": model_name,
|
62 |
+
}
|
63 |
+
|
64 |
+
prediction = model_state["loaded_model"](audio_in)["text"]
|
65 |
return (
|
66 |
+
prediction,
|
67 |
model_state,
|
68 |
+
gr.Textbox(
|
69 |
+
label=TEXTGRID_NAME_INPUT_LABEL,
|
70 |
+
interactive=True,
|
71 |
+
value=Path(audio_in).with_suffix(".TextGrid").name,
|
|
|
|
|
|
|
72 |
),
|
73 |
+
)
|
74 |
+
except Exception as e:
|
75 |
+
raise gr.Error(f"Failed to load model: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
|
78 |
def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
|
|
|
145 |
return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
|
146 |
except Exception as e:
|
147 |
return gr.update(choices=[], value=None)
|
148 |
+
|
149 |
+
|
150 |
+
def validate_textgrid_for_intervals(audio_path, textgrid_file):
    """Validate an uploaded TextGrid against the uploaded audio file.

    Used as the change-handler for both the audio and TextGrid upload
    components; its return value toggles the interval-transcribe button.

    Args:
        audio_path: Path to the uploaded audio file, or None/"" if absent.
        textgrid_file: Uploaded TextGrid file object (with a .name path),
            or None if absent.

    Returns:
        gr.update(interactive=False) when either file is missing;
        gr.update(interactive=True) when both files are present and
        consistent.

    Raises:
        gr.Error: if either file cannot be parsed, or if the TextGrid's
            annotations extend past the end of the audio.

    Side effects:
        Emits a gr.Warning (non-fatal) when the audio is longer than the
        annotated region — only the annotated portion will be transcribed.
    """
    # Keep the button disabled until both files are provided.
    if not audio_path or not textgrid_file:
        return gr.update(interactive=False)

    # Only the parsing calls belong in the try: wrapping the whole body in
    # `except Exception` would also catch the deliberate gr.Error below and
    # replace its specific message with the generic "Invalid ..." one.
    try:
        audio_duration = librosa.get_duration(path=audio_path)
        tg = tgt.io.read_textgrid(textgrid_file.name)
        tg_end_time = max(tier.end_time for tier in tg.tiers)
    except Exception as e:
        raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}")

    # Hard error: annotations run past the end of the audio, so the files
    # cannot belong together.
    if tg_end_time > audio_duration:
        raise gr.Error(
            f"TextGrid ends at {tg_end_time:.2f}s but audio is only {audio_duration:.2f}s. "
            "Please upload matching files."
        )

    # Soft warning: audio extends beyond the annotations. The epsilon
    # tolerates sub-10ms rounding differences between the two durations.
    epsilon = 0.01
    if abs(tg_end_time - audio_duration) > epsilon:
        gr.Warning(
            f"TextGrid ends at {tg_end_time:.2f}s but audio is {audio_duration:.2f}s. "
            "Only the annotated portion will be transcribed."
        )

    return gr.update(interactive=True)
|
176 |
|
177 |
|
178 |
def launch_demo():
|
|
|
183 |
"model_name": DEFAULT_MODEL,
|
184 |
}
|
185 |
|
|
|
|
|
|
|
|
|
186 |
with gr.Blocks() as demo:
|
187 |
gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
|
188 |
This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
|
|
|
197 |
|
198 |
# Dropdown for transcription type selection
|
199 |
transcription_type = gr.Dropdown(
|
200 |
+
choices=["Full Audio", "TextGrid Interval"],
|
201 |
label="Transcription Type",
|
202 |
value=None,
|
203 |
interactive=True,
|
|
|
234 |
transcription_type.change(
|
235 |
fn=lambda t: (
|
236 |
gr.update(visible=t == "Full Audio"),
|
237 |
+
gr.update(visible=t == "TextGrid Interval"),
|
238 |
),
|
239 |
inputs=transcription_type,
|
240 |
outputs=[full_audio_section, interval_section],
|
|
|
273 |
|
274 |
# Enable interval transcribe button only when both files are uploaded
|
275 |
interval_audio.change(
|
276 |
+
fn=validate_textgrid_for_intervals,
|
277 |
inputs=[interval_audio, interval_textgrid_file],
|
278 |
outputs=[interval_transcribe_btn],
|
279 |
)
|
280 |
|
281 |
interval_textgrid_file.change(
|
282 |
+
fn=validate_textgrid_for_intervals,
|
283 |
inputs=[interval_audio, interval_textgrid_file],
|
284 |
outputs=[interval_transcribe_btn],
|
285 |
)
|
|
|
311 |
demo.launch(max_file_size="100mb")
|
312 |
|
313 |
if __name__ == "__main__":
|
314 |
+
launch_demo()
|