# NOTE: page-scrape residue from the hosting page ("Spaces: Sleeping"); not part of the program.
from pathlib import Path | |
import tempfile | |
import gradio as gr | |
import librosa | |
import tgt.core | |
import tgt.io3 | |
from transformers import pipeline | |
# User-facing labels shared by several UI components.
TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"

# Scratch directory that holds generated TextGrid files offered for download.
TEXTGRID_DIR = tempfile.mkdtemp()

# Model selected when the demo first loads; must be one of VALID_MODELS.
DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"

# The "ginic" checkpoints share one suffix and differ only by experiment
# family and run number, so they are generated from (family, run count) pairs.
_GINIC_MODEL_SUFFIX = "wav2vec2-large-xlsr-53-buckeye-ipa"
_GINIC_MODEL_FAMILIES = (
    ("data_seed_bs64", 4),
    ("gender_split_30_female", 5),
    ("gender_split_70_female", 5),
    ("vary_individuals_old_only", 3),
    ("vary_individuals_young_only", 3),
)

# Every model identifier offered in the model dropdown, in display order.
VALID_MODELS = [
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
    *(
        f"ginic/{family}_{run}_{_GINIC_MODEL_SUFFIX}"
        for family, run_count in _GINIC_MODEL_FAMILIES
        for run in range(1, run_count + 1)
    ),
]
def load_model_and_predict(
    model_name: str,
    audio_in: str,
    model_state: dict,
):
    """Transcribe the audio with the selected model, loading it if needed.

    Args:
        model_name: Identifier of the model chosen in the dropdown.
        audio_in: Path to the audio file, or None when no audio is present.
        model_state: Dict with keys "loaded_model" (the ASR pipeline) and
            "model_name" (the identifier that pipeline was built from).

    Returns:
        A 3-tuple of (predicted text, model state, TextGrid file name
        textbox). Without audio the text is empty, the state is returned
        untouched and the textbox is non-interactive. With audio the textbox
        is interactive and pre-filled with the audio file's name carrying a
        ".TextGrid" suffix.
    """
    if audio_in is None:
        locked_name_box = gr.Textbox(
            label=TEXTGRID_NAME_INPUT_LABEL, interactive=False
        )
        return "", model_state, locked_name_box

    # Build a new pipeline only when the requested model differs from the
    # one already held in the state.
    if model_state["model_name"] != model_name:
        fresh_pipeline = pipeline(
            task="automatic-speech-recognition", model=model_name
        )
        model_state = {
            "loaded_model": fresh_pipeline,
            "model_name": model_name,
        }

    transcriber = model_state["loaded_model"]
    prediction = transcriber(audio_in)["text"]

    suggested_name = Path(audio_in).with_suffix(".TextGrid").name
    editable_name_box = gr.Textbox(
        label=TEXTGRID_NAME_INPUT_LABEL,
        interactive=True,
        value=suggested_name,
    )
    return prediction, model_state, editable_name_box
def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
    """Render the transcription as long-format TextGrid text.

    Returns an empty string when either the audio path or the prediction is
    None. Otherwise builds a TextGrid with one interval tier named
    ``textgrid_tier_name`` whose single interval runs from 0 to the audio
    duration reported by librosa and carries the predicted transcription.
    """
    inputs_missing = audio_in is None or transcription_prediction is None
    if inputs_missing:
        return ""

    total_duration = librosa.get_duration(path=audio_in)

    # One interval covering the whole recording holds the full transcription.
    whole_file_interval = tgt.core.Interval(
        0, total_duration, transcription_prediction
    )
    tier = tgt.core.IntervalTier(
        start_time=0, end_time=total_duration, name=textgrid_tier_name
    )
    tier.add_annotation(whole_file_interval)

    grid = tgt.core.TextGrid()
    grid.add_tier(tier)
    return tgt.io3.export_to_long_textgrid(grid)
def write_textgrid(textgrid_contents, textgrid_filename):
    """Write the TextGrid contents to a named file in the temporary directory.

    Only the final path component of ``textgrid_filename`` is used, so the
    file is always created directly inside ``TEXTGRID_DIR``. When that
    component is missing or unusable (empty, whitespace-only or ".."), the
    name "transcription.TextGrid" is used instead; without this fallback the
    target would resolve to a directory and the write would fail.

    The file is written as UTF-8 so IPA characters are stored the same way
    regardless of the platform's default encoding.

    Returns the path for download.
    """
    safe_name = Path(textgrid_filename or "").name
    if not safe_name.strip() or safe_name == "..":
        safe_name = "transcription.TextGrid"
    textgrid_path = Path(TEXTGRID_DIR) / safe_name
    textgrid_path.write_text(textgrid_contents, encoding="utf-8")
    return textgrid_path
def get_interactive_download_button(textgrid_contents, textgrid_filename):
    """Return the download button state matching the current TextGrid inputs.

    When both the contents and the file name are non-empty, the contents are
    written to disk via ``write_textgrid`` and an interactive button pointing
    at that file is returned. Otherwise a non-interactive button is returned
    and nothing is written.
    """
    has_filename = bool((textgrid_filename or "").strip())
    if not textgrid_contents or not has_filename:
        # Nothing downloadable yet (audio cleared, or the file name has not
        # been filled in): keep the button disabled instead of writing an
        # empty or unnamed file.
        return gr.DownloadButton(
            label=TEXTGRID_DOWNLOAD_TEXT,
            variant="primary",
            interactive=False,
        )

    return gr.DownloadButton(
        label=TEXTGRID_DOWNLOAD_TEXT,
        variant="primary",
        interactive=True,
        value=write_textgrid(textgrid_contents, textgrid_filename),
    )
def launch_demo():
    """Build the Gradio interface for IPA transcription and launch it.

    The default ASR pipeline is created up front and stored in a ``gr.State``
    so the first prediction does not need to load a model. The UI offers a
    model dropdown, an audio input, the predicted transcription, and TextGrid
    options (tier name, file name, generated contents, download button).
    """
    default_pipeline = pipeline(
        task="automatic-speech-recognition", model=DEFAULT_MODEL
    )
    initial_model = {
        "loaded_model": default_pipeline,
        "model_name": DEFAULT_MODEL,
    }

    # Components are created in display order; do not reorder them.
    with gr.Blocks() as demo:
        gr.Markdown(
            "# Automatic International Phonetic Alphabet Transcription\n"
            "This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.",
        )
        model_dropdown = gr.Dropdown(
            VALID_MODELS,
            value=DEFAULT_MODEL,
            label="IPA transcription ASR model",
            info="Select the model to use for prediction.",
        )
        audio_input = gr.Audio(type="filepath", show_download_button=True)
        loaded_model_state = gr.State(value=initial_model)
        ipa_output = gr.Textbox(label="Predicted IPA transcription")

        gr.Markdown(
            "## TextGrid File Options\n"
            "Change these inputs if you'd like to customize and download the transcription in [TextGrid format](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) for Praat.\n"
        )
        tier_name_box = gr.Textbox(
            label="TextGrid Tier Name", value="transcription", interactive=True
        )
        filename_box = gr.Textbox(
            label=TEXTGRID_NAME_INPUT_LABEL, interactive=False
        )
        # The contents are recomputed from the audio, tier name and prediction.
        contents_box = gr.Textbox(
            label="TextGrid Contents",
            value=get_textgrid_contents,
            inputs=[audio_input, tier_name_box, ipa_output],
        )
        # Don't allow download button to be active until an upload happened
        download_btn = gr.DownloadButton(
            label=TEXTGRID_DOWNLOAD_TEXT,
            interactive=False,
            variant="primary",
        )

        # Update prediction if model or audio changes
        prediction_triggers = [audio_input.input, model_dropdown.change]
        gr.on(
            triggers=prediction_triggers,
            fn=load_model_and_predict,
            inputs=[model_dropdown, audio_input, loaded_model_state],
            outputs=[ipa_output, loaded_model_state, filename_box],
        )

        # Download button becomes interactive if user updates audio or textgrid params
        download_triggers = [contents_box.change, filename_box.change]
        gr.on(
            triggers=download_triggers,
            fn=get_interactive_download_button,
            inputs=[contents_box, filename_box],
            outputs=[download_btn],
        )

    demo.launch(max_file_size="100mb")
if __name__ == "__main__": | |
launch_demo() | |