"""Gradio demo: IPA transcription of audio with optional Praat TextGrid export."""

from pathlib import Path
import tempfile
from typing import Optional

import gradio as gr
import librosa
import tgt.core
import tgt.io3
from transformers import pipeline

# Scratch directory where generated TextGrid files are written for download.
TEXTGRID_DIR = tempfile.mkdtemp()
DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"

# Model identifiers offered in the model dropdown.
VALID_MODELS = [
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
    "ginic/data_seed_bs64_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_3_wav2vec2-large-xlsr-53-buckeye-ipa",
]


def load_model_and_predict(
    model_name: str,
    audio_in: Optional[str],
    model_state: dict,
):
    """Transcribe the audio file with the selected model.

    Args:
        model_name: Identifier of the model selected in the dropdown.
        audio_in: Path to the audio file, or None when no audio is present.
        model_state: Dict with keys "loaded_model" (the ASR pipeline) and
            "model_name" (the identifier that pipeline was built from).

    Returns:
        A 3-tuple of (predicted text, model state, TextGrid file name
        Textbox). Without audio the text is empty and the Textbox is
        non-interactive; otherwise the Textbox is interactive and prefilled
        with the audio file's name carrying a ".TextGrid" suffix.
    """
    if audio_in is None:
        return (
            "",
            model_state,
            gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
        )

    # Only build a new pipeline when the selection differs from the cached one.
    if model_state["model_name"] != model_name:
        model_state = {
            "loaded_model": pipeline(
                task="automatic-speech-recognition", model=model_name
            ),
            "model_name": model_name,
        }

    prediction = model_state["loaded_model"](audio_in)["text"]
    return (
        prediction,
        model_state,
        gr.Textbox(
            label=TEXTGRID_NAME_INPUT_LABEL,
            interactive=True,
            value=Path(audio_in).with_suffix(".TextGrid").name,
        ),
    )


def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
    """Build TextGrid text holding the transcription as a single interval.

    The interval runs from 0 to the audio duration on one tier named
    ``textgrid_tier_name``.

    Returns:
        The exported long-format TextGrid text, or "" when either the audio
        path or the transcription is None.
    """
    if audio_in is None or transcription_prediction is None:
        return ""

    duration = librosa.get_duration(path=audio_in)
    annotation = tgt.core.Interval(0, duration, transcription_prediction)
    transcription_tier = tgt.core.IntervalTier(
        start_time=0, end_time=duration, name=textgrid_tier_name
    )
    transcription_tier.add_annotation(annotation)
    textgrid = tgt.core.TextGrid()
    textgrid.add_tier(transcription_tier)
    return tgt.io3.export_to_long_textgrid(textgrid)


def _textgrid_basename(textgrid_filename):
    """Return the bare file name from user input, or None if it is unusable."""
    if not textgrid_filename or not textgrid_filename.strip():
        return None
    name = Path(textgrid_filename).name
    # "", "." and ".." would resolve to a directory rather than a file.
    if name in ("", ".", ".."):
        return None
    return name


def write_textgrid(textgrid_contents, textgrid_filename):
    """Write the TextGrid contents to a named file in the temporary directory.

    Only the final path component of ``textgrid_filename`` is used, so the
    file is always created directly inside ``TEXTGRID_DIR``.

    Returns:
        The path of the written file, for download.

    Raises:
        ValueError: If ``textgrid_filename`` does not yield a usable file name.
    """
    name = _textgrid_basename(textgrid_filename)
    if name is None:
        raise ValueError(f"Invalid TextGrid file name: {textgrid_filename!r}")

    textgrid_path = Path(TEXTGRID_DIR) / name
    # IPA transcriptions are non-ASCII; write UTF-8 explicitly instead of
    # relying on the platform's locale encoding.
    textgrid_path.write_text(textgrid_contents, encoding="utf-8")
    return textgrid_path


def get_interactive_download_button(textgrid_contents, textgrid_filename):
    """Return the download button state for the current TextGrid inputs.

    With non-empty contents and a usable file name, the TextGrid is written
    to disk and an enabled button pointing at that file is returned.
    Otherwise a disabled button is returned and nothing is written.
    """
    if not textgrid_contents or _textgrid_basename(textgrid_filename) is None:
        return gr.DownloadButton(
            label=TEXTGRID_DOWNLOAD_TEXT,
            variant="primary",
            interactive=False,
        )

    return gr.DownloadButton(
        label=TEXTGRID_DOWNLOAD_TEXT,
        variant="primary",
        interactive=True,
        value=write_textgrid(textgrid_contents, textgrid_filename),
    )


def launch_demo():
    """Build the Gradio interface, wire up its events and launch it."""
    initial_model = {
        "loaded_model": pipeline(
            task="automatic-speech-recognition", model=DEFAULT_MODEL
        ),
        "model_name": DEFAULT_MODEL,
    }

    with gr.Blocks() as demo:
        # The heading needs its own line, otherwise the paragraph text is
        # rendered as part of the heading.
        gr.Markdown(
            "# Automatic International Phonetic Alphabet Transcription\n"
            "This demo allows you to experiment with producing phonetic "
            "transcriptions of uploaded or recorded audio using a selected "
            "automatic speech recognition (ASR) model.",
        )
        model_name = gr.Dropdown(
            VALID_MODELS,
            value=DEFAULT_MODEL,
            label="IPA transcription ASR model",
            info="Select the model to use for prediction.",
        )
        audio_in = gr.Audio(type="filepath", show_download_button=True)
        model_state = gr.State(value=initial_model)
        prediction = gr.Textbox(label="Predicted IPA transcription")

        gr.Markdown(
            "## TextGrid File Options\n"
            "Change these inputs if you'd like to customize and download the "
            "transcription in [TextGrid format]"
            "(https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html)"
            " for Praat.\n"
        )
        textgrid_tier = gr.Textbox(
            label="TextGrid Tier Name", value="transcription", interactive=True
        )
        textgrid_filename = gr.Textbox(
            label=TEXTGRID_NAME_INPUT_LABEL, interactive=False
        )
        textgrid_contents = gr.Textbox(
            label="TextGrid Contents",
            value=get_textgrid_contents,
            inputs=[audio_in, textgrid_tier, prediction],
        )
        download_btn = gr.DownloadButton(
            label=TEXTGRID_DOWNLOAD_TEXT,
            # Don't allow download button to be active until an upload happened
            interactive=False,
            variant="primary",
        )

        # Update prediction if model or audio changes
        gr.on(
            triggers=[audio_in.input, model_name.change],
            fn=load_model_and_predict,
            inputs=[model_name, audio_in, model_state],
            outputs=[prediction, model_state, textgrid_filename],
        )

        # Download button becomes interactive if user updates audio or
        # textgrid params
        gr.on(
            triggers=[textgrid_contents.change, textgrid_filename.change],
            fn=get_interactive_download_button,
            inputs=[textgrid_contents, textgrid_filename],
            outputs=[download_btn],
        )

    demo.launch(max_file_size="100mb")


if __name__ == "__main__":
    launch_demo()