from pathlib import Path

import gradio as gr
from transformers import pipeline

DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"

# Hugging Face model IDs that can be selected in the dropdown.
VALID_MODELS = [
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
    "ginic/data_seed_bs64_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
]


def load_model_and_predict(model_name: str, audio_in: str, model_state: dict):
    """Transcribe the audio with the selected model, reloading the pipeline only when the model changes."""
    if model_state["model_name"] != model_name:
        model_state = {
            "loaded_model": pipeline(
                task="automatic-speech-recognition", model=model_name
            ),
            "model_name": model_name,
        }
    # Guard against the model being changed before any audio has been provided,
    # which would otherwise pass None to the pipeline and raise.
    if audio_in is None:
        return (
            "",
            model_state,
            gr.DownloadButton("Download TextGrid file", visible=False),
        )
    return (
        model_state["loaded_model"](audio_in)["text"],
        model_state,
        gr.DownloadButton("Download TextGrid file", visible=True),
    )


def download_textgrid(audio_in, textgrid_tier_name, prediction):
    """Write the predicted transcription to a Praat TextGrid file and return its path."""
    # build_textgrid_text is a sketch helper defined below; the original left this as a TODO.
    textgrid_path = Path(audio_in).with_suffix(".TextGrid")
    textgrid_path.write_text(build_textgrid_text(audio_in, textgrid_tier_name, prediction))
    return textgrid_path
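

# The original left download_textgrid as a TODO. The helper below is a minimal
# sketch of one way to fill it in: it writes a single-tier, single-interval
# TextGrid spanning the whole recording. build_textgrid_text is not part of the
# original code, and reading the audio duration with soundfile assumes that
# library is installed alongside the pipeline's other audio dependencies.
def build_textgrid_text(audio_path: str, tier_name: str, text: str) -> str:
    import soundfile  # assumed extra dependency; not imported by the original script

    duration = soundfile.info(audio_path).duration
    return (
        'File type = "ooTextFile"\n'
        'Object class = "TextGrid"\n'
        "\n"
        "xmin = 0\n"
        f"xmax = {duration}\n"
        "tiers? <exists>\n"
        "size = 1\n"
        "item []:\n"
        "    item [1]:\n"
        '        class = "IntervalTier"\n'
        f'        name = "{tier_name}"\n'
        "        xmin = 0\n"
        f"        xmax = {duration}\n"
        "        intervals: size = 1\n"
        "        intervals [1]:\n"
        "            xmin = 0\n"
        f"            xmax = {duration}\n"
        f'            text = "{text}"\n'
    )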


def launch_demo():
    # Preload the default model so the demo can predict as soon as it starts.
    initial_model = {
        "loaded_model": pipeline(
            task="automatic-speech-recognition", model=DEFAULT_MODEL
        ),
        "model_name": DEFAULT_MODEL,
    }
    with gr.Blocks() as demo:
        gr.Markdown(
            """# Automatic International Phonetic Alphabet Transcription
This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""",
        )
        model_name = gr.Dropdown(
            VALID_MODELS,
            value=DEFAULT_MODEL,
            label="IPA transcription ASR model",
            info="Select the model to use for prediction.",
        )
        audio_in = gr.Audio(type="filepath", show_download_button=True)
        model_state = gr.State(value=initial_model)
        prediction = gr.Textbox(label="Predicted IPA transcription")
        textgrid_tier = gr.Textbox(
            label="TextGrid Tier Name", value="transcription", interactive=True
        )
        download_btn = gr.DownloadButton("Download TextGrid file", visible=False)
        # If user updates model name or audio, run prediction
        audio_in.input(
            fn=load_model_and_predict,
            inputs=[model_name, audio_in, model_state],
            outputs=[prediction, model_state, download_btn],
        )
        model_name.change(
            fn=load_model_and_predict,
            inputs=[model_name, audio_in, model_state],
            outputs=[prediction, model_state, download_btn],
        )
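        # The original code never connects the download button to download_textgrid;
        # the wiring below is one possible hookup, not part of the original. It
        # builds the TextGrid on click and returns its path as the button's value.
        # Note that some Gradio versions start the download with whatever value the
        # button held at click time, in which case generating the file when the
        # prediction updates is the more reliable place to do it.
        download_btn.click(
            fn=download_textgrid,
            inputs=[audio_in, textgrid_tier, prediction],
            outputs=download_btn,
        )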

    # Alternative construction using gr.Interface, left commented out:
    # demo = gr.Interface(
    #     fn=load_model_and_predict,
    #     inputs=[
    #         gr.Dropdown(
    #             VALID_MODELS,
    #             value=DEFAULT_MODEL,
    #             label="IPA transcription ASR model",
    #             info="Select the model to use for prediction.",
    #         ),
    #         gr.Audio(type="filepath", show_download_button=True),
    #         gr.State(
    #             value=initial_model
    #         ),  # Store the name of the currently loaded model
    #     ],
    #     outputs=[gr.Textbox(label="Predicted IPA transcription"), gr.State()],
    #     allow_flagging="never",
    #     title="Automatic International Phonetic Alphabet Transcription",
    #     description="This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.",
    # )

    demo.launch()


if __name__ == "__main__":
    launch_demo()
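
# A quick sketch of using one of these models programmatically, outside the Gradio UI
# (assumes the usual transformers audio dependencies, e.g. ffmpeg, are available;
# "my_recording.wav" is a placeholder path):
#
#     ipa_pipe = pipeline("automatic-speech-recognition", model=DEFAULT_MODEL)
#     print(ipa_pipe("my_recording.wav")["text"])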