File size: 4,364 Bytes
44993c6
 
d079ec5
 
 
 
557f37f
 
86bc5ce
 
 
557f37f
 
 
 
 
 
 
 
 
 
 
 
 
 
86bc5ce
 
557f37f
44993c6
86bc5ce
557f37f
 
 
 
 
 
 
44993c6
 
 
 
 
 
 
 
 
 
d079ec5
557f37f
d079ec5
557f37f
 
 
 
 
 
44993c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d079ec5
 
 
557f37f
d079ec5
557f37f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from pathlib import Path

import gradio as gr

from transformers import pipeline

# Model pre-selected in the dropdown and loaded when the demo starts.
DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"

# The selectable models, grouped by naming family. The literals are kept
# verbatim so they remain greppable.
_DATA_SEED_MODELS = [
    "ginic/data_seed_bs64_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa",
]

_GENDER_SPLIT_30_FEMALE_MODELS = [
    "ginic/gender_split_30_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
]

_GENDER_SPLIT_70_FEMALE_MODELS = [
    "ginic/gender_split_70_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
]

# Choices offered in the model dropdown, in display order.
VALID_MODELS = [
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
    *_DATA_SEED_MODELS,
    *_GENDER_SPLIT_30_FEMALE_MODELS,
    *_GENDER_SPLIT_70_FEMALE_MODELS,
]


def load_model_and_predict(model_name: str, audio_in: str, model_state: dict):
    """Transcribe ``audio_in`` with ``model_name``, reloading the model only on change.

    Args:
        model_name: Model identifier passed as ``model=`` to
            ``transformers.pipeline``.
        audio_in: Audio input handed to the loaded pipeline. May be ``None``
            when this handler runs before any audio has been provided (for
            example when only the model selection changed).
        model_state: Dict with keys ``"loaded_model"`` (the pipeline) and
            ``"model_name"`` (the name it was built from).

    Returns:
        A 3-tuple ``(text, model_state, download_button)``: the predicted
        transcription (empty string when there is no audio), the possibly
        replaced model state, and a ``gr.DownloadButton`` that is visible
        only when a prediction was produced.
    """
    # Rebuild the pipeline only when a different model was requested.
    if model_state["model_name"] != model_name:
        model_state = {
            "loaded_model": pipeline(
                task="automatic-speech-recognition", model=model_name
            ),
            "model_name": model_name,
        }

    # Without audio there is nothing to transcribe. Still return the updated
    # state so the freshly loaded model is kept, and hide the download button.
    if audio_in is None:
        return (
            "",
            model_state,
            gr.DownloadButton("Download TextGrid file", visible=False),
        )

    return (
        model_state["loaded_model"](audio_in)["text"],
        model_state,
        gr.DownloadButton("Download TextGrid file", visible=True),
    )


def download_textgrid(audio_in, textgrid_tier_name, prediction):
    """Placeholder for the TextGrid download step; not implemented yet.

    All three arguments are currently ignored and ``None`` is returned.
    """
    # TODO: implement; presumably this should build a TextGrid file from the
    # prediction under the given tier name (confirm the intended behavior).
    return None


def launch_demo():
    """Build the Gradio Blocks interface for IPA transcription and launch it.

    The default model's pipeline is created up front and used as the initial
    value of the state shared with ``load_model_and_predict``.
    """
    default_pipeline = pipeline(
        task="automatic-speech-recognition", model=DEFAULT_MODEL
    )
    startup_state = {
        "loaded_model": default_pipeline,
        "model_name": DEFAULT_MODEL,
    }

    with gr.Blocks() as demo:
        gr.Markdown(
            """# Automatic International Phonetic Alphabet Transcription
            This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""",
        )

        # Inputs: model selection and the audio to transcribe.
        model_dropdown = gr.Dropdown(
            VALID_MODELS,
            value=DEFAULT_MODEL,
            label="IPA transcription ASR model",
            info="Select the model to use for prediction.",
        )
        audio_input = gr.Audio(type="filepath", show_download_button=True)

        # Holds the loaded pipeline and its name so the handler can reuse it.
        loaded_state = gr.State(value=startup_state)

        # Outputs.
        prediction_box = gr.Textbox(label="Predicted IPA transcription")

        # Tier name field; displayed but not connected to any event yet.
        gr.Textbox(
            label="TextGrid Tier Name", value="transcription", interactive=True
        )

        # Hidden until the prediction handler returns a visible button.
        download_button = gr.DownloadButton(
            "Download TextGrid file", visible=False
        )

        # Run the prediction when the audio is updated or the model changes.
        for register_event in (audio_input.input, model_dropdown.change):
            register_event(
                fn=load_model_and_predict,
                inputs=[model_dropdown, audio_input, loaded_state],
                outputs=[prediction_box, loaded_state, download_button],
            )

    demo.launch()


# Launch the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    launch_demo()