File size: 6,646 Bytes
ad16788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from espnet2.bin.tts_inference import Text2Speech
import torch
from parallel_wavegan.utils import download_pretrained_model, load_model
from phonemizer import phonemize
from phonemizer.separator import Separator
import gradio as gr

# Phonemizer separator used by `inference`: phones are separated by a single
# space and no word separator is emitted.
s = Separator(word=None, phone=" ")

# Acoustic-model files handed to `Text2Speech` in `inference`.
config_path = "config.yaml"
model_path = "model.pth"

# Tag of the pretrained vocoder fetched by `download_pretrained_model`.
vocoder_tag = "ljspeech_parallel_wavegan.v3"

# Load the vocoder once at import time, on CPU and in eval mode.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cpu").eval()
vocoder.remove_weight_norm()

# Global style embeddings keyed by the label shown in the UI ("Style 1" ..
# "Style 6", loaded from style1.pt .. style6.pt). `map_location="cpu"` keeps
# loading working on CPU-only hosts even if the tensors were saved from a GPU;
# everything else in this app runs on CPU as well.
global_styles = {
    f"Style {i}": torch.load(f"style{i}.pt", map_location="cpu")
    for i in range(1, 7)
}


def _edit_cell_value(cell):
    """Return the scalar held by one edits-table cell, or None if it is blank.

    `inference` writes the edits table back as one-element rows, so a cell may
    arrive either as a bare value or wrapped in a one-element list/tuple; the
    wrapper is removed first. ``None`` and empty or whitespace-only strings
    count as blank. Any other value (e.g. ``"5"``, ``5``, ``0``) is returned
    for the caller to convert with ``int``.
    """
    if isinstance(cell, (list, tuple)):
        cell = cell[0] if cell else None
    if isinstance(cell, str):
        cell = cell.strip()
    if cell is None or cell == "":
        return None
    return cell


def inference(text, global_style, alpha, prev_fg_inds, input_fg_inds):
    """Synthesize speech for `text`, optionally honoring prosody edits.

    Args:
        text: Text to synthesize; it is phonemized before being passed to the
            model.
        global_style: Key into the module-level `global_styles` dict.
        alpha: Passed to `Text2Speech` as `speed_control_alpha`.
        prev_fg_inds: Rows of the predictions table from the previous run,
            each `[phoneme, top_pred, second_pred, third_pred]`. Only read
            when at least one edit is present.
        input_fg_inds: Cells of the "chosen prosody embeddings" table. Blank
            cells mean "no edit"; the last non-blank cell marks the end of the
            edited prefix.

    Returns:
        A three-element list matching the click handler's outputs:
        `(22050, waveform)` for the audio component, the rows of the
        predictions table, and the one-element rows of the edits table.

    Raises:
        ValueError: If edits are supplied but the predictions table does not
            hold integer ids, or an edited cell is not an integer.
    """
    with torch.no_grad():
        # NOTE(review): the model is rebuilt on every call; alpha is a
        # constructor argument here, so reuse would need a per-alpha cache.
        text2speech = Text2Speech(
            config_path,
            model_path,
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
            # Only for FastSpeech & FastSpeech2
            speed_control_alpha=alpha,
        )
        text2speech.spc2wav = None  # Disable griffin-lim

        style_emb = torch.flatten(global_styles[global_style])

        phoneme_string = phonemize(
            text, language="mb-us1", backend="espeak-mbrola", separator=s
        )
        phonemes = phoneme_string.split(" ")

        # Normalize the edits column and locate the last edited position
        # (-1 when the user made no edit at all).
        edit_values = [_edit_cell_value(cell) for cell in (input_fg_inds or [])]
        max_edit_index = max(
            (i for i, value in enumerate(edit_values) if value is not None),
            default=-1,
        )

        if max_edit_index == -1:
            # No edits: let the model predict every prosody embedding.
            chosen_inds = []
            _, c, _, _, _, _, _, output_fg_inds = text2speech(
                phoneme_string, ref_embs=style_emb
            )

        else:
            # Table cells may come back as strings, so convert explicitly
            # instead of relying on torch.tensor to accept them.
            try:
                prev_rows = [
                    [int(row[1]), int(row[2]), int(row[3])]
                    for row in prev_fg_inds
                ]
            except (TypeError, ValueError) as err:
                raise ValueError(
                    "The predictions table must contain integer prosody "
                    "embedding ids; generate speech once before editing."
                ) from err

            # Up to the last edit, use the user's id where given and fall back
            # to the previous top prediction for cells left blank.
            chosen_inds = [
                int(value) if value is not None else prev_rows[i][0]
                for i, value in enumerate(edit_values[: max_edit_index + 1])
            ]

            merged_fg_inds = torch.tensor([prev_rows], dtype=torch.int64)

            fg_inds = torch.tensor(chosen_inds).unsqueeze(0)
            _, c, _, _, _, _, _, part_output_fg_inds = text2speech(
                phoneme_string, ref_embs=style_emb, fg_inds=fg_inds
            )

            # Keep the previous predictions for the edited prefix and overwrite
            # everything after the last edit with the fresh predictions.
            merged_fg_inds[0, max_edit_index + 1 :, :] = part_output_fg_inds[0]
            output_fg_inds = merged_fg_inds

        output_fg_inds_list = output_fg_inds.tolist()[0]
        # The prediction rows are paired with the phonemes shifted down by one
        # row; the first row gets an empty phoneme label.
        padded_phonemes = ["", *phonemes]
        dataframe_values = [
            [phoneme, *fgs]
            for phoneme, fgs in zip(padded_phonemes, output_fg_inds_list)
        ]
        # One-element rows for the edits table: the ids actually used for the
        # edited prefix, blank for the remaining positions.
        selected_inds = [
            [chosen_inds[i]] if i < len(chosen_inds) else [""]
            for i in range(len(padded_phonemes))
        ]
        wav = vocoder.inference(c)

        return [
            (22050, wav.view(-1).cpu().numpy()),
            dataframe_values,
            selected_inds,
        ]


# Gradio UI: text/style/speed inputs, an audio output and the two tables used
# by the fine-grained prosody editor. Everything is wired to `inference`.
demo = gr.Blocks()

with demo:
    gr.Markdown(
        """
    
    # ConEx Demo
    
    This demo shows the capabilities of ConEx, a model for **Con**trollable **Ex**pressive speech synthesis.
    ConEx allows you to generate speech in a certain speaking style, and gives you the ability to edit the prosody* of the generated speech at a fine level.
    We proposed ConEx in our paper titled ["Interactive Multi-Level Prosody Control for Expressive Speech Synthesis"](https://jessa.github.io/assets/pdf/cornille2022icassp.pdf), published in proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2022.

    To convert text to speech: input some text, choose the desired speaking style, set the duration factor (higher = slower speech), and press "Generate speech".
    
    **prosody refers to speech characteristics such as intonation, stress, rhythm*
    """
    )

    with gr.Row():
        text_input = gr.Textbox(
            label="Input text",
            lines=4,
            placeholder="E.g. I didn't say he stole the money",
        )

        with gr.Column():
            # Choices come straight from the loaded style embeddings so the
            # dropdown can never offer a key that `inference` cannot look up.
            global_style_dropdown = gr.Dropdown(
                list(global_styles),
                value="Style 1",
                label="Global speaking style",
            )
            # The chosen value is forwarded to `inference` as `alpha`.
            alpha_slider = gr.Slider(
                0.1, 2, value=1, step=0.1, label="Alpha (duration factor)"
            )

    audio = gr.Audio()
    with gr.Row():
        button = gr.Button("Generate Speech")

    gr.Markdown(
        """
    
    ### Fine-grained prosody editor
    Once you've generated some speech, the following table will show the id of the prosody embedding used for each phoneme.
    A prosody embedding determines the prosody of the phoneme.
    The table not only shows the prosody embeddings that are used by default (the top predictions), but also two more likely prosody embeddings.
    
    In order to change the prosody of a phoneme, write a new prosody embedding id in the "Chosen prosody embeddings" column and press "Generate speech" again.
    You can use any number from 0-31, but the 2nd and 3rd predictions are more likely to give a fitting prosody.
    Based on your edit, new prosody embeddings will be generated for the phonemes after the edit.
    Thus, you can iteratively change the prosody by starting from the beginning of the utterance and working your way through the utterance, making edits as you see fit.
    The prosody embeddings before your edit will remain the same as before, and will be copied to the "Chosen prosody embeddings" column.
    """
    )

    with gr.Row():
        # Per-phoneme predictions; read back by `inference` as `prev_fg_inds`.
        phoneme_preds_df = gr.Dataframe(
            headers=["Phoneme", "🥇 Top pred", "🥈 2nd pred", "🥉 3rd pred"],
            type="array",
            col_count=(4, "static"),
        )
        # User-chosen embedding ids; read back as `input_fg_inds`.
        phoneme_edits_df = gr.Dataframe(
            headers=["Chosen prosody embeddings"], type="array", col_count=(1, "static")
        )

    # Both tables are inputs and outputs, so each run sees the previous run's
    # predictions and edits and then refreshes them.
    button.click(
        inference,
        inputs=[
            text_input,
            global_style_dropdown,
            alpha_slider,
            phoneme_preds_df,
            phoneme_edits_df,
        ],
        outputs=[audio, phoneme_preds_df, phoneme_edits_df],
    )


demo.launch()