|
import gradio as gr
import torch
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model, load_model
from phonemizer import phonemize
from phonemizer.separator import Separator
|
# Phonemizer separator: space-separated phonemes, no word boundaries.
s = Separator(word=None, phone=" ")

# Acoustic model checkpoint and configuration.
config_path = "config.yaml"
model_path = "model.pth"

# Pretrained Parallel WaveGAN vocoder trained on LJSpeech, run on CPU.
vocoder_tag = "ljspeech_parallel_wavegan.v3"
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cpu").eval()
# Weight norm is only needed during training; drop it for inference.
vocoder.remove_weight_norm()
|
# Precomputed global style embeddings, one per selectable speaking style.
global_styles = {
    "Style 1": torch.load("style1.pt", map_location="cpu"),
    "Style 2": torch.load("style2.pt", map_location="cpu"),
    "Style 3": torch.load("style3.pt", map_location="cpu"),
    "Style 4": torch.load("style4.pt", map_location="cpu"),
    "Style 5": torch.load("style5.pt", map_location="cpu"),
    "Style 6": torch.load("style6.pt", map_location="cpu"),
}
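# Note: each style*.pt file is assumed to hold a single precomputed
# reference-style embedding tensor; inference() flattens it and passes it to
# the model as `ref_embs`, so any flattenable float tensor of the expected
# size would work here.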
|
def inference(text, global_style, alpha, prev_fg_inds, input_fg_inds):
    with torch.no_grad():
        # The model is rebuilt on every call so the chosen alpha can be
        # applied (speed_control_alpha is fixed at construction time).
        text2speech = Text2Speech(
            config_path,
            model_path,
            device="cpu",
            # Decoding options.
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
            # Duration scaling: higher alpha means slower speech.
            speed_control_alpha=alpha,
        )
        # Disable the built-in spectrogram-to-waveform stage; Parallel
        # WaveGAN is used for vocoding instead.
        text2speech.spc2wav = None

        style_emb = torch.flatten(global_styles[global_style])

        # Convert the input text to a space-separated MBROLA phoneme string.
        phoneme_string = phonemize(
            text, language="mb-us1", backend="espeak-mbrola", separator=s
        )
        phonemes = phoneme_string.split(" ")

        # Find the last phoneme whose prosody embedding the user edited.
        max_edit_index = -1
        for i in range(len(input_fg_inds) - 1, -1, -1):
            if input_fg_inds[i] != "":
                max_edit_index = i
                break

        if max_edit_index == -1:
            # No edits: let the model predict all fine-grained prosody
            # embedding indices itself.
            _, c, _, _, _, _, _, output_fg_inds = text2speech(
                phoneme_string, ref_embs=style_emb
            )
        else:
            # Complete the prefix up to the last edit: keep the user's edits
            # and fall back to the previous top predictions elsewhere.
            input_fg_inds_int_list = []
            for i in range(max_edit_index + 1):
                if input_fg_inds[i] != "":
                    input_fg_inds_int_list.append(int(input_fg_inds[i]))
                else:
                    input_fg_inds_int_list.append(prev_fg_inds[i][1])
            input_fg_inds = input_fg_inds_int_list

            # Keep only the three prediction columns of the previous table.
            prev_fg_inds_list = [[[row[1], row[2], row[3]] for row in prev_fg_inds]]
            prev_fg_inds = torch.tensor(prev_fg_inds_list, dtype=torch.int64)

            # Re-synthesize with the edited prefix fixed; the model predicts
            # new prosody embeddings only for the phonemes after the edit.
            fg_inds = torch.tensor(input_fg_inds_int_list).unsqueeze(0)
            _, c, _, _, _, _, _, part_output_fg_inds = text2speech(
                phoneme_string, ref_embs=style_emb, fg_inds=fg_inds
            )

            prev_fg_inds[0, max_edit_index + 1 :, :] = part_output_fg_inds[0]
            output_fg_inds = prev_fg_inds

        # One table row per phoneme (with a leading blank row) holding the
        # top-3 predicted prosody embedding ids.
        output_fg_inds_list = output_fg_inds.tolist()[0]
        padded_phonemes = ["", *phonemes]
        dataframe_values = [
            [phoneme, *fgs]
            for phoneme, fgs in zip(padded_phonemes, output_fg_inds_list)
        ]
        # Echo the (completed) user selections back into the edits column.
        selected_inds = [
            [input_fg_inds[i]] if i < len(input_fg_inds) else [""]
            for i in range(len(padded_phonemes))
        ]
        # Vocode the predicted acoustic features into a waveform.
        wav = vocoder.inference(c)

    return [
        (22050, wav.view(-1).cpu().numpy()),  # LJSpeech sampling rate
        dataframe_values,
        selected_inds,
    ]
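# A minimal sketch of calling inference() directly, outside the UI. Passing
# empty lists for both tables corresponds to the state before any generation,
# so the model predicts every prosody embedding itself. The variable names
# below are illustrative only:
#
#     audio_out, preds, chosen = inference(
#         "I didn't say he stole the money", "Style 1", 1.0, [], []
#     )
#     sample_rate, waveform = audio_out  # (22050, np.ndarray)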
|
demo = gr.Blocks()

with demo:
    gr.Markdown(
        """
        # ConEx Demo

        This demo shows the capabilities of ConEx, a model for **Con**trollable **Ex**pressive speech synthesis.
        ConEx lets you generate speech in a chosen speaking style and edit the prosody* of the generated speech at a fine-grained level.
        We proposed ConEx in our paper titled ["Interactive Multi-Level Prosody Control for Expressive Speech Synthesis"](https://jessa.github.io/assets/pdf/cornille2022icassp.pdf), published in the proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2022.

        To convert text to speech: enter some text, choose the desired speaking style, set the duration factor (higher = slower speech), and press "Generate Speech".

        \**prosody refers to speech characteristics such as intonation, stress, and rhythm*
        """
    )
|
    with gr.Row():
        text_input = gr.Textbox(
            label="Input text",
            lines=4,
            placeholder="E.g. I didn't say he stole the money",
        )
        with gr.Column():
            global_style_dropdown = gr.Dropdown(
                ["Style 1", "Style 2", "Style 3", "Style 4", "Style 5", "Style 6"],
                value="Style 1",
                label="Global speaking style",
            )
            alpha_slider = gr.Slider(
                0.1, 2, value=1, step=0.1, label="Alpha (duration factor)"
            )

    audio = gr.Audio()
    with gr.Row():
        button = gr.Button("Generate Speech")
|
    gr.Markdown(
        """
        ### Fine-grained prosody editor
        Once you've generated some speech, the table below shows the id of the prosody embedding used for each phoneme.
        A prosody embedding determines the prosody of its phoneme.
        Besides the prosody embeddings used by default (the top predictions), the table also shows the next two most likely prosody embeddings.

        To change the prosody of a phoneme, enter a new prosody embedding id in the "Chosen prosody embeddings" column and press "Generate Speech" again.
        You can use any id from 0 to 31, but the 2nd and 3rd predictions are more likely to give a fitting prosody.
        Based on your edit, new prosody embeddings will be generated for the phonemes after the edit.
        You can therefore refine the prosody iteratively: start at the beginning of the utterance and work your way through it, making edits as you see fit.
        The prosody embeddings before your edit remain unchanged and are copied to the "Chosen prosody embeddings" column.
        """
    )
|
    with gr.Row():
        phoneme_preds_df = gr.Dataframe(
            headers=["Phoneme", "🥇 Top pred", "🥈 2nd pred", "🥉 3rd pred"],
            type="array",
            col_count=(4, "static"),
        )
        phoneme_edits_df = gr.Dataframe(
            headers=["Chosen prosody embeddings"],
            type="array",
            col_count=(1, "static"),
        )
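    # Values typed into the edits table are assumed to arrive as strings
    # (hence the "" checks and int() casts in inference()), while the
    # predictions table round-trips the model's integer outputs.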
|
    button.click(
        inference,
        inputs=[
            text_input,
            global_style_dropdown,
            alpha_slider,
            phoneme_preds_df,
            phoneme_edits_df,
        ],
        outputs=[audio, phoneme_preds_df, phoneme_edits_df],
    )
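    # The two dataframes are both inputs and outputs: each click feeds the
    # previous predictions and edits back into inference(), enabling the
    # iterative editing workflow described above.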
|
demo.launch() |