Spaces:
Sleeping
Sleeping
File size: 7,202 Bytes
c3bdc3d 132e1e7 e1ee9fa 5f502e0 c3bdc3d ce0b7be 6eb292b 132e1e7 fa58d3f 132e1e7 5f502e0 fa9217c 5f502e0 e675795 5f502e0 02d9c21 5f502e0 132e1e7 ce0b7be 132e1e7 ce0b7be 132e1e7 ce0b7be c3bdc3d a78c2eb c3bdc3d 132e1e7 c3bdc3d 132e1e7 c3bdc3d 132e1e7 c3bdc3d 132e1e7 c3bdc3d 3f392ee c3bdc3d a78c2eb 9b19b13 02d9c21 9b19b13 a78c2eb 336cd1f 9b19b13 336cd1f 9b19b13 a78c2eb 132e1e7 c3bdc3d 687219d 0893c84 687219d c3bdc3d 68e04c6 528ce6c 9b19b13 02d9c21 a78c2eb c3bdc3d 0893c84 c3bdc3d a78c2eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import gradio as gr
import os
# Build the monotonic_align extension in place by shelling out to its setup.py.
# The exit status returned by os.system is ignored, so a failed build is not
# reported here.
# NOTE(review): presumably this has to run before the project imports below
# that rely on the compiled extension -- confirm before moving this line.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols as symbols_default # import symbols graphemes
from fr_wa_graphemes.symbols import symbols as symbols_ft # import symbols finetuned
from wa_ft_graphemes_v2.symbols import symbols as symbols_ft_v2
from wa_graphemes_v2.symbols import symbols as symbols_v2
from scipy.io.wavfile import write
from text import cleaners
# Registry of selectable models. Each key is the model name used by the UI
# tabs; each value gives the generator checkpoint path and the symbol table
# whose length sizes the network and whose order defines the symbol ids.
model_configs = {
    "Graphemes_finetuned": {"path": "fr_wa_graphemes/G_80000.pth", "symbols": symbols_ft},
    "Graphemes": {"path": "wa_graphemes/G_258000.pth", "symbols": symbols_default},
    "Graphemes_v2": {"path": "wa_graphemes_v2/G_228000.pth", "symbols": symbols_v2},
    "Graphemes_finetuned_v2": {"path": "wa_ft_graphemes_v2/G_207000.pth", "symbols": symbols_ft_v2},
    "Graphemes_v3": {"path": "wa_graphemes_v3/G_394000.pth", "symbols": symbols_v2},
    "Graphemes_finetuned_v3": {"path": "wa_ft_v3/G_213000.pth", "symbols": symbols_ft_v2},
}

# Module-level state describing the currently active model.
# load_model_and_symbols() rebinds all four names.
net_g = None          # active synthesizer network (None until a model is loaded)
symbols = []          # symbol table of the active model
_symbol_to_id = {}    # symbol -> integer id, derived from `symbols`
_id_to_symbol = {}    # integer id -> symbol, derived from `symbols`
def text_to_sequence(text, cleaner_names):
    """Convert text into the symbol-id sequence of the active model.

    The text is first passed through the cleaners named in ``cleaner_names``
    (see ``_clean_text``); each character of the cleaned text is then looked
    up in the module-level ``_symbol_to_id`` table.

    Characters that are not present in the table are skipped instead of
    raising ``KeyError``, so free-form user input containing an
    out-of-vocabulary character no longer aborts the whole request.

    Args:
        text: Input string to convert.
        cleaner_names: Iterable of cleaner function names applied in order.

    Returns:
        A list of integer symbol ids, in the order the known characters
        appear in the cleaned text. Empty if no character is known.
    """
    clean_text = _clean_text(text, cleaner_names)
    # Bind the table once so every character of this text is resolved against
    # the same mapping object.
    symbol_ids = _symbol_to_id
    return [symbol_ids[char] for char in clean_text if char in symbol_ids]
def _clean_text(text, cleaner_names):
    """Run ``text`` through each named cleaner, in order.

    Every name in ``cleaner_names`` is resolved as an attribute of the
    ``cleaners`` module and called with the current text; its return value
    becomes the input of the next cleaner.

    Args:
        text: Input string.
        cleaner_names: Iterable of attribute names in the ``cleaners`` module.

    Returns:
        The text after all cleaners have been applied (unchanged when
        ``cleaner_names`` is empty).

    Raises:
        Exception: If a name does not resolve to a cleaner.
    """
    for name in cleaner_names:
        # Pass a default so that a missing cleaner reaches the explicit
        # "Unknown cleaner" error below; without it getattr raises
        # AttributeError first and the check can never trigger.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
def get_text(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input string.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` selects the
            cleaners and ``hps.data.add_blank`` toggles blank interspersing.

    Returns:
        A ``torch.LongTensor`` built from the id sequence; when
        ``hps.data.add_blank`` is truthy the sequence is first passed through
        ``commons.intersperse`` with the value 0.
    """
    token_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        token_ids = commons.intersperse(token_ids, 0)
    return torch.LongTensor(token_ids)
def load_model_and_symbols(tab_name):
    """Make the model registered under ``tab_name`` the active model.

    Rebinds the module globals ``net_g``, ``symbols``, ``_symbol_to_id`` and
    ``_id_to_symbol`` to match the selected entry of ``model_configs``.

    Two safeguards compared with rebuilding unconditionally:

    * If the requested model is already the active one, nothing is rebuilt
      and the checkpoint is not read from disk again.
    * The new network is built and its checkpoint loaded into a local first;
      the globals are only rebound afterwards, so a failed load leaves the
      previously active model and symbol tables untouched.

    Args:
        tab_name: Key into ``model_configs``.

    Raises:
        KeyError: If ``tab_name`` is not a key of ``model_configs``.
    """
    global net_g, symbols, _symbol_to_id, _id_to_symbol

    # The name of the active model is remembered on the function object so no
    # additional module-level state is needed.
    already_active = getattr(load_model_and_symbols, "_active_name", None)
    if net_g is not None and already_active == tab_name:
        return

    model_config = model_configs[tab_name]
    new_symbols = model_config["symbols"]

    model = SynthesizerTrn(
        len(new_symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    model.eval()
    # The return value is discarded, as before: the weights are expected to
    # be loaded into `model` in place.
    utils.load_checkpoint(model_config["path"], model, None)

    # Publish the new state only once the checkpoint has loaded successfully.
    net_g = model
    symbols = new_symbols
    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
    _id_to_symbol = {i: s for i, s in enumerate(symbols)}
    load_model_and_symbols._active_name = tab_name
def tts(text, speaker_id, tab_name):
    """Synthesize ``text`` with the model registered under ``tab_name``.

    Args:
        text: Text to synthesize.
        speaker_id: Integer speaker identity passed to the model as ``sid``.
        tab_name: Key of the model in ``model_configs``.

    Returns:
        A pair ``("Success", (sampling_rate, audio))`` where ``sampling_rate``
        is ``hps.data.sampling_rate`` and ``audio`` is the generated waveform
        converted to a float NumPy array.
    """
    load_model_and_symbols(tab_name)

    speaker = torch.LongTensor([speaker_id])  # speaker identity
    token_ids = get_text(text, hps)

    with torch.no_grad():
        batch = token_ids.unsqueeze(0)
        batch_lengths = torch.LongTensor([token_ids.size(0)])
        outputs = net_g.infer(
            batch,
            batch_lengths,
            sid=speaker,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )
        # First element of the result holds the audio; take item [0, 0] of it.
        audio = outputs[0][0, 0].data.float().numpy()

    return "Success", (hps.data.sampling_rate, audio)
def create_tab(tab_name):
    """Build the UI tab for a single model and wire its button to ``tts``.

    Must be called inside an open Gradio layout context; the tab contains a
    text area, a speaker dropdown, a "Generate" button, a message box and an
    audio output.

    Args:
        tab_name: Model name; used as the tab title and passed to ``tts``.
    """
    with gr.TabItem(tab_name):
        gr.Markdown(f"### {tab_name} TTS Model")

        text_box = gr.TextArea(label="Text in Walloon", value="")
        # NOTE(review): with type="index" the handler is expected to receive
        # the position of the selected choice rather than its label -- see
        # the Gradio Dropdown documentation.
        speaker_box = gr.Dropdown(
            label="Speaker",
            choices=["Male", "Female"],
            type="index",
            value="Male",
        )
        generate_btn = gr.Button("Generate", variant="primary")
        message_box = gr.Textbox(label="Message")
        audio_box = gr.Audio(label="Output")

        # The lambda closes over this call's tab_name, so each tab invokes
        # tts with its own model.
        generate_btn.click(
            lambda text, speaker_id: tts(text, speaker_id, tab_name),
            inputs=[text_box, speaker_box],
            outputs=[message_box, audio_box],
        )
def tts_comparison(text, speaker_id):
    """Synthesize the same text with every compared model.

    Args:
        text: Text to synthesize.
        speaker_id: Integer speaker identity forwarded to ``tts``.

    Returns:
        A 4-tuple with the ``(sampling_rate, audio)`` part of each ``tts``
        result, in the order: "Graphemes", "Graphemes_finetuned",
        "Graphemes_v3", "Graphemes_finetuned_v3".
    """
    # "Graphemes_v2" and "Graphemes_finetuned_v2" are currently disabled and
    # therefore not part of the comparison.
    compared_models = (
        "Graphemes",
        "Graphemes_finetuned",
        "Graphemes_v3",
        "Graphemes_finetuned_v3",
    )
    # Element [1] of each tts() result is the audio payload; element [0] is
    # the status message, which is not shown in the comparison view.
    return tuple(tts(text, speaker_id, model_name)[1] for model_name in compared_models)
def create_comparison_tab():
    """Build the "Compare Models" tab and wire it to ``tts_comparison``.

    Must be called inside an open Gradio layout context. The tab holds one
    text area, one speaker dropdown, a "Generate" button and one audio output
    per compared model.
    """
    with gr.TabItem("Compare Models"):
        gr.Markdown("### Compare TTS Models")

        text_box = gr.TextArea(label="Text in Walloon", value="")
        speaker_box = gr.Dropdown(
            label="Speaker",
            choices=["Male", "Female"],
            type="index",
            value="Male",
        )
        generate_btn = gr.Button("Generate", variant="primary")

        # One player per model, in the same order as the tuple returned by
        # tts_comparison. The v2 players are currently disabled.
        audio_graphemes = gr.Audio(label="Graphemes")
        audio_finetuned = gr.Audio(label="Graphemes French Data")
        audio_v3 = gr.Audio(label="Graphemes v3")
        audio_finetuned_v3 = gr.Audio(label="Graphemes v3 French Data")

        generate_btn.click(
            lambda text, speaker_id: tts_comparison(text, speaker_id),
            inputs=[text_box, speaker_box],
            outputs=[audio_graphemes, audio_finetuned, audio_v3, audio_finetuned_v3],
        )
# Hyper-parameters shared by every model; read by get_text, tts and
# load_model_and_symbols through the module-level name `hps`.
hps = utils.get_hparams_from_file("configs/vctk_base.json")

# Page layout: introduction, one tab per model plus the comparison tab, then
# a table of example sentences. The Markdown bodies are kept flush-left so the
# rendered text is unchanged.
with gr.Blocks() as app:
    gr.Markdown(
        """
# First Text to Speech (TTS) for Walloon
Based on VITS (https://github.com/jaywalnut310/vits).
## How to use:
Write the text in graphemes. For faster inference, it is recommended to use short sentences.
The quality of the results varies between male and female voice due to the limited data for female voice on this language.
For better results with male voice, use the models fully trained on Walloon.
For better results with female voice, use the models trained on french and fine-tuned on Walloon.
To try the version trained in phonemes follow the link below:
https://huggingface.co/spaces/Pipe1213/VITS_Walloon_Phonemes
## Hint: Some sample texts are available at the bottom of the web site.
"""
    )

    with gr.Tabs():
        # "Graphemes_v2" and "Graphemes_finetuned_v2" are currently disabled.
        for model_name in (
            "Graphemes",
            "Graphemes_finetuned",
            "Graphemes_v3",
            "Graphemes_finetuned_v3",
        ):
            create_tab(model_name)
        create_comparison_tab()

    gr.Markdown(
        """
## Examples
| Input Text | Speaker |
|------------|---------|
| Portant, c' est l' seu ki n' doereut nén fé rire di lu, a mi idêye. | Female |
| Li bijhe et l’ solea estént ki s’ margayént po sawè kî çki, des deus, esteut l’ pus foirt. Mins ç’ côp la, la k’ i veyèt on tchminåd k' arivéve pyim piam, dins on bea noû tchôd paltot. | Male |
| Ci fourit co l' bedot les cåzes ca, a on moumint, li Ptit Prince mi dmanda yåk, come onk k' est so dotance, tot d' on côp | Female |
| Li Ptit Prince, da Antoenne di Sint-Spuri, ratourné e walon pa Lorint Enchel | Female |
"""
    )

app.launch()
|