File size: 5,830 Bytes
c35d162
30c3950
d3b5ad0
30c3950
 
 
 
 
 
 
 
 
 
 
 
06435a9
 
c7920d9
06435a9
c7920d9
 
 
 
06435a9
c7920d9
 
 
06435a9
c7920d9
 
30c3950
0ebf05c
c7920d9
 
 
 
 
468533f
c7920d9
 
 
 
 
 
468533f
 
c7920d9
 
 
 
 
 
468533f
30c3950
 
 
 
 
 
 
06435a9
 
 
 
 
 
 
f272b23
5a320c5
f272b23
 
 
 
 
06435a9
5cf39a1
a4a82c5
06435a9
d06d609
a3d1f8d
5cf39a1
 
 
 
a3d1f8d
5cf39a1
 
 
b9a1bad
 
6110406
1f2b8fe
b9a1bad
 
 
 
 
 
a5eeb17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7920d9
 
5cf39a1
 
6110406
 
038f0e8
 
1f2b8fe
468533f
 
 
 
 
 
 
 
 
0ebf05c
 
6110406
 
b9a1bad
 
c7920d9
a5eeb17
5cf39a1
6110406
 
 
468533f
 
 
 
 
6110406
 
30c3950
a5eeb17
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import gradio as gr
import os
# Build the monotonic_align Cython extension in place before the VITS model
# modules (imported below) need it for alignment search at inference time.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
# Two symbol inventories: the default set and a phoneme-specific set; which one
# is active depends on the model selected in model_configs below.
from text.symbols import symbols as symbols_default
from text.symbols_pho import symbols_pho
from scipy.io.wavfile import write
from text import cleaners

# Registry of selectable TTS models: maps the UI tab name to the checkpoint
# path and the symbol inventory that checkpoint was trained with.
model_configs = {
    "Phonemes_finetuned": {
        # Trained on French, then fine-tuned on Walloon phoneme data.
        "path": "fr_wa_finetuned_pho/G_125000.pth",
        "symbols": symbols_default
    },
    "Phonemes": {
        # Trained from scratch on Walloon phoneme data.
        "path": "wallon_pho/G_277000.pth",
        "symbols": symbols_pho
    }
}

# Module-level state set by load_model_and_symbols() for the currently
# selected model; read by text_to_sequence() and tts().
net_g = None          # active SynthesizerTrn instance (None until first load)
symbols = []          # symbol inventory of the active model
_symbol_to_id = {}    # symbol -> integer id lookup for the active inventory
_id_to_symbol = {}    # integer id -> symbol reverse lookup

def text_to_sequence(text, cleaner_names):
    """Convert *text* to a list of symbol ids for the active model.

    The text is first run through the named cleaners, then each resulting
    character is mapped through the module-level ``_symbol_to_id`` table.
    Raises KeyError for characters outside the active symbol inventory.
    """
    cleaned = _clean_text(text, cleaner_names)
    return [_symbol_to_id[ch] for ch in cleaned]

def _clean_text(text, cleaner_names):
    """Apply each named cleaner function from ``text.cleaners`` to *text*.

    Args:
        text: raw input string.
        cleaner_names: iterable of cleaner function names, applied in order.

    Returns:
        The cleaned string.

    Raises:
        Exception: if a name does not match any cleaner in ``text.cleaners``.
    """
    for name in cleaner_names:
        # Pass a default of None: a bare getattr(cleaners, name) raises
        # AttributeError first, making the explicit "Unknown cleaner" error
        # below unreachable.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text

def get_text(text, hps):
    """Encode *text* into a LongTensor of symbol ids per the hparams.

    When ``hps.data.add_blank`` is set, a blank id (0) is interleaved
    between consecutive symbols, matching the model's training setup.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

def load_model_and_symbols(tab_name):
    """Load the checkpoint and symbol tables for *tab_name* into module globals.

    Sets ``net_g``, ``symbols``, ``_symbol_to_id`` and ``_id_to_symbol`` for
    the model registered under *tab_name* in ``model_configs``.

    The previously active tab is remembered on a function attribute so that
    repeated tts() calls for the same model skip the expensive checkpoint
    reload from disk (the original code reloaded on every synthesis request).
    """
    global net_g, symbols, _symbol_to_id, _id_to_symbol
    # Fast path: requested model is already loaded.
    if net_g is not None and getattr(load_model_and_symbols, "_active_tab", None) == tab_name:
        return
    model_config = model_configs[tab_name]
    symbols = model_config["symbols"]
    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
    _id_to_symbol = {i: s for i, s in enumerate(symbols)}

    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    _ = net_g.eval()
    _ = utils.load_checkpoint(model_config["path"], net_g, None)
    # Only mark the tab as active once the checkpoint loaded successfully.
    load_model_and_symbols._active_tab = tab_name

def tts(text, speaker_id, tab_name):
    """Synthesize *text* with the model registered under *tab_name*.

    Args:
        text: input text (phonemes or graphemes, per the chosen model).
        speaker_id: integer speaker index (0 = Male, 1 = Female in the UI).
        tab_name: key into ``model_configs`` selecting the checkpoint.

    Returns:
        A ("Success", (sampling_rate, waveform ndarray)) pair for Gradio.
    """
    load_model_and_symbols(tab_name)
    speaker = torch.LongTensor([speaker_id])  # speaker identity
    encoded = get_text(text, hps)

    with torch.no_grad():
        inputs = encoded.unsqueeze(0)
        input_lengths = torch.LongTensor([encoded.size(0)])
        generated = net_g.infer(
            inputs, input_lengths, sid=speaker,
            noise_scale=.667, noise_scale_w=0.8, length_scale=1)
        audio = generated[0][0, 0].data.float().numpy()
    return "Success", (hps.data.sampling_rate, audio)

def create_tab(tab_name):
    """Build one model tab: text input, speaker picker, and audio output.

    The Generate button routes through tts() with *tab_name* bound via the
    lambda's enclosing parameter, so each tab drives its own model.
    """
    with gr.TabItem(tab_name):
        gr.Markdown(f"### {tab_name} TTS Model")
        text_box = gr.TextArea(label="Text in Walloon on IPA phonemes", value="")
        speaker_dd = gr.Dropdown(label="Speaker", choices=["Male", "Female"], type="index", value="Male")
        generate_btn = gr.Button("Generate", variant="primary")
        message_out = gr.Textbox(label="Message")
        audio_out = gr.Audio(label="Output")
        generate_btn.click(
            lambda text, speaker_id: tts(text, speaker_id, tab_name),
            [text_box, speaker_dd],
            [message_out, audio_out])

def tts_comparison(text, speaker_id):
    """Run both models on the same input and return their audio outputs.

    Drops the status message from each tts() result, keeping only the
    (sampling_rate, waveform) pair for the two audio widgets.
    """
    audio_pairs = [
        tts(text, speaker_id, model_name)[1]
        for model_name in ("Phonemes_finetuned", "Phonemes")
    ]
    return audio_pairs[0], audio_pairs[1]

def create_comparison_tab():
    """Build the side-by-side comparison tab driving both models at once."""
    with gr.TabItem("Compare Models"):
        gr.Markdown("### Compare TTS Models")
        text_box = gr.TextArea(label="Text in Walloon on IPA phonemes", value="")
        speaker_dd = gr.Dropdown(label="Speaker", choices=["Male", "Female"], type="index", value="Male")
        generate_btn = gr.Button("Generate", variant="primary")
        finetuned_audio = gr.Audio(label="Phonemes Finetuned Output")
        scratch_audio = gr.Audio(label="Phonemes Output")
        generate_btn.click(
            lambda text, speaker_id: tts_comparison(text, speaker_id),
            [text_box, speaker_dd],
            [finetuned_audio, scratch_audio])

# Shared hyperparameters for all checkpoints; read by load_model_and_symbols()
# and tts() above.
hps = utils.get_hparams_from_file("configs/vctk_base.json")

# Assemble the Gradio UI: intro text, one tab per model, a comparison tab,
# and a table of sample inputs.
app = gr.Blocks()
with app:
    gr.Markdown(
        """
        # First Text to Speech (TTS) for Walloon
        Based on VITS (https://github.com/jaywalnut310/vits).
        Write the text in phonemes or graphemes depending on the model.
        For faster inference, it is recommended to use short sentences.
        
        The quality of the results varies between male and female voice due to the limited data for female voice on this language. 
        For better results with male voice, use the models fully trained on Walloon.
        For better results with female voice, use the models trained on french and fine-tuned on Walloon.
        
        To try the version trained in graphemes follow the link below:
        https://huggingface.co/spaces/Pipe1213/VITS_Walloon_Graphemes
        
        ### Hint: Some sample texts are available at the bottom of the web site.
        ### Hint: For faster inference speed it is recommended to use short sentences.
        """
    )
    with gr.Tabs():
        create_tab("Phonemes_finetuned")
        create_tab("Phonemes")
        create_comparison_tab()

    # Example IPA inputs users can copy into the text boxes above.
    gr.Markdown(
        """
        ### Examples
        | Input Text | Speaker |
        |------------|---------|
        | li biːç ɛ l sɔlja ɛstẽ ki s maʁɡajẽ pɔ sawɛ kiː ski , dɛ døː , ɛstøː l py fwaʁ . m ɛ̃ s koː la , la k i vɛjɛ õ tsminɔː k aʁivef pjim pjam , d ɛ̃ õ bja nuː tsoː paltɔ .  | Female |
        | ɛl m õ ʁɛspõdu ,  duvẽ ɔːʁẽ n pøː d õ tsapja . | Male |
        | dɔ koː , dz a dvu tswɛzi ɛn oːt mɛstiː , dz ast apʁ ɛ̃ a mõne dɛz avjõ .| Female |
        """
    )

app.launch()