File size: 4,300 Bytes
7ae4987
7f0cc16
 
 
 
 
 
 
 
 
53016e3
7f0cc16
 
 
 
 
53016e3
 
7514dcc
 
 
 
 
 
 
 
41b7aed
7514dcc
 
 
 
b8920a0
 
 
 
 
 
 
 
 
 
 
b79ce49
51e62ea
b79ce49
b8920a0
7514dcc
aa00ca9
7514dcc
b8920a0
7514dcc
b79ce49
 
7f0cc16
 
 
 
 
 
2c3e79f
 
 
 
7f0cc16
f47653c
7f0cc16
 
f47653c
7f0cc16
f47653c
 
 
 
 
 
 
 
 
 
 
 
 
53016e3
 
 
7f0cc16
 
 
53016e3
 
 
 
 
 
 
 
 
 
 
 
7f0cc16
53016e3
7f0cc16
53016e3
 
7f0cc16
 
 
 
 
 
2c3e79f
7f0cc16
6cfa8d7
7f0cc16
6cfa8d7
2c3e79f
 
6cfa8d7
f47653c
 
 
7f0cc16
53016e3
7f0cc16
 
 
 
 
 
4e5413c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from engine import Piper
import tempfile
from typing import Optional
from TTS.config import load_config
import gradio as gr
import numpy as np
import os
import json
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from espeak_phonemizer import Phonemizer

MAX_TXT_LEN = 100

SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']

fonemitzador = Phonemizer("ca")

def carrega_bsc():
    model_path = os.getcwd() + "/models/bsc/best_model.pth"
    config_path = os.getcwd() + "/models/bsc/config.json"
    speakers_file_path = os.getcwd() + "/models/bsc/speakers.pth"
    vocoder_path = None
    vocoder_config_path = None

    synthesizer = Synthesizer(
        model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
    )

    return synthesizer

def carrega_collectivat():
    model_path = os.getcwd() + "/models/collectivat/fast-speech_best_model.pth"
    config_path = os.getcwd() + "/models/collectivat/fast-speech_config.json"
    vocoder_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_config.json"
    synthesizer = Synthesizer(
        model_path, config_path, None, None, vocoder_path, vocoder_config_path
    )

    return synthesizer

def carrega_piper():
    return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")


model_bsc = carrega_bsc()
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

model_collectivat = carrega_collectivat()

model_piper = carrega_piper()

def tts(text, speaker_idx):
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # synthesize
    wav_bsc = model_bsc.tts(text, speaker_idx)
    wav_coll = model_collectivat.tts(text)
    wav_piper = model_piper.synthesize(text)

    #return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)

    # return output
    fp_bsc = ""
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        model_bsc.save_wav(wav_bsc, fp)
        fp_bsc = fp.name

    fp_coll = ""
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        model_collectivat.save_wav(wav_coll, fp)
        fp_coll = fp.name

    fp_piper = ""
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        fp.write(wav_piper)
        fp_piper = fp.name

    fonemes = fonemitzador.phonemize(text)

    return fonemes, fp_bsc, fp_coll, fp_piper


description="""
Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models lliures pel català.

1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina)
https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker

2. Model Fastspeech entrenat per Col·lectivat
https://github.com/CollectivaT-dev/TTS-API

3. Model VITS entrenat per Piper/Home Assistant
https://github.com/rhasspy/piper

Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT, que va servir com a base per a les veus catalanes de Festival

El primer model conté moltes veus de qualitat variable. Podeu sel·leccionar-ne una altre al desplegable. La veu d'Ona esta sel·leccionada per defecte per la comparativa.

Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
https://github.com/projecte-aina/espeak-ng
"""
article= ""

iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            value="L'Èlia i l'Alí a l'aula.  L'oli i l'ou.  Lulú olorava la lila.",
        ),
        gr.Dropdown(label="Selecciona un parlant", choices=SPEAKERS, value="ona")
    ],
    outputs=[
        gr.Markdown(label="Fonemes"),
        gr.Audio(label="BSC VITS",type="filepath"),
        gr.Audio(label="Collectivat Fastspeech",type="filepath"),
        gr.Audio(label="Piper VITS",type="filepath")
    ],
    title="Comparativa de síntesi lliure en català️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False
)
iface.launch(server_name="0.0.0.0", server_port=7860)