File size: 4,300 Bytes
7ae4987 7f0cc16 53016e3 7f0cc16 53016e3 7514dcc 41b7aed 7514dcc b8920a0 b79ce49 51e62ea b79ce49 b8920a0 7514dcc aa00ca9 7514dcc b8920a0 7514dcc b79ce49 7f0cc16 2c3e79f 7f0cc16 f47653c 7f0cc16 f47653c 7f0cc16 f47653c 53016e3 7f0cc16 53016e3 7f0cc16 53016e3 7f0cc16 53016e3 7f0cc16 2c3e79f 7f0cc16 6cfa8d7 7f0cc16 6cfa8d7 2c3e79f 6cfa8d7 f47653c 7f0cc16 53016e3 7f0cc16 4e5413c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from engine import Piper
import tempfile
from typing import Optional
from TTS.config import load_config
import gradio as gr
import numpy as np
import os
import json
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from espeak_phonemizer import Phonemizer
MAX_TXT_LEN = 100
SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']
fonemitzador = Phonemizer("ca")
def carrega_bsc():
model_path = os.getcwd() + "/models/bsc/best_model.pth"
config_path = os.getcwd() + "/models/bsc/config.json"
speakers_file_path = os.getcwd() + "/models/bsc/speakers.pth"
vocoder_path = None
vocoder_config_path = None
synthesizer = Synthesizer(
model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
)
return synthesizer
def carrega_collectivat():
model_path = os.getcwd() + "/models/collectivat/fast-speech_best_model.pth"
config_path = os.getcwd() + "/models/collectivat/fast-speech_config.json"
vocoder_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
vocoder_config_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_config.json"
synthesizer = Synthesizer(
model_path, config_path, None, None, vocoder_path, vocoder_config_path
)
return synthesizer
def carrega_piper():
return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")
model_bsc = carrega_bsc()
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
model_collectivat = carrega_collectivat()
model_piper = carrega_piper()
def tts(text, speaker_idx):
if len(text) > MAX_TXT_LEN:
text = text[:MAX_TXT_LEN]
print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
print(text)
# synthesize
wav_bsc = model_bsc.tts(text, speaker_idx)
wav_coll = model_collectivat.tts(text)
wav_piper = model_piper.synthesize(text)
#return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)
# return output
fp_bsc = ""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
model_bsc.save_wav(wav_bsc, fp)
fp_bsc = fp.name
fp_coll = ""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
model_collectivat.save_wav(wav_coll, fp)
fp_coll = fp.name
fp_piper = ""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
fp.write(wav_piper)
fp_piper = fp.name
fonemes = fonemitzador.phonemize(text)
return fonemes, fp_bsc, fp_coll, fp_piper
description="""
Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models lliures pel català.
1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina)
https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker
2. Model Fastspeech entrenat per Col·lectivat
https://github.com/CollectivaT-dev/TTS-API
3. Model VITS entrenat per Piper/Home Assistant
https://github.com/rhasspy/piper
Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT, que va servir com a base per a les veus catalanes de Festival
El primer model conté moltes veus de qualitat variable. Podeu sel·leccionar-ne una altre al desplegable. La veu d'Ona esta sel·leccionada per defecte per la comparativa.
Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
https://github.com/projecte-aina/espeak-ng
"""
article= ""
iface = gr.Interface(
fn=tts,
inputs=[
gr.Textbox(
label="Text",
value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
),
gr.Dropdown(label="Selecciona un parlant", choices=SPEAKERS, value="ona")
],
outputs=[
gr.Markdown(label="Fonemes"),
gr.Audio(label="BSC VITS",type="filepath"),
gr.Audio(label="Collectivat Fastspeech",type="filepath"),
gr.Audio(label="Piper VITS",type="filepath")
],
title="Comparativa de síntesi lliure en català️",
description=description,
article=article,
allow_flagging="never",
layout="vertical",
live=False
)
iface.launch(server_name="0.0.0.0", server_port=7860)
|