File size: 4,747 Bytes
7f0cc16 53016e3 c19313c 7f0cc16 8bb6a40 7f0cc16 53016e3 7514dcc 41b7aed 7514dcc b8920a0 b79ce49 51e62ea b79ce49 b8920a0 7514dcc aa00ca9 7514dcc b8920a0 7514dcc b79ce49 c19313c 7f0cc16 2c3e79f 7f0cc16 f47653c 7f0cc16 f47653c 7f0cc16 f47653c cd62b9b 53016e3 c19313c 7f0cc16 2760abb 53016e3 2760abb 53016e3 8bb6a40 7f0cc16 53016e3 3729983 2760abb 7f0cc16 2c3e79f 7f0cc16 6cfa8d7 7f0cc16 c19313c 2c3e79f 6cfa8d7 c19313c f47653c 7f0cc16 53016e3 7f0cc16 4e5413c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import tempfile
from typing import Optional
from TTS.config import load_config
import gradio as gr
import numpy as np
import os
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from espeak_phonemizer import Phonemizer
from engine import Piper
from festival import festival_synthesize
# Maximum number of input characters; tts() truncates longer text to this length.
MAX_TXT_LEN = 325
# Phonemizer from espeak_phonemizer, created with the "ca" argument; tts() calls
# its phonemize() method to produce the phoneme string shown in the UI.
fonemitzador = Phonemizer("ca")
def carrega_bsc():
    """Create the Synthesizer for the BSC model stored under ``models/bsc``.

    All paths are resolved against the current working directory. A model
    checkpoint, its config and a speakers file are supplied; the fourth
    positional argument and both vocoder arguments are passed as ``None``.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/bsc/best_model.pth"
    config = cwd + "/models/bsc/config.json"
    speakers = cwd + "/models/bsc/speakers.pth"
    # No vocoder checkpoint or vocoder config is given for this model.
    vocoder_checkpoint, vocoder_config = None, None
    return Synthesizer(
        checkpoint,
        config,
        speakers,
        None,
        vocoder_checkpoint,
        vocoder_config,
    )
def carrega_collectivat():
    """Create the Synthesizer for the model stored under ``models/collectivat``.

    All paths are resolved against the current working directory. A model
    checkpoint and config are supplied together with a vocoder checkpoint
    and vocoder config; the third and fourth positional arguments are ``None``.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/collectivat/fast-speech_best_model.pth"
    config = cwd + "/models/collectivat/fast-speech_config.json"
    vocoder_checkpoint = cwd + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config = cwd + "/models/collectivat/ljspeech--hifigan_v2_config.json"
    return Synthesizer(
        checkpoint,
        config,
        None,
        None,
        vocoder_checkpoint,
        vocoder_config,
    )
def carrega_piper():
    """Create the Piper engine from the ONNX file under ``models/piper``.

    The path is resolved against the current working directory.

    Returns:
        The constructed ``Piper`` instance.
    """
    onnx_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
    return Piper(onnx_path)
# Build every engine once at module load; tts() reuses these module-level objects.
model_bsc = carrega_bsc()
# Speaker names read from the BSC model's speaker manager; used as the choices
# of the BSC speaker dropdown in the interface.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
model_collectivat = carrega_collectivat()
model_piper = carrega_piper()
def tts(text, festival_voice, speaker_idx):
    """Synthesize ``text`` with every engine and return the results for the UI.

    Text longer than ``MAX_TXT_LEN`` characters is truncated (a notice is
    printed), and the text that is actually synthesized is printed as well.

    Args:
        text: The text to synthesize.
        festival_voice: Voice name forwarded to ``festival_synthesize``.
        speaker_idx: Speaker forwarded to the BSC model's ``tts`` call.

    Returns:
        A 5-tuple ``(fonemes, fp_festival, fp_bsc, fp_coll, fp_piper)``:
        the phonemizer output, the value returned by ``festival_synthesize``,
        and the paths of the temporary ``.wav`` files written for the BSC,
        Collectivat and Piper engines. The temporary files are created with
        ``delete=False`` and are not removed by this function.
    """

    def _to_temp_wav(write_audio):
        # Create a persistent temporary .wav file, let the callback fill it,
        # and hand back its path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            write_audio(handle)
            return handle.name

    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # Run all three neural engines before any file is written.
    bsc_audio = model_bsc.tts(text, speaker_idx)
    collectivat_audio = model_collectivat.tts(text)
    piper_audio = model_piper.synthesize(text)

    bsc_path = _to_temp_wav(lambda handle: model_bsc.save_wav(bsc_audio, handle))
    collectivat_path = _to_temp_wav(
        lambda handle: model_collectivat.save_wav(collectivat_audio, handle)
    )
    # The Piper result is written to the file as-is (presumably already
    # WAV-encoded bytes; confirm against the engine module).
    piper_path = _to_temp_wav(lambda handle: handle.write(piper_audio))

    phonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)
    festival_path = festival_synthesize(text, festival_voice)
    return phonemes, festival_path, bsc_path, collectivat_path, piper_path
# Markdown text (in Catalan) passed as the `description` argument of the
# gr.Interface built below. It is user-facing content.
description="""
Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català i amb el motor Festival.
1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
https://github.com/projecte-aina/espeak-ng
NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador. Festival conté les seves pròpies normes fonètiques.
"""
# Text passed as the `article` argument of the interface; currently empty.
article= ""
# Input widgets, in the positional order of tts(text, festival_voice, speaker_idx).
_input_components = [
    gr.Textbox(
        label="Text",
        value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
    ),
    gr.Dropdown(label="Parlant del motor Festival", choices=["ona", "pau"], value="ona"),
    gr.Dropdown(label="Parlant del model VITS multi-parlant del BSC", choices=SPEAKERS, value="ona"),
]

# Output widgets, in the order of the tuple returned by tts():
# phonemes first, then one audio file path per engine.
_output_components = [
    gr.Markdown(label="Fonemes"),
    gr.Audio(label="Festival", type="filepath"),
    gr.Audio(label="BSC VITS", type="filepath"),
    gr.Audio(label="Collectivat Fastspeech", type="filepath"),
    gr.Audio(label="Piper VITS", type="filepath"),
]

iface = gr.Interface(
    fn=tts,
    inputs=_input_components,
    outputs=_output_components,
    title="Comparativa de síntesi lliure en català️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False,
)

# Start the web server bound to every network interface on port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)
|