# Spaces: Runtime error
# (Hosting-page status banner captured along with this file; it is not part of the program.)
import tempfile | |
from typing import Optional | |
from TTS.config import load_config | |
import gradio as gr | |
import numpy as np | |
import os | |
from TTS.utils.manage import ModelManager | |
from TTS.utils.synthesizer import Synthesizer | |
from espeak_phonemizer import Phonemizer | |
from engine import Piper | |
from festival import festival_synthesize | |
# Maximum number of characters passed to the synthesizers; tts() truncates
# anything longer.
MAX_TXT_LEN = 325

# Phonemizer created with the "ca" voice; tts() uses it to produce the phoneme
# transcription shown next to the audio outputs.
fonemitzador = Phonemizer("ca")
def carrega_bsc():
    """Build the Synthesizer for the model files stored under ``models/bsc``.

    All paths are resolved against the current working directory, so the
    process must be started from the directory that contains ``models/``.

    Returns:
        The ``Synthesizer`` built from the checkpoint, config and speakers
        file. No vocoder checkpoint or vocoder config is supplied.
    """
    base_dir = os.getcwd()
    model_path = base_dir + "/models/bsc/best_model.pth"
    config_path = base_dir + "/models/bsc/config.json"
    speakers_file_path = base_dir + "/models/bsc/speakers.pth"
    vocoder_path = None
    vocoder_config_path = None
    # The fourth positional argument is left unset (None).
    # NOTE(review): presumably the languages file -- confirm against the
    # Synthesizer signature of the installed TTS version.
    return Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        None,
        vocoder_path,
        vocoder_config_path,
    )
def carrega_collectivat():
    """Build the Synthesizer for the model files under ``models/collectivat``.

    All paths are resolved against the current working directory, so the
    process must be started from the directory that contains ``models/``.

    Returns:
        The ``Synthesizer`` built from the model checkpoint/config and the
        HiFi-GAN vocoder checkpoint/config. The third and fourth positional
        arguments are passed as ``None``.
    """
    base_dir = os.getcwd()
    model_path = base_dir + "/models/collectivat/fast-speech_best_model.pth"
    config_path = base_dir + "/models/collectivat/fast-speech_config.json"
    vocoder_path = base_dir + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config_path = base_dir + "/models/collectivat/ljspeech--hifigan_v2_config.json"
    return Synthesizer(
        model_path,
        config_path,
        None,
        None,
        vocoder_path,
        vocoder_config_path,
    )
def carrega_piper():
    """Create the Piper engine from the ONNX voice file under ``models/piper``.

    The path is resolved against the current working directory.

    Returns:
        The ``Piper`` instance constructed with the voice file path.
    """
    voice_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
    return Piper(voice_path)
# Every engine is loaded once at import time; tts() reuses these module-level
# instances on each call.
model_bsc = carrega_bsc()
# Speaker names reported by the BSC model; they are the choices of the
# speaker dropdown in the interface.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
model_collectivat = carrega_collectivat()
model_piper = carrega_piper()
def tts(text, festival_voice, speaker_idx):
    """Synthesize *text* with every engine and return the results for the UI.

    Args:
        text: Text to synthesize. Anything beyond ``MAX_TXT_LEN`` characters
            is dropped before synthesis.
        festival_voice: Voice name forwarded to ``festival_synthesize``.
        speaker_idx: Speaker forwarded to the BSC model.

    Returns:
        A 5-tuple ``(fonemes, fp_festival, fp_bsc, fp_coll, fp_piper)``: the
        phoneme transcription, the value returned by ``festival_synthesize``
        (presumably a file path, since it feeds a ``filepath`` audio output --
        confirm in the ``festival`` module), and the paths of the temporary
        ``.wav`` files written for the BSC, Collectivat and Piper engines.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # Run the three neural engines on the (possibly truncated) text.
    wav_bsc = model_bsc.tts(text, speaker_idx)
    wav_coll = model_collectivat.tts(text)
    wav_piper = model_piper.synthesize(text)

    # delete=False keeps each file on disk after the handle is closed so the
    # returned paths stay readable; this function never removes them.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        model_bsc.save_wav(wav_bsc, fp)
        fp_bsc = fp.name

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        model_collectivat.save_wav(wav_coll, fp)
        fp_coll = fp.name

    # The Piper result is written to the file as-is.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        fp.write(wav_piper)
        fp_piper = fp.name

    fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)
    fp_festival = festival_synthesize(text, festival_voice)

    return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper
# Markdown text passed to the interface as its description.
description = """
Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català i amb el motor Festival.
1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
https://github.com/projecte-aina/espeak-ng
NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador. Festival conté les seves pròpies normes fonètiques.
"""
# Text passed to the interface as its article; intentionally empty.
article = ""
# Gradio UI wired to tts(): a text box and two voice selectors as inputs; the
# phoneme transcription and one audio player per engine as outputs. The output
# order matches the tuple returned by tts().
# NOTE: the deprecated `layout` keyword is intentionally not passed to
# gr.Interface.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
        ),
        gr.Dropdown(label="Parlant del motor Festival", choices=["ona", "pau"], value="ona"),
        gr.Dropdown(
            label="Parlant del model VITS multi-parlant del BSC",
            choices=SPEAKERS,
            value="ona",
        ),
    ],
    outputs=[
        gr.Markdown(label="Fonemes"),
        gr.Audio(label="Festival", type="filepath"),
        gr.Audio(label="BSC VITS", type="filepath"),
        gr.Audio(label="Collectivat Fastspeech", type="filepath"),
        gr.Audio(label="Piper VITS", type="filepath"),
    ],
    title="Comparativa de síntesi lliure en català️",
    description=description,
    article=article,
    allow_flagging="never",
    live=False,
)

# Serve on all network interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)