import re
import tempfile
import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from cached_path import cached_path
from transformers import AutoModelForCausalLM, AutoTokenizer
from num2words import num2words
# Optional dependency: the `spaces` package only exists when running on
# Hugging Face Spaces (ZeroGPU). Record availability so gpu_decorator can
# fall back to a no-op outside Spaces.
try:
    import spaces
    USING_SPACES = True
except ImportError:
    USING_SPACES = False
def gpu_decorator(func):
    """Attach the ZeroGPU decorator to *func* when running on HF Spaces.

    When the ``spaces`` package is unavailable (local execution), the
    function is returned untouched.
    """
    return spaces.GPU(func) if USING_SPACES else func
from f5_tts.model import DiT, UNetT
from f5_tts.infer.utils_infer import (
load_vocoder,
load_model,
preprocess_ref_audio_text,
infer_process,
remove_silence_for_generated_wav,
save_spectrogram,
)
# Shared vocoder instance used by every synthesis call.
vocoder = load_vocoder()
# Load the Spanish fine-tuned F5-TTS checkpoint (DiT backbone).
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
F5TTS_ema_model = load_model(
    DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
)
# Chat model/tokenizer are loaded lazily elsewhere; start unset.
chat_model_state = None
chat_tokenizer_state = None
@gpu_decorator
def generate_response(messages, model, tokenizer):
    """Generate a chat response using a Qwen causal LM.

    Args:
        messages: Chat history in HF chat-template format
            (list of ``{"role": ..., "content": ...}`` dicts).
        model: A loaded ``AutoModelForCausalLM``.
        tokenizer: The tokenizer matching *model*.

    Returns:
        str: The decoded assistant reply, special tokens stripped.
    """
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        # Fix: without do_sample=True, generate() decodes greedily and
        # silently ignores temperature/top_p — enable sampling so the
        # parameters below actually take effect.
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
def traducir_numero_a_texto(texto):
    """Spell out every standalone integer in *texto* as Spanish words.

    Digits glued to letters (e.g. ``"casa2"``) are first separated with a
    space so the number matches on a word boundary before conversion.

    Args:
        texto: Input text, possibly containing integers.

    Returns:
        str: The text with integers replaced by their Spanish spelling.
    """
    # [^\W\d_] matches any Unicode letter, so accented Spanish letters
    # (á, é, ñ, ...) are separated from digits too — the previous
    # [A-Za-z] class missed them.
    texto_separado = re.sub(r'([^\W\d_])(\d)', r'\1 \2', texto)
    texto_separado = re.sub(r'(\d)([^\W\d_])', r'\1 \2', texto_separado)

    def reemplazar_numero(match):
        # num2words renders the integer in Spanish ("23" -> "veintitrés").
        return num2words(int(match.group()), lang='es')

    return re.sub(r'\b\d+\b', reemplazar_numero, texto_separado)
@gpu_decorator
def infer(
    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
):
    """Synthesize *gen_text* in the voice of the reference audio.

    Args:
        ref_audio_orig: Path to the reference audio clip.
        ref_text: Transcript of the reference audio; empty string lets the
            preprocessor auto-transcribe it.
        gen_text: Text to synthesize.
        model: Model name selected in the UI (currently unused — the
            module-level F5-TTS checkpoint is always used).
        remove_silence: When True, post-process the output to trim silences.
        cross_fade_duration: Cross-fade duration (seconds) between chunks.
        speed: Speed multiplier for the generated audio.
        show_info: Callback used to surface status messages in the UI.

    Returns:
        tuple: ``((sample_rate, waveform ndarray), spectrogram_png_path)``.
    """
    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
    # NOTE(review): `model` is ignored; only the Spanish F5-TTS checkpoint
    # loaded at module level is available.
    ema_model = F5TTS_ema_model
    # Pad the prompt with a leading space and trailing ". " — presumably to
    # stabilize prosody at clip boundaries; confirm against upstream F5-TTS.
    if not gen_text.startswith(" "):
        gen_text = " " + gen_text
    if not gen_text.endswith(". "):
        gen_text += ". "
    # Lowercase and spell out digits as Spanish words before synthesis.
    gen_text = gen_text.lower()
    gen_text = traducir_numero_a_texto(gen_text)
    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )
    # Remove silences (experimental): round-trip through a temp WAV file
    # because the silence trimmer operates on files, then reload.
    if remove_silence:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            sf.write(f.name, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(f.name)
            final_wave, _ = torchaudio.load(f.name)
        final_wave = final_wave.squeeze().cpu().numpy()
    # Save the spectrogram to a temp PNG so Gradio can display it.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
        save_spectrogram(combined_spectrogram, spectrogram_path)
    return (final_sample_rate, final_wave), spectrogram_path
# Batch TTS tab: reference audio + text in, synthesized audio + spectrogram out.
with gr.Blocks() as app_tts:
    gr.Markdown("# TTS por Lotes")
    ref_audio_input = gr.Audio(label="Audio de Referencia", type="filepath")
    gen_text_input = gr.Textbox(label="Texto para Generar", lines=10)
    # Only one model is available; the radio exists for UI parity / future models.
    model_choice = gr.Radio(choices=["F5-TTS"], label="Seleccionar Modelo TTS", value="F5-TTS")
    generate_btn = gr.Button("Sintetizar", variant="primary")
    with gr.Accordion("Configuraciones Avanzadas", open=False):
        ref_text_input = gr.Textbox(
            label="Texto de Referencia",
            info="Deja en blanco para transcribir automáticamente el audio de referencia. Si ingresas texto, sobrescribirá la transcripción automática.",
            lines=2,
        )
        remove_silence = gr.Checkbox(
            label="Eliminar Silencios",
            info="El modelo tiende a producir silencios, especialmente en audios más largos. Podemos eliminar manualmente los silencios si es necesario. Ten en cuenta que esta es una característica experimental y puede producir resultados extraños. Esto también aumentará el tiempo de generación.",
            value=False,
        )
        speed_slider = gr.Slider(
            label="Velocidad",
            minimum=0.3,
            maximum=2.0,
            value=1.0,
            step=0.1,
            info="Ajusta la velocidad del audio.",
        )
        cross_fade_duration_slider = gr.Slider(
            label="Duración del Cross-Fade (s)",
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.01,
            info="Establece la duración del cross-fade entre clips de audio.",
        )
    audio_output = gr.Audio(label="Audio Sintetizado")
    spectrogram_output = gr.Image(label="Espectrograma")
    # Wire the button to infer(); input order must match infer's signature.
    generate_btn.click(
        infer,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            model_choice,
            remove_silence,
            cross_fade_duration_slider,
            speed_slider,
        ],
        outputs=[audio_output, spectrogram_output],
    )
# Top-level app: a tabbed interface wrapping the TTS tab.
with gr.Blocks() as app:
    gr.Markdown(
        """
# Spanish-F5
Esta es una interfaz web para F5 TTS, con un finetuning para poder hablar en castellano.
"""
    )
    gr.TabbedInterface(
        [app_tts],
        ["TTS"],
    )
# Launch with a request queue so concurrent users are serialized.
if __name__ == "__main__":
    app.queue().launch()