Blakus committed on
Commit b8acae3 · verified · 1 Parent(s): b2be53b

Update app.py

Files changed (1)
  1. app.py +90 -40
app.py CHANGED
@@ -1,77 +1,127 @@
 import gradio as gr
 from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
-import os
 from huggingface_hub import hf_hub_download

-# Path configuration and model download
 repo_id = "Blakus/Pedro_Lab_XTTS"
 local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
 os.makedirs(local_dir, exist_ok=True)
-
 files_to_download = ["config.json", "model.pth", "vocab.json"]
 for file_name in files_to_download:
     hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)

 config_path = os.path.join(local_dir, "config.json")
 checkpoint_path = os.path.join(local_dir, "model.pth")
 vocab_path = os.path.join(local_dir, "vocab.json")

-# Load the XTTS model
 config = XttsConfig()
 config.load_json(config_path)
 model = Xtts.init_from_config(config)
 model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_path, eval=True, use_deepspeed=False)

-def sintetizar_voz(texto, idioma, audio_referencia, usar_microfono, audio_microfono):
-    if usar_microfono:
-        audio_entrada = audio_microfono
-    else:
-        audio_entrada = audio_referencia
-
-    # Voice synthesis using the XTTS model
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=audio_entrada, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
-    out = model.inference(
-        texto,
-        language=idioma,
-        gpt_cond_latent=gpt_cond_latent,
-        speaker_embedding=speaker_embedding,
-        repetition_penalty=5.0,
-        temperature=0.75,
-    )
-
-    # Save the generated audio
-    output_path = "output.wav"
-    model.save_wav(wav=out["wav"], path=output_path)
-
-    return output_path, f"Tiempo de generación: {out['inference_time']:.2f} segundos"

 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown("# Sintetizador de Voz XTTS")

     with gr.Row():
         with gr.Column():
-            texto_entrada = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
-            idioma = gr.Dropdown(label="Idioma", choices=config.languages, value="es")
-            audio_referencia = gr.Audio(label="Audio de referencia", type="filepath")
-            usar_microfono = gr.Checkbox(label="Usar micrófono")
-            audio_microfono = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)

-            usar_microfono.change(fn=lambda x: gr.update(visible=x), inputs=[usar_microfono], outputs=[audio_microfono])

-            boton_sintetizar = gr.Button("Sintetizar")

         with gr.Column():
-            audio_salida = gr.Audio(label="Audio sintetizado")
             waveform = gr.Image(label="Forma de onda")
-            metricas = gr.Textbox(label="Métricas")

-    boton_sintetizar.click(
-        sintetizar_voz,
-        inputs=[texto_entrada, idioma, audio_referencia, usar_microfono, audio_microfono],
-        outputs=[audio_salida, metricas]
     )

-demo.launch()
 
+import sys
+import io, os, stat
+import subprocess
+import random
+from zipfile import ZipFile
+import uuid
+import time
+import torch
+import torchaudio
+
+# Keep the MeCab download
+os.system('python -m unidic download')
+
+# Keep the CPML agreement
+os.environ["COQUI_TOS_AGREED"] = "1"
+
+import langid
+import base64
+import csv
+from io import StringIO
+import datetime
+import re
+
 import gradio as gr
+from scipy.io.wavfile import write
+from pydub import AudioSegment
+
 from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
+
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
 from huggingface_hub import hf_hub_download
+import os
+from TTS.utils.manage import get_user_data_dir
 
+# Keep the authentication and model download
 repo_id = "Blakus/Pedro_Lab_XTTS"
 local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
 os.makedirs(local_dir, exist_ok=True)
 files_to_download = ["config.json", "model.pth", "vocab.json"]
 for file_name in files_to_download:
+    print(f"Downloading {file_name} from {repo_id}")
+    local_file_path = os.path.join(local_dir, file_name)
     hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)

+# Load configuration and model
 config_path = os.path.join(local_dir, "config.json")
 checkpoint_path = os.path.join(local_dir, "model.pth")
 vocab_path = os.path.join(local_dir, "vocab.json")

 config = XttsConfig()
 config.load_json(config_path)
+
 model = Xtts.init_from_config(config)
 model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_path, eval=True, use_deepspeed=False)

+print("Modelo cargado en CPU")
+
+# Keep global variables and helper functions
+DEVICE_ASSERT_DETECTED = 0
+DEVICE_ASSERT_PROMPT = None
+DEVICE_ASSERT_LANG = None
+supported_languages = config.languages
+
+def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
+    try:
+        if use_mic:
+            speaker_wav = mic_file_path
+        else:
+            speaker_wav = audio_file_pth
+
+        if len(prompt) < 2 or len(prompt) > 200:
+            return None, None, "El texto debe tener entre 2 y 200 caracteres."
+
+        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
+
+        # Xtts.inference does not report its own runtime, so measure it here
+        start_time = time.time()
+        out = model.inference(
+            prompt,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+            repetition_penalty=5.0,
+            temperature=0.75,
+        )
+        inference_time = time.time() - start_time
+
+        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+
+        metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
+        metrics_text += f"Factor de tiempo real: {inference_time / (len(out['wav']) / 24000):.2f}"
+
+        return gr.make_waveform("output.wav"), "output.wav", metrics_text
+
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        return None, None, f"Error: {str(e)}"
 
+# Simplified Gradio interface
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown("# Sintetizador de Voz XTTS")

     with gr.Row():
         with gr.Column():
+            input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
+            language = gr.Dropdown(label="Idioma", choices=supported_languages, value="es")
+            audio_file = gr.Audio(label="Audio de referencia", type="filepath")
+            use_mic = gr.Checkbox(label="Usar micrófono")
+            mic_file = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)

+            use_mic.change(fn=lambda x: gr.update(visible=x), inputs=[use_mic], outputs=[mic_file])

+            generate_button = gr.Button("Generar voz")

         with gr.Column():
+            output_audio = gr.Audio(label="Audio generado")
             waveform = gr.Image(label="Forma de onda")
+            metrics = gr.Textbox(label="Métricas")

+    generate_button.click(
+        predict,
+        inputs=[input_text, language, audio_file, mic_file, use_mic],
+        outputs=[waveform, output_audio, metrics]
     )

+demo.launch(debug=True)
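
For a quick sanity check outside the browser, the new predict function can be exercised directly from Python once everything above demo.launch has run. A minimal sketch, assuming a short reference clip saved as reference.wav next to app.py; the file name and the example prompt are hypothetical and not part of this commit:

    # Hypothetical smoke test; "reference.wav" is an assumed local file,
    # not something shipped with this repo.
    video, wav_path, metrics_text = predict(
        prompt="Hola, esto es una prueba.",  # must be 2-200 characters
        language="es",                       # any code in config.languages
        audio_file_pth="reference.wav",      # reference speaker audio
        mic_file_path=None,                  # ignored when use_mic is False
        use_mic=False,
    )
    print(wav_path, metrics_text)

On success this writes output.wav to the working directory and returns the same waveform video, audio path, and metrics text that the Gradio outputs receive.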