fix
Browse files- __pycache__/sentence_analyzer.cpython-310.pyc +0 -0
- app.py +57 -51
- logs/sentence_analyzer_2024-12-02.log +2 -0
- samples/country.flac +0 -0
- samples/main.flac +0 -0
- samples/story.toml +0 -19
- samples/story.txt +0 -1
- samples/town.flac +0 -0
__pycache__/sentence_analyzer.cpython-310.pyc
ADDED
Binary file (7.74 kB). View file
|
|
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
import nltk
|
3 |
nltk.download('punkt_tab')
|
4 |
from sentence_analyzer import SentenceAnalyzer
|
@@ -6,7 +5,6 @@ import re
|
|
6 |
import tempfile
|
7 |
from collections import OrderedDict
|
8 |
from importlib.resources import files
|
9 |
-
|
10 |
import click
|
11 |
import gradio as gr
|
12 |
import numpy as np
|
@@ -17,34 +15,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
17 |
|
18 |
try:
|
19 |
import spaces
|
20 |
-
|
21 |
USING_SPACES = True
|
22 |
except ImportError:
|
23 |
USING_SPACES = False
|
24 |
|
25 |
-
|
26 |
def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` when the ``spaces`` package is available.

    ``USING_SPACES`` is set by the guarded ``import spaces`` at module level.
    When that import failed, ``func`` is handed back unchanged so the same
    decorator can be applied unconditionally.
    """
    if not USING_SPACES:
        return func
    return spaces.GPU(func)
|
31 |
|
32 |
-
|
33 |
-
from f5_tts.
|
34 |
-
from f5_tts.infer.utils_infer import (
|
35 |
-
load_vocoder,
|
36 |
-
load_model,
|
37 |
-
preprocess_ref_audio_text,
|
38 |
-
infer_process,
|
39 |
-
remove_silence_for_generated_wav,
|
40 |
-
save_spectrogram,
|
41 |
-
)
|
42 |
-
|
43 |
-
# Carregar vocoder
|
44 |
-
vocoder = load_vocoder()
|
45 |
|
46 |
import os
|
47 |
from huggingface_hub import hf_hub_download
|
|
|
48 |
def load_f5tts():
|
49 |
# Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
|
50 |
repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
|
@@ -55,16 +41,32 @@ def load_f5tts():
|
|
55 |
raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
|
56 |
# Faz o download do modelo do repositório privado
|
57 |
ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
|
58 |
-
|
|
|
59 |
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
60 |
-
return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
|
61 |
|
62 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
F5TTS_ema_model = load_f5tts()
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
@gpu_decorator
|
66 |
def infer(
|
67 |
-
ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info
|
68 |
):
|
69 |
print(nfe)
|
70 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
@@ -80,6 +82,7 @@ def infer(
|
|
80 |
speed=speed,
|
81 |
show_info=show_info,
|
82 |
progress=gr.Progress(),
|
|
|
83 |
)
|
84 |
# Remover silêncios
|
85 |
if remove_silence:
|
@@ -92,7 +95,8 @@ def infer(
|
|
92 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
|
93 |
spectrogram_path = tmp_spectrogram.name
|
94 |
save_spectrogram(combined_spectrogram, spectrogram_path)
|
95 |
-
return (final_sample_rate, final_wave), spectrogram_path, ref_text
|
|
|
96 |
|
97 |
# Estilos CSS
|
98 |
custom_css = """
|
@@ -115,7 +119,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
115 |
with gr.Tabs():
|
116 |
with gr.Tab("TTS Básico"):
|
117 |
gr.Markdown("# TTS Básico com F5-TTS")
|
118 |
-
|
119 |
# Entradas básicas
|
120 |
ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
|
121 |
gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
|
@@ -142,7 +146,6 @@ with gr.Blocks(css=custom_css) as app:
|
|
142 |
step=0.1,
|
143 |
info="Ajuste a velocidade do áudio.",
|
144 |
)
|
145 |
-
|
146 |
cross_fade_duration_slider = gr.Slider(
|
147 |
label="Duração do Cross-fade (s)",
|
148 |
minimum=0.0,
|
@@ -167,9 +170,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
167 |
step=1,
|
168 |
info="Ajuste NFE Step.",
|
169 |
)
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
|
174 |
analyzer = SentenceAnalyzer()
|
175 |
|
@@ -183,20 +184,21 @@ with gr.Blocks(css=custom_css) as app:
|
|
183 |
speed_slider,
|
184 |
nfe_slider,
|
185 |
chunk_size_slider,
|
|
|
186 |
):
|
187 |
# Dividir o texto em sentenças
|
188 |
sentences = analyzer.split_into_sentences(gen_text_input)
|
189 |
-
|
190 |
# Agrupar sentenças em chunks
|
191 |
chunks = [
|
192 |
" ".join(sentences[i : i + chunk_size_slider])
|
193 |
for i in range(0, len(sentences), chunk_size_slider)
|
194 |
]
|
195 |
-
|
196 |
# Processar cada chunk
|
197 |
audio_segments = []
|
198 |
for chunk in chunks:
|
199 |
-
audio_out, spectrogram_path, ref_text_out = infer(
|
200 |
ref_audio_input,
|
201 |
ref_text_input, # Utiliza o Texto de Referência como está
|
202 |
chunk, # Processa o chunk atual
|
@@ -204,10 +206,11 @@ with gr.Blocks(css=custom_css) as app:
|
|
204 |
cross_fade_duration_slider,
|
205 |
speed_slider,
|
206 |
nfe_slider,
|
|
|
207 |
)
|
208 |
sr, audio_data = audio_out
|
209 |
audio_segments.append(audio_data)
|
210 |
-
|
211 |
# Concatenar os segmentos de áudio gerados
|
212 |
if audio_segments:
|
213 |
final_audio_data = np.concatenate(audio_segments)
|
@@ -215,16 +218,17 @@ with gr.Blocks(css=custom_css) as app:
|
|
215 |
(sr, final_audio_data), # Áudio final
|
216 |
spectrogram_path, # Espectrograma
|
217 |
gr.update(value=ref_text_out), # Nenhuma mudança no Texto de Referência
|
|
|
218 |
)
|
219 |
else:
|
220 |
gr.Warning("Nenhum áudio gerado.")
|
221 |
-
return None, None, gr.update()
|
222 |
|
223 |
-
|
224 |
# Saídas
|
225 |
gr.Markdown("### Resultados")
|
226 |
audio_output = gr.Audio(label="Áudio Sintetizado")
|
227 |
spectrogram_output = gr.Image(label="Espectrograma")
|
|
|
228 |
|
229 |
# Associação do botão `generate_btn` à função `process_chunks`
|
230 |
generate_btn.click(
|
@@ -238,34 +242,36 @@ with gr.Blocks(css=custom_css) as app:
|
|
238 |
speed_slider,
|
239 |
nfe_slider,
|
240 |
chunk_size_slider,
|
|
|
241 |
],
|
242 |
outputs=[
|
243 |
audio_output,
|
244 |
spectrogram_output,
|
245 |
ref_text_input, # Atualiza o texto de referência, se necessário
|
|
|
246 |
],
|
247 |
)
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
@click.
|
252 |
-
@click.option(
|
253 |
-
"--
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
|
|
|
|
264 |
|
265 |
if __name__ == "__main__":
|
266 |
if not USING_SPACES:
|
267 |
main()
|
268 |
else:
|
269 |
app.queue().launch()
|
270 |
-
|
271 |
-
|
|
|
|
|
1 |
import nltk
|
2 |
nltk.download('punkt_tab')
|
3 |
from sentence_analyzer import SentenceAnalyzer
|
|
|
5 |
import tempfile
|
6 |
from collections import OrderedDict
|
7 |
from importlib.resources import files
|
|
|
8 |
import click
|
9 |
import gradio as gr
|
10 |
import numpy as np
|
|
|
15 |
|
16 |
try:
|
17 |
import spaces
|
|
|
18 |
USING_SPACES = True
|
19 |
except ImportError:
|
20 |
USING_SPACES = False
|
21 |
|
|
|
22 |
def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` when the ``spaces`` package is available.

    ``USING_SPACES`` is set by the guarded ``import spaces`` at module level.
    When that import failed, ``func`` is handed back unchanged so the same
    decorator can be applied unconditionally.
    """
    if not USING_SPACES:
        return func
    return spaces.GPU(func)
|
27 |
|
28 |
+
# Importando a nova API F5TTS
|
29 |
+
from f5_tts.api import F5TTS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
import os
|
32 |
from huggingface_hub import hf_hub_download
|
33 |
+
|
34 |
def load_f5tts():
|
35 |
# Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
|
36 |
repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
|
|
|
41 |
raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
|
42 |
# Faz o download do modelo do repositório privado
|
43 |
ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
|
44 |
+
|
45 |
+
# Define as configurações do modelo (ajuste se necessário)
|
46 |
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
|
|
47 |
|
48 |
+
# Retorna a instância da API F5TTS
|
49 |
+
return F5TTS(
|
50 |
+
model_type="F5TTS_Base", # Ajuste o nome do modelo se necessário
|
51 |
+
ckpt_file=ckpt_path,
|
52 |
+
vocab_file=os.path.join(os.path.dirname(ckpt_path), "vocab.txt"), # Caminho para o arquivo vocab.txt
|
53 |
+
device="cuda" if torchaudio.cuda.is_available() else "cpu", # Define o dispositivo
|
54 |
+
use_ema=True
|
55 |
+
)
|
56 |
+
|
57 |
+
# Carregar modelo F5TTS usando a nova API
|
58 |
F5TTS_ema_model = load_f5tts()
|
59 |
|
60 |
+
# Variáveis globais para o cache
|
61 |
+
last_checkpoint = None
|
62 |
+
last_device = None
|
63 |
+
last_ema = None
|
64 |
+
tts_api = None
|
65 |
+
training_process = None # Adicione esta linha se necessário para o seu contexto
|
66 |
+
|
67 |
@gpu_decorator
|
68 |
def infer(
|
69 |
+
ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info, seed=-1
|
70 |
):
|
71 |
print(nfe)
|
72 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
|
|
82 |
speed=speed,
|
83 |
show_info=show_info,
|
84 |
progress=gr.Progress(),
|
85 |
+
seed=seed # Passando o seed para infer_process
|
86 |
)
|
87 |
# Remover silêncios
|
88 |
if remove_silence:
|
|
|
95 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
|
96 |
spectrogram_path = tmp_spectrogram.name
|
97 |
save_spectrogram(combined_spectrogram, spectrogram_path)
|
98 |
+
return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed # Retornando o seed
|
99 |
+
|
100 |
|
101 |
# Estilos CSS
|
102 |
custom_css = """
|
|
|
119 |
with gr.Tabs():
|
120 |
with gr.Tab("TTS Básico"):
|
121 |
gr.Markdown("# TTS Básico com F5-TTS")
|
122 |
+
|
123 |
# Entradas básicas
|
124 |
ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
|
125 |
gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
|
|
|
146 |
step=0.1,
|
147 |
info="Ajuste a velocidade do áudio.",
|
148 |
)
|
|
|
149 |
cross_fade_duration_slider = gr.Slider(
|
150 |
label="Duração do Cross-fade (s)",
|
151 |
minimum=0.0,
|
|
|
170 |
step=1,
|
171 |
info="Ajuste NFE Step.",
|
172 |
)
|
173 |
+
seed_input = gr.Number(label="Seed", value=-1, minimum=-1) # Seed na seção avançada
|
|
|
|
|
174 |
|
175 |
analyzer = SentenceAnalyzer()
|
176 |
|
|
|
184 |
speed_slider,
|
185 |
nfe_slider,
|
186 |
chunk_size_slider,
|
187 |
+
seed_input, # Passando o seed para process_chunks
|
188 |
):
|
189 |
# Dividir o texto em sentenças
|
190 |
sentences = analyzer.split_into_sentences(gen_text_input)
|
191 |
+
|
192 |
# Agrupar sentenças em chunks
|
193 |
chunks = [
|
194 |
" ".join(sentences[i : i + chunk_size_slider])
|
195 |
for i in range(0, len(sentences), chunk_size_slider)
|
196 |
]
|
197 |
+
|
198 |
# Processar cada chunk
|
199 |
audio_segments = []
|
200 |
for chunk in chunks:
|
201 |
+
audio_out, spectrogram_path, ref_text_out, seed_output = infer( # Recebendo o seed de infer
|
202 |
ref_audio_input,
|
203 |
ref_text_input, # Utiliza o Texto de Referência como está
|
204 |
chunk, # Processa o chunk atual
|
|
|
206 |
cross_fade_duration_slider,
|
207 |
speed_slider,
|
208 |
nfe_slider,
|
209 |
+
seed=seed_input, # Passando o seed para infer
|
210 |
)
|
211 |
sr, audio_data = audio_out
|
212 |
audio_segments.append(audio_data)
|
213 |
+
|
214 |
# Concatenar os segmentos de áudio gerados
|
215 |
if audio_segments:
|
216 |
final_audio_data = np.concatenate(audio_segments)
|
|
|
218 |
(sr, final_audio_data), # Áudio final
|
219 |
spectrogram_path, # Espectrograma
|
220 |
gr.update(value=ref_text_out), # Nenhuma mudança no Texto de Referência
|
221 |
+
seed_output # Retornando o seed
|
222 |
)
|
223 |
else:
|
224 |
gr.Warning("Nenhum áudio gerado.")
|
225 |
+
return None, None, gr.update(), None # Retornando None para o seed
|
226 |
|
|
|
227 |
# Saídas
|
228 |
gr.Markdown("### Resultados")
|
229 |
audio_output = gr.Audio(label="Áudio Sintetizado")
|
230 |
spectrogram_output = gr.Image(label="Espectrograma")
|
231 |
+
seed_output = gr.Text(label="Seed usada:") # Saída do Seed
|
232 |
|
233 |
# Associação do botão `generate_btn` à função `process_chunks`
|
234 |
generate_btn.click(
|
|
|
242 |
speed_slider,
|
243 |
nfe_slider,
|
244 |
chunk_size_slider,
|
245 |
+
seed_input, # Passando o seed como entrada
|
246 |
],
|
247 |
outputs=[
|
248 |
audio_output,
|
249 |
spectrogram_output,
|
250 |
ref_text_input, # Atualiza o texto de referência, se necessário
|
251 |
+
seed_output, # Saída do Seed
|
252 |
],
|
253 |
)
|
254 |
+
|
255 |
+
|
256 |
+
# Código para iniciar a aplicação Gradio
|
257 |
+
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option(
    # Declared as an on/off pair: a single-name flag that defaults to True
    # gives the user no way to turn API access off from the command line.
    "--api/--no-api",
    "-a",
    default=True,
    is_flag=True,
    help="Allow API access",
)
def main(port, host, share, api):
    """Launch the Gradio app from the command line.

    Args:
        port: Port passed to ``launch`` as ``server_port``; ``None`` leaves
            the choice to Gradio.
        host: Host passed to ``launch`` as ``server_name``; ``None`` leaves
            the choice to Gradio.
        share: Whether to request a Gradio share link.
        api: Whether API access is allowed; forwarded to both
            ``queue(api_open=...)`` and ``launch(show_api=...)``.
            Enabled by default, disable with ``--no-api``.
    """
    # `app` is only read here, so no `global` declaration is required.
    print("Starting app...")
    app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
|
272 |
|
273 |
if __name__ == "__main__":
|
274 |
if not USING_SPACES:
|
275 |
main()
|
276 |
else:
|
277 |
app.queue().launch()
|
|
|
|
logs/sentence_analyzer_2024-12-02.log
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
2024-12-02 18:27:53,692 - SentenceAnalyzer - DEBUG - Logger set up successfully
|
2 |
+
2024-12-02 18:27:53,692 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
|
samples/country.flac
DELETED
Binary file (180 kB)
|
|
samples/main.flac
DELETED
Binary file (279 kB)
|
|
samples/story.toml
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
# F5-TTS | E2-TTS
|
2 |
-
model = "F5-TTS"
|
3 |
-
ref_audio = "samples/main.flac"
|
4 |
-
# If an empty "", transcribes the reference audio automatically.
|
5 |
-
ref_text = ""
|
6 |
-
gen_text = ""
|
7 |
-
# File with text to generate. Ignores the text above.
|
8 |
-
gen_file = "samples/story.txt"
|
9 |
-
remove_silence = true
|
10 |
-
output_dir = "samples"
|
11 |
-
|
12 |
-
[voices.town]
|
13 |
-
ref_audio = "samples/town.flac"
|
14 |
-
ref_text = ""
|
15 |
-
|
16 |
-
[voices.country]
|
17 |
-
ref_audio = "samples/country.flac"
|
18 |
-
ref_text = ""
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
samples/story.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land.” [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] “Goodbye,” [main] said he, [country] “I’m off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace.”
|
|
|
|
samples/town.flac
DELETED
Binary file (229 kB)
|
|