M4xjunior committed on
Commit
9bf0190
·
1 Parent(s): ebbe300
__pycache__/sentence_analyzer.cpython-310.pyc ADDED
Binary file (7.74 kB). View file
 
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import nltk
3
  nltk.download('punkt_tab')
4
  from sentence_analyzer import SentenceAnalyzer
@@ -6,7 +5,6 @@ import re
6
  import tempfile
7
  from collections import OrderedDict
8
  from importlib.resources import files
9
-
10
  import click
11
  import gradio as gr
12
  import numpy as np
@@ -17,34 +15,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
17
 
18
  try:
19
  import spaces
20
-
21
  USING_SPACES = True
22
  except ImportError:
23
  USING_SPACES = False
24
 
25
-
26
  def gpu_decorator(func):
27
  if USING_SPACES:
28
  return spaces.GPU(func)
29
  else:
30
  return func
31
 
32
-
33
- from f5_tts.model import DiT, UNetT
34
- from f5_tts.infer.utils_infer import (
35
- load_vocoder,
36
- load_model,
37
- preprocess_ref_audio_text,
38
- infer_process,
39
- remove_silence_for_generated_wav,
40
- save_spectrogram,
41
- )
42
-
43
- # Carregar vocoder
44
- vocoder = load_vocoder()
45
 
46
  import os
47
  from huggingface_hub import hf_hub_download
 
48
  def load_f5tts():
49
  # Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
50
  repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
@@ -55,16 +41,32 @@ def load_f5tts():
55
  raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
56
  # Faz o download do modelo do repositório privado
57
  ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
58
-
 
59
  F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
60
- return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
61
 
62
- # Carregar modelo F5TTS
 
 
 
 
 
 
 
 
 
63
  F5TTS_ema_model = load_f5tts()
64
 
 
 
 
 
 
 
 
65
  @gpu_decorator
66
  def infer(
67
- ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info
68
  ):
69
  print(nfe)
70
  ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
@@ -80,6 +82,7 @@ def infer(
80
  speed=speed,
81
  show_info=show_info,
82
  progress=gr.Progress(),
 
83
  )
84
  # Remover silêncios
85
  if remove_silence:
@@ -92,7 +95,8 @@ def infer(
92
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
93
  spectrogram_path = tmp_spectrogram.name
94
  save_spectrogram(combined_spectrogram, spectrogram_path)
95
- return (final_sample_rate, final_wave), spectrogram_path, ref_text
 
96
 
97
  # Estilos CSS
98
  custom_css = """
@@ -115,7 +119,7 @@ with gr.Blocks(css=custom_css) as app:
115
  with gr.Tabs():
116
  with gr.Tab("TTS Básico"):
117
  gr.Markdown("# TTS Básico com F5-TTS")
118
-
119
  # Entradas básicas
120
  ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
121
  gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
@@ -142,7 +146,6 @@ with gr.Blocks(css=custom_css) as app:
142
  step=0.1,
143
  info="Ajuste a velocidade do áudio.",
144
  )
145
-
146
  cross_fade_duration_slider = gr.Slider(
147
  label="Duração do Cross-fade (s)",
148
  minimum=0.0,
@@ -167,9 +170,7 @@ with gr.Blocks(css=custom_css) as app:
167
  step=1,
168
  info="Ajuste NFE Step.",
169
  )
170
-
171
-
172
-
173
 
174
  analyzer = SentenceAnalyzer()
175
 
@@ -183,20 +184,21 @@ with gr.Blocks(css=custom_css) as app:
183
  speed_slider,
184
  nfe_slider,
185
  chunk_size_slider,
 
186
  ):
187
  # Dividir o texto em sentenças
188
  sentences = analyzer.split_into_sentences(gen_text_input)
189
-
190
  # Agrupar sentenças em chunks
191
  chunks = [
192
  " ".join(sentences[i : i + chunk_size_slider])
193
  for i in range(0, len(sentences), chunk_size_slider)
194
  ]
195
-
196
  # Processar cada chunk
197
  audio_segments = []
198
  for chunk in chunks:
199
- audio_out, spectrogram_path, ref_text_out = infer(
200
  ref_audio_input,
201
  ref_text_input, # Utiliza o Texto de Referência como está
202
  chunk, # Processa o chunk atual
@@ -204,10 +206,11 @@ with gr.Blocks(css=custom_css) as app:
204
  cross_fade_duration_slider,
205
  speed_slider,
206
  nfe_slider,
 
207
  )
208
  sr, audio_data = audio_out
209
  audio_segments.append(audio_data)
210
-
211
  # Concatenar os segmentos de áudio gerados
212
  if audio_segments:
213
  final_audio_data = np.concatenate(audio_segments)
@@ -215,16 +218,17 @@ with gr.Blocks(css=custom_css) as app:
215
  (sr, final_audio_data), # Áudio final
216
  spectrogram_path, # Espectrograma
217
  gr.update(value=ref_text_out), # Nenhuma mudança no Texto de Referência
 
218
  )
219
  else:
220
  gr.Warning("Nenhum áudio gerado.")
221
- return None, None, gr.update()
222
 
223
-
224
  # Saídas
225
  gr.Markdown("### Resultados")
226
  audio_output = gr.Audio(label="Áudio Sintetizado")
227
  spectrogram_output = gr.Image(label="Espectrograma")
 
228
 
229
  # Associação do botão `generate_btn` à função `process_chunks`
230
  generate_btn.click(
@@ -238,34 +242,36 @@ with gr.Blocks(css=custom_css) as app:
238
  speed_slider,
239
  nfe_slider,
240
  chunk_size_slider,
 
241
  ],
242
  outputs=[
243
  audio_output,
244
  spectrogram_output,
245
  ref_text_input, # Atualiza o texto de referência, se necessário
 
246
  ],
247
  )
248
-
249
- @click.command()
250
- @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
251
- @click.option("--host", "-H", default=None, help="Host to run the app on")
252
- @click.option(
253
- "--share",
254
- "-s",
255
- default=False,
256
- is_flag=True,
257
- help="Share the app via Gradio share link",
258
- )
259
- @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
260
- def main(port, host, share, api):
261
- global app
262
- print("Starting app...")
263
- app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
 
 
264
 
265
  if __name__ == "__main__":
266
  if not USING_SPACES:
267
  main()
268
  else:
269
  app.queue().launch()
270
-
271
-
 
 
1
  import nltk
2
  nltk.download('punkt_tab')
3
  from sentence_analyzer import SentenceAnalyzer
 
5
  import tempfile
6
  from collections import OrderedDict
7
  from importlib.resources import files
 
8
  import click
9
  import gradio as gr
10
  import numpy as np
 
15
 
16
  try:
17
  import spaces
 
18
  USING_SPACES = True
19
  except ImportError:
20
  USING_SPACES = False
21
 
 
22
  def gpu_decorator(func):
23
  if USING_SPACES:
24
  return spaces.GPU(func)
25
  else:
26
  return func
27
 
28
+ # Importando a nova API F5TTS
29
+ from f5_tts.api import F5TTS
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  import os
32
  from huggingface_hub import hf_hub_download
33
+
34
  def load_f5tts():
35
  # Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
36
  repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
 
41
  raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
42
  # Faz o download do modelo do repositório privado
43
  ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
44
+
45
+ # Define as configurações do modelo (ajuste se necessário)
46
  F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
 
47
 
48
+ # Retorna a instância da API F5TTS
49
+ return F5TTS(
50
+ model_type="F5TTS_Base", # Ajuste o nome do modelo se necessário
51
+ ckpt_file=ckpt_path,
52
+ vocab_file=os.path.join(os.path.dirname(ckpt_path), "vocab.txt"), # Caminho para o arquivo vocab.txt
53
+ device="cuda" if torchaudio.cuda.is_available() else "cpu", # Define o dispositivo
54
+ use_ema=True
55
+ )
56
+
57
+ # Carregar modelo F5TTS usando a nova API
58
  F5TTS_ema_model = load_f5tts()
59
 
60
+ # Variáveis globais para o cache
61
+ last_checkpoint = None
62
+ last_device = None
63
+ last_ema = None
64
+ tts_api = None
65
+ training_process = None # Adicione esta linha se necessário para o seu contexto
66
+
67
  @gpu_decorator
68
  def infer(
69
+ ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info, seed=-1
70
  ):
71
  print(nfe)
72
  ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
 
82
  speed=speed,
83
  show_info=show_info,
84
  progress=gr.Progress(),
85
+ seed=seed # Passando o seed para infer_process
86
  )
87
  # Remover silêncios
88
  if remove_silence:
 
95
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
96
  spectrogram_path = tmp_spectrogram.name
97
  save_spectrogram(combined_spectrogram, spectrogram_path)
98
+ return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed # Retornando o seed
99
+
100
 
101
  # Estilos CSS
102
  custom_css = """
 
119
  with gr.Tabs():
120
  with gr.Tab("TTS Básico"):
121
  gr.Markdown("# TTS Básico com F5-TTS")
122
+
123
  # Entradas básicas
124
  ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
125
  gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
 
146
  step=0.1,
147
  info="Ajuste a velocidade do áudio.",
148
  )
 
149
  cross_fade_duration_slider = gr.Slider(
150
  label="Duração do Cross-fade (s)",
151
  minimum=0.0,
 
170
  step=1,
171
  info="Ajuste NFE Step.",
172
  )
173
+ seed_input = gr.Number(label="Seed", value=-1, minimum=-1) # Seed na seção avançada
 
 
174
 
175
  analyzer = SentenceAnalyzer()
176
 
 
184
  speed_slider,
185
  nfe_slider,
186
  chunk_size_slider,
187
+ seed_input, # Passando o seed para process_chunks
188
  ):
189
  # Dividir o texto em sentenças
190
  sentences = analyzer.split_into_sentences(gen_text_input)
191
+
192
  # Agrupar sentenças em chunks
193
  chunks = [
194
  " ".join(sentences[i : i + chunk_size_slider])
195
  for i in range(0, len(sentences), chunk_size_slider)
196
  ]
197
+
198
  # Processar cada chunk
199
  audio_segments = []
200
  for chunk in chunks:
201
+ audio_out, spectrogram_path, ref_text_out, seed_output = infer( # Recebendo o seed de infer
202
  ref_audio_input,
203
  ref_text_input, # Utiliza o Texto de Referência como está
204
  chunk, # Processa o chunk atual
 
206
  cross_fade_duration_slider,
207
  speed_slider,
208
  nfe_slider,
209
+ seed=seed_input, # Passando o seed para infer
210
  )
211
  sr, audio_data = audio_out
212
  audio_segments.append(audio_data)
213
+
214
  # Concatenar os segmentos de áudio gerados
215
  if audio_segments:
216
  final_audio_data = np.concatenate(audio_segments)
 
218
  (sr, final_audio_data), # Áudio final
219
  spectrogram_path, # Espectrograma
220
  gr.update(value=ref_text_out), # Nenhuma mudança no Texto de Referência
221
+ seed_output # Retornando o seed
222
  )
223
  else:
224
  gr.Warning("Nenhum áudio gerado.")
225
+ return None, None, gr.update(), None # Retornando None para o seed
226
 
 
227
  # Saídas
228
  gr.Markdown("### Resultados")
229
  audio_output = gr.Audio(label="Áudio Sintetizado")
230
  spectrogram_output = gr.Image(label="Espectrograma")
231
+ seed_output = gr.Text(label="Seed usada:") # Saída do Seed
232
 
233
  # Associação do botão `generate_btn` à função `process_chunks`
234
  generate_btn.click(
 
242
  speed_slider,
243
  nfe_slider,
244
  chunk_size_slider,
245
+ seed_input, # Passando o seed como entrada
246
  ],
247
  outputs=[
248
  audio_output,
249
  spectrogram_output,
250
  ref_text_input, # Atualiza o texto de referência, se necessário
251
+ seed_output, # Saída do Seed
252
  ],
253
  )
254
+
255
+
256
+ # Código para iniciar a aplicação Gradio
257
+ @click.command()
258
+ @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
259
+ @click.option("--host", "-H", default=None, help="Host to run the app on")
260
+ @click.option(
261
+ "--share",
262
+ "-s",
263
+ default=False,
264
+ is_flag=True,
265
+ help="Share the app via Gradio share link",
266
+ )
267
+ @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
268
+ def main(port, host, share, api):
269
+ global app
270
+ print("Starting app...")
271
+ app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
272
 
273
  if __name__ == "__main__":
274
  if not USING_SPACES:
275
  main()
276
  else:
277
  app.queue().launch()
 
 
logs/sentence_analyzer_2024-12-02.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2024-12-02 18:27:53,692 - SentenceAnalyzer - DEBUG - Logger set up successfully
2
+ 2024-12-02 18:27:53,692 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
samples/country.flac DELETED
Binary file (180 kB)
 
samples/main.flac DELETED
Binary file (279 kB)
 
samples/story.toml DELETED
@@ -1,19 +0,0 @@
1
- # F5-TTS | E2-TTS
2
- model = "F5-TTS"
3
- ref_audio = "samples/main.flac"
4
- # If an empty "", transcribes the reference audio automatically.
5
- ref_text = ""
6
- gen_text = ""
7
- # File with text to generate. Ignores the text above.
8
- gen_file = "samples/story.txt"
9
- remove_silence = true
10
- output_dir = "samples"
11
-
12
- [voices.town]
13
- ref_audio = "samples/town.flac"
14
- ref_text = ""
15
-
16
- [voices.country]
17
- ref_audio = "samples/country.flac"
18
- ref_text = ""
19
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
samples/story.txt DELETED
@@ -1 +0,0 @@
1
- A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land.” [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] “Goodbye,” [main] said he, [country] “I’m off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace.”
 
 
samples/town.flac DELETED
Binary file (229 kB)