M4xjunior committed on
Commit
c6b897d
·
1 Parent(s): 30b1a43
Files changed (2) hide show
  1. app.py +52 -48
  2. logs/sentence_analyzer_2024-12-02.log +10 -0
app.py CHANGED
@@ -28,7 +28,7 @@ def gpu_decorator(func):
28
 
29
  # Importando a nova API F5TTS
30
  from f5_tts.api import F5TTS
31
- from f5_tts.infer.utils_infer import preprocess_ref_audio_text
32
 
33
  import os
34
  from huggingface_hub import hf_hub_download
@@ -68,37 +68,39 @@ training_process = None # Adicione esta linha se necessário para o seu context
68
 
69
  @gpu_decorator
70
  def infer(
71
- ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info, seed=-1
72
  ):
73
- print(nfe)
74
- ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
75
- ema_model = F5TTS_ema_model
76
- final_wave, final_sample_rate, combined_spectrogram = infer_process(
77
- ref_audio,
78
- ref_text.lower().strip(),
79
- gen_text.lower().strip(),
80
- ema_model,
81
- vocoder,
82
- cross_fade_duration=cross_fade_duration,
83
- nfe_step=nfe,
84
- speed=speed,
85
- show_info=show_info,
86
- progress=gr.Progress(),
87
- seed=seed # Passando o seed para infer_process
88
- )
89
- # Remover silêncios
90
- if remove_silence:
91
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
92
- sf.write(f.name, final_wave, final_sample_rate)
93
- remove_silence_for_generated_wav(f.name)
94
- final_wave, _ = torchaudio.load(f.name)
95
- final_wave = final_wave.squeeze().cpu().numpy()
96
- # Salvar espectrograma
97
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
98
- spectrogram_path = tmp_spectrogram.name
99
- save_spectrogram(combined_spectrogram, spectrogram_path)
100
- return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed # Retornando o seed
101
-
 
 
102
 
103
  # Estilos CSS
104
  custom_css = """
@@ -200,27 +202,31 @@ with gr.Blocks(css=custom_css) as app:
200
  # Processar cada chunk
201
  audio_segments = []
202
  for chunk in chunks:
203
- audio_out, spectrogram_path, ref_text_out, seed_output = infer( # Recebendo o seed de infer
 
 
 
 
 
204
  ref_audio_input,
205
- ref_text_input, # Utiliza o Texto de Referência como está
206
- chunk, # Processa o chunk atual
207
- remove_silence,
208
- cross_fade_duration_slider,
209
- speed_slider,
210
  nfe_slider,
211
- seed=seed_input, # Passando o seed para infer
 
 
 
212
  )
213
- sr, audio_data = audio_out
214
- audio_segments.append(audio_data)
215
 
216
  # Concatenar os segmentos de áudio gerados
217
  if audio_segments:
218
  final_audio_data = np.concatenate(audio_segments)
219
  return (
220
- (sr, final_audio_data), # Áudio final
221
- spectrogram_path, # Espectrograma
222
- gr.update(value=ref_text_out), # Nenhuma mudança no Texto de Referência
223
- seed_output # Retornando o seed
224
  )
225
  else:
226
  gr.Warning("Nenhum áudio gerado.")
@@ -229,7 +235,6 @@ with gr.Blocks(css=custom_css) as app:
229
  # Saídas
230
  gr.Markdown("### Resultados")
231
  audio_output = gr.Audio(label="Áudio Sintetizado")
232
- spectrogram_output = gr.Image(label="Espectrograma")
233
  seed_output = gr.Text(label="Seed usada:") # Saída do Seed
234
 
235
  # Associação do botão `generate_btn` à função `process_chunks`
@@ -248,14 +253,13 @@ with gr.Blocks(css=custom_css) as app:
248
  ],
249
  outputs=[
250
  audio_output,
251
- spectrogram_output,
252
  ref_text_input, # Atualiza o texto de referência, se necessário
253
  seed_output, # Saída do Seed
254
  ],
255
  )
256
 
257
-
258
- # Código para iniciar a aplicação Gradio
259
  @click.command()
260
  @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
261
  @click.option("--host", "-H", default=None, help="Host to run the app on")
 
28
 
29
  # Importando a nova API F5TTS
30
  from f5_tts.api import F5TTS
31
+ from f5_tts.infer.utils_infer import preprocess_ref_audio_text
32
 
33
  import os
34
  from huggingface_hub import hf_hub_download
 
68
 
69
  @gpu_decorator
70
  def infer(
71
+ project, file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step, use_ema, speed, seed, remove_silence
72
  ):
73
+ global last_checkpoint, last_device, tts_api, last_ema
74
+ if not os.path.isfile(file_checkpoint):
75
+ return None, "checkpoint not found!"
76
+ if training_process is not None:
77
+ device_test = "cpu"
78
+ else:
79
+ device_test = None
80
+ if last_checkpoint != file_checkpoint or last_device != device_test or last_ema != use_ema or tts_api is None:
81
+ if last_checkpoint != file_checkpoint:
82
+ last_checkpoint = file_checkpoint
83
+ if last_device != device_test:
84
+ last_device = device_test
85
+ if last_ema != use_ema:
86
+ last_ema = use_ema
87
+ vocab_file = "/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt"
88
+ tts_api = F5TTS(
89
+ model_type=exp_name, ckpt_file=file_checkpoint, vocab_file=vocab_file, device=device_test, use_ema=use_ema
90
+ )
91
+ print("update >> ", device_test, file_checkpoint, use_ema)
92
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
93
+ tts_api.infer(
94
+ gen_text=gen_text.lower().strip(),
95
+ ref_text=ref_text.lower().strip(),
96
+ ref_file=ref_audio,
97
+ nfe_step=nfe_step,
98
+ file_wave=f.name,
99
+ speed=speed,
100
+ seed=seed,
101
+ remove_silence=remove_silence,
102
+ )
103
+ return f.name, tts_api.device, str(tts_api.seed)
104
 
105
  # Estilos CSS
106
  custom_css = """
 
202
  # Processar cada chunk
203
  audio_segments = []
204
  for chunk in chunks:
205
+ # Usando a função infer correta aqui
206
+ audio_file, device_used, seed_used = infer(
207
+ "Emilia_ZH_EN_pinyin", # Substitua pelo nome do seu projeto
208
+ "/home/user/app/model_1200000.safetensors", # Substitua pelo caminho do seu checkpoint
209
+ "F5-TTS", # Ou "E2-TTS" dependendo do seu modelo
210
+ ref_text_input,
211
  ref_audio_input,
212
+ chunk,
 
 
 
 
213
  nfe_slider,
214
+ True, # use_ema - ajuste se necessário
215
+ speed_slider,
216
+ seed_input,
217
+ remove_silence,
218
  )
219
+ audio_data, _ = torchaudio.load(audio_file)
220
+ audio_segments.append(audio_data.squeeze().cpu().numpy())
221
 
222
  # Concatenar os segmentos de áudio gerados
223
  if audio_segments:
224
  final_audio_data = np.concatenate(audio_segments)
225
  return (
226
+ (24000, final_audio_data), # Áudio final - assumindo taxa de amostragem de 24000
227
+ None, # Espectrograma - não estamos gerando um espectrograma aqui
228
+ gr.update(value=ref_text_input), # Nenhuma mudança no Texto de Referência
229
+ seed_used # Retornando o seed
230
  )
231
  else:
232
  gr.Warning("Nenhum áudio gerado.")
 
235
  # Saídas
236
  gr.Markdown("### Resultados")
237
  audio_output = gr.Audio(label="Áudio Sintetizado")
 
238
  seed_output = gr.Text(label="Seed usada:") # Saída do Seed
239
 
240
  # Associação do botão `generate_btn` à função `process_chunks`
 
253
  ],
254
  outputs=[
255
  audio_output,
256
+ None, # Não estamos usando a saída do espectrograma
257
  ref_text_input, # Atualiza o texto de referência, se necessário
258
  seed_output, # Saída do Seed
259
  ],
260
  )
261
 
262
+ # Código para iniciar a aplicação Gradio
 
263
  @click.command()
264
  @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
265
  @click.option("--host", "-H", default=None, help="Host to run the app on")
logs/sentence_analyzer_2024-12-02.log CHANGED
@@ -10,3 +10,13 @@
10
  2024-12-02 19:03:00,745 - SentenceAnalyzer - DEBUG - Normalized whitespace
11
  2024-12-02 19:03:00,777 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
12
  2024-12-02 19:03:00,778 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
 
 
 
 
 
 
 
 
 
 
 
10
  2024-12-02 19:03:00,745 - SentenceAnalyzer - DEBUG - Normalized whitespace
11
  2024-12-02 19:03:00,777 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
12
  2024-12-02 19:03:00,778 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
13
+ 2024-12-02 19:07:26,868 - SentenceAnalyzer - DEBUG - Logger set up successfully
14
+ 2024-12-02 19:07:26,868 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
15
+ 2024-12-02 19:14:34,656 - SentenceAnalyzer - DEBUG - Starting sentence splitting
16
+ 2024-12-02 19:14:34,656 - SentenceAnalyzer - DEBUG - Normalized text using NFC
17
+ 2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Removed page numbers and chapter titles
18
+ 2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Replaced hyphenated line breaks
19
+ 2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Replaced multiple newlines with a space
20
+ 2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Normalized whitespace
21
+ 2024-12-02 19:14:34,682 - SentenceAnalyzer - DEBUG - Split text into 2 sentences using NLTK
22
+ 2024-12-02 19:14:34,682 - SentenceAnalyzer - INFO - Split text into 2 sentences after cleanup