Spaces:

M4xjunior
/

locseed

Running

App Files Community

M4xjunior committed on Dec 2, 2024

Commit

c6b897d

1 Parent(s): 30b1a43

fix

Browse files

Files changed (2) hide show

app.py +52 -48
logs/sentence_analyzer_2024-12-02.log +10 -0

app.py CHANGED Viewed

@@ -28,7 +28,7 @@ def gpu_decorator(func):
 # Importando a nova API F5TTS
 from f5_tts.api import F5TTS
-from f5_tts.infer.utils_infer import preprocess_ref_audio_text
 import os
 from huggingface_hub import hf_hub_download
@@ -68,37 +68,39 @@ training_process = None  # Adicione esta linha se necessário para o seu context
 @gpu_decorator
 def infer(
-    ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info, seed=-1
 ):
-    print(nfe)
-    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
-    ema_model = F5TTS_ema_model
-    final_wave, final_sample_rate, combined_spectrogram = infer_process(
-        ref_audio,
-        ref_text.lower().strip(),
-        gen_text.lower().strip(),
-        ema_model,
-        vocoder,
-        cross_fade_duration=cross_fade_duration,
-        nfe_step=nfe,
-        speed=speed,
-        show_info=show_info,
-        progress=gr.Progress(),
-        seed=seed  # Passando o seed para infer_process
-    )
-    # Remover silêncios
-    if remove_silence:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            sf.write(f.name, final_wave, final_sample_rate)
-            remove_silence_for_generated_wav(f.name)
-            final_wave, _ = torchaudio.load(f.name)
-        final_wave = final_wave.squeeze().cpu().numpy()
-    # Salvar espectrograma
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-        spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(combined_spectrogram, spectrogram_path)
-    return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed # Retornando o seed
 # Estilos CSS
 custom_css = """
@@ -200,27 +202,31 @@ with gr.Blocks(css=custom_css) as app:
                 # Processar cada chunk
                 audio_segments = []
                 for chunk in chunks:
-                    audio_out, spectrogram_path, ref_text_out, seed_output = infer(  # Recebendo o seed de infer
                         ref_audio_input,
-                        ref_text_input,  # Utiliza o Texto de Referência como está
-                        chunk,  # Processa o chunk atual
-                        remove_silence,
-                        cross_fade_duration_slider,
-                        speed_slider,
                         nfe_slider,
-                        seed=seed_input,  # Passando o seed para infer
                     )
-                    sr, audio_data = audio_out
-                    audio_segments.append(audio_data)
                 # Concatenar os segmentos de áudio gerados
                 if audio_segments:
                     final_audio_data = np.concatenate(audio_segments)
                     return (
-                        (sr, final_audio_data),  # Áudio final
-                        spectrogram_path,  # Espectrograma
-                        gr.update(value=ref_text_out),  # Nenhuma mudança no Texto de Referência
-                        seed_output  # Retornando o seed
                     )
                 else:
                     gr.Warning("Nenhum áudio gerado.")
@@ -229,7 +235,6 @@ with gr.Blocks(css=custom_css) as app:
             # Saídas
             gr.Markdown("### Resultados")
             audio_output = gr.Audio(label="Áudio Sintetizado")
-            spectrogram_output = gr.Image(label="Espectrograma")
             seed_output = gr.Text(label="Seed usada:")  # Saída do Seed
             # Associação do botão `generate_btn` à função `process_chunks`
@@ -248,14 +253,13 @@ with gr.Blocks(css=custom_css) as app:
                 ],
                 outputs=[
                     audio_output,
-                    spectrogram_output,
                     ref_text_input,  # Atualiza o texto de referência, se necessário
                     seed_output,  # Saída do Seed
                 ],
             )
-    # Código para iniciar a aplicação Gradio
     @click.command()
     @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
     @click.option("--host", "-H", default=None, help="Host to run the app on")

 # Importando a nova API F5TTS
 from f5_tts.api import F5TTS
+from f5_tts.infer.utils_infer import preprocess_ref_audio_text
 import os
 from huggingface_hub import hf_hub_download
 @gpu_decorator
 def infer(
+    project, file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step, use_ema, speed, seed, remove_silence
 ):
+    global last_checkpoint, last_device, tts_api, last_ema
+    if not os.path.isfile(file_checkpoint):
+        return None, "checkpoint not found!"
+    if training_process is not None:
+        device_test = "cpu"
+    else:
+        device_test = None
+    if last_checkpoint != file_checkpoint or last_device != device_test or last_ema != use_ema or tts_api is None:
+        if last_checkpoint != file_checkpoint:
+            last_checkpoint = file_checkpoint
+        if last_device != device_test:
+            last_device = device_test
+        if last_ema != use_ema:
+            last_ema = use_ema
+        vocab_file = "/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt"
+        tts_api = F5TTS(
+            model_type=exp_name, ckpt_file=file_checkpoint, vocab_file=vocab_file, device=device_test, use_ema=use_ema
+        )
+        print("update >> ", device_test, file_checkpoint, use_ema)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        tts_api.infer(
+            gen_text=gen_text.lower().strip(),
+            ref_text=ref_text.lower().strip(),
+            ref_file=ref_audio,
+            nfe_step=nfe_step,
+            file_wave=f.name,
+            speed=speed,
+            seed=seed,
+            remove_silence=remove_silence,
+        )
+        return f.name, tts_api.device, str(tts_api.seed)
 # Estilos CSS
 custom_css = """
                 # Processar cada chunk
                 audio_segments = []
                 for chunk in chunks:
+                    # Usando a função infer correta aqui
+                    audio_file, device_used, seed_used = infer(
+                        "Emilia_ZH_EN_pinyin", # Substitua pelo nome do seu projeto
+                        "/home/user/app/model_1200000.safetensors", # Substitua pelo caminho do seu checkpoint
+                        "F5-TTS", # Ou "E2-TTS" dependendo do seu modelo
+                        ref_text_input,
                         ref_audio_input,
+                        chunk,
                         nfe_slider,
+                        True, # use_ema - ajuste se necessário
+                        speed_slider,
+                        seed_input,
+                        remove_silence,
                     )
+                    audio_data, _ = torchaudio.load(audio_file)
+                    audio_segments.append(audio_data.squeeze().cpu().numpy())
                 # Concatenar os segmentos de áudio gerados
                 if audio_segments:
                     final_audio_data = np.concatenate(audio_segments)
                     return (
+                        (24000, final_audio_data),  # Áudio final - assumindo taxa de amostragem de 24000
+                        None,  # Espectrograma - não estamos gerando um espectrograma aqui
+                        gr.update(value=ref_text_input),  # Nenhuma mudança no Texto de Referência
+                        seed_used  # Retornando o seed
                     )
                 else:
                     gr.Warning("Nenhum áudio gerado.")
             # Saídas
             gr.Markdown("### Resultados")
             audio_output = gr.Audio(label="Áudio Sintetizado")
             seed_output = gr.Text(label="Seed usada:")  # Saída do Seed
             # Associação do botão `generate_btn` à função `process_chunks`
                 ],
                 outputs=[
                     audio_output,
+                    None,  # Não estamos usando a saída do espectrograma
                     ref_text_input,  # Atualiza o texto de referência, se necessário
                     seed_output,  # Saída do Seed
                 ],
             )
+# Código para iniciar a aplicação Gradio
     @click.command()
     @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
     @click.option("--host", "-H", default=None, help="Host to run the app on")

logs/sentence_analyzer_2024-12-02.log CHANGED Viewed

@@ -10,3 +10,13 @@
 2024-12-02 19:03:00,745 - SentenceAnalyzer - DEBUG - Normalized whitespace
 2024-12-02 19:03:00,777 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
 2024-12-02 19:03:00,778 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup

 2024-12-02 19:03:00,745 - SentenceAnalyzer - DEBUG - Normalized whitespace
 2024-12-02 19:03:00,777 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
 2024-12-02 19:03:00,778 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
+2024-12-02 19:07:26,868 - SentenceAnalyzer - DEBUG - Logger set up successfully
+2024-12-02 19:07:26,868 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
+2024-12-02 19:14:34,656 - SentenceAnalyzer - DEBUG - Starting sentence splitting
+2024-12-02 19:14:34,656 - SentenceAnalyzer - DEBUG - Normalized text using NFC
+2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Removed page numbers and chapter titles
+2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Replaced hyphenated line breaks
+2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Replaced multiple newlines with a space
+2024-12-02 19:14:34,657 - SentenceAnalyzer - DEBUG - Normalized whitespace
+2024-12-02 19:14:34,682 - SentenceAnalyzer - DEBUG - Split text into 2 sentences using NLTK
+2024-12-02 19:14:34,682 - SentenceAnalyzer - INFO - Split text into 2 sentences after cleanup