Spaces:

M4xjunior
/

locseed

Sleeping

App Files Files Community

M4xjunior committed on Dec 2, 2024

Commit

9bf0190

1 Parent(s): ebbe300

fix

Browse files

Files changed (8) hide show

__pycache__/sentence_analyzer.cpython-310.pyc +0 -0
app.py +57 -51
logs/sentence_analyzer_2024-12-02.log +2 -0
samples/country.flac +0 -0
samples/main.flac +0 -0
samples/story.toml +0 -19
samples/story.txt +0 -1
samples/town.flac +0 -0

__pycache__/sentence_analyzer.cpython-310.pyc ADDED Viewed

Binary file (7.74 kB). View file

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import nltk
 nltk.download('punkt_tab')
 from sentence_analyzer import SentenceAnalyzer
@@ -6,7 +5,6 @@ import re
 import tempfile
 from collections import OrderedDict
 from importlib.resources import files
 import click
 import gradio as gr
 import numpy as np
@@ -17,34 +15,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 try:
     import spaces
     USING_SPACES = True
 except ImportError:
     USING_SPACES = False
 def gpu_decorator(func):
     # Decorator used on `infer` below. When the optional `spaces` package was
     # importable (USING_SPACES is set by the try/except import just above),
     # wrap `func` with `spaces.GPU`; otherwise hand `func` back unchanged so
     # the app still runs where `spaces` is not installed.
     if USING_SPACES:
         return spaces.GPU(func)
     else:
         return func
-from f5_tts.model import DiT, UNetT
-from f5_tts.infer.utils_infer import (
-    load_vocoder,
-    load_model,
-    preprocess_ref_audio_text,
-    infer_process,
-    remove_silence_for_generated_wav,
-    save_spectrogram,
-)
-# Carregar vocoder
-vocoder = load_vocoder()
 import os
 from huggingface_hub import hf_hub_download
 def load_f5tts():
     # Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
     repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
@@ -55,16 +41,32 @@ def load_f5tts():
         raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
     # Faz o download do modelo do repositório privado
     ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
     F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
-    return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
-# Carregar modelo F5TTS
 F5TTS_ema_model = load_f5tts()
 @gpu_decorator
 def infer(
-    ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info
 ):
     print(nfe)
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
@@ -80,6 +82,7 @@ def infer(
         speed=speed,
         show_info=show_info,
         progress=gr.Progress(),
     )
     # Remover silêncios
     if remove_silence:
@@ -92,7 +95,8 @@ def infer(
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
         spectrogram_path = tmp_spectrogram.name
         save_spectrogram(combined_spectrogram, spectrogram_path)
-    return (final_sample_rate, final_wave), spectrogram_path, ref_text
 # Estilos CSS
 custom_css = """
@@ -115,7 +119,7 @@ with gr.Blocks(css=custom_css) as app:
     with gr.Tabs():
         with gr.Tab("TTS Básico"):
             gr.Markdown("# TTS Básico com F5-TTS")
             # Entradas básicas
             ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
             gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
@@ -142,7 +146,6 @@ with gr.Blocks(css=custom_css) as app:
                     step=0.1,
                     info="Ajuste a velocidade do áudio.",
                 )
                 cross_fade_duration_slider = gr.Slider(
                     label="Duração do Cross-fade (s)",
                     minimum=0.0,
@@ -167,9 +170,7 @@ with gr.Blocks(css=custom_css) as app:
                     step=1,
                     info="Ajuste NFE Step.",
                 )
             analyzer = SentenceAnalyzer()
@@ -183,20 +184,21 @@ with gr.Blocks(css=custom_css) as app:
                 speed_slider,
                 nfe_slider,
                 chunk_size_slider,
             ):
                 # Dividir o texto em sentenças
                 sentences = analyzer.split_into_sentences(gen_text_input)
                 # Agrupar sentenças em chunks
                 chunks = [
                     " ".join(sentences[i : i + chunk_size_slider])
                     for i in range(0, len(sentences), chunk_size_slider)
                 ]
                 # Processar cada chunk
                 audio_segments = []
                 for chunk in chunks:
-                    audio_out, spectrogram_path, ref_text_out = infer(
                         ref_audio_input,
                         ref_text_input,  # Utiliza o Texto de Referência como está
                         chunk,  # Processa o chunk atual
@@ -204,10 +206,11 @@ with gr.Blocks(css=custom_css) as app:
                         cross_fade_duration_slider,
                         speed_slider,
                         nfe_slider,
                     )
                     sr, audio_data = audio_out
                     audio_segments.append(audio_data)
                 # Concatenar os segmentos de áudio gerados
                 if audio_segments:
                     final_audio_data = np.concatenate(audio_segments)
@@ -215,16 +218,17 @@ with gr.Blocks(css=custom_css) as app:
                         (sr, final_audio_data),  # Áudio final
                         spectrogram_path,  # Espectrograma
                         gr.update(value=ref_text_out),  # Nenhuma mudança no Texto de Referência
                     )
                 else:
                     gr.Warning("Nenhum áudio gerado.")
-                    return None, None, gr.update()
             # Saídas
             gr.Markdown("### Resultados")
             audio_output = gr.Audio(label="Áudio Sintetizado")
             spectrogram_output = gr.Image(label="Espectrograma")
             # Associação do botão `generate_btn` à função `process_chunks`
             generate_btn.click(
@@ -238,34 +242,36 @@ with gr.Blocks(css=custom_css) as app:
                     speed_slider,
                     nfe_slider,
                     chunk_size_slider,
                 ],
                 outputs=[
                     audio_output,
                     spectrogram_output,
                     ref_text_input,  # Atualiza o texto de referência, se necessário
                 ],
             )
-@click.command()
-@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
-@click.option("--host", "-H", default=None, help="Host to run the app on")
-@click.option(
-    "--share",
-    "-s",
-    default=False,
-    is_flag=True,
-    help="Share the app via Gradio share link",
-)
-@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
-def main(port, host, share, api):
-    global app
-    print("Starting app...")
-    app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
 if __name__ == "__main__":
     if not USING_SPACES:
         main()
     else:
         app.queue().launch()

 import nltk
 nltk.download('punkt_tab')
 from sentence_analyzer import SentenceAnalyzer
 import tempfile
 from collections import OrderedDict
 from importlib.resources import files
 import click
 import gradio as gr
 import numpy as np
 try:
     import spaces
     USING_SPACES = True
 except ImportError:
     USING_SPACES = False
 def gpu_decorator(func):
     # Decorator used on `infer` below. When the optional `spaces` package was
     # importable (USING_SPACES is set by the try/except import just above),
     # wrap `func` with `spaces.GPU`; otherwise hand `func` back unchanged so
     # the app still runs where `spaces` is not installed.
     if USING_SPACES:
         return spaces.GPU(func)
     else:
         return func
+# Importando a nova API F5TTS
+from f5_tts.api import F5TTS
 import os
 from huggingface_hub import hf_hub_download
 def load_f5tts():
     # Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
     repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
         raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
     # Faz o download do modelo do repositório privado
     ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
+    # Define as configurações do modelo (ajuste se necessário)
     F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+    # Retorna a instância da API F5TTS
+    return F5TTS(
+        model_type="F5TTS_Base",  # Ajuste o nome do modelo se necessário
+        ckpt_file=ckpt_path,
+        vocab_file=os.path.join(os.path.dirname(ckpt_path), "vocab.txt"), # Caminho para o arquivo vocab.txt
+        device="cuda" if torchaudio.cuda.is_available() else "cpu", # Define o dispositivo
+        use_ema=True
+    )
+# Carregar modelo F5TTS usando a nova API
 F5TTS_ema_model = load_f5tts()
+# Variáveis globais para o cache
+last_checkpoint = None
+last_device = None
+last_ema = None
+tts_api = None
+training_process = None  # Adicione esta linha se necessário para o seu contexto
 @gpu_decorator
 def infer(
+    ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info, seed=-1
 ):
     print(nfe)
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
         speed=speed,
         show_info=show_info,
         progress=gr.Progress(),
+        seed=seed  # Passando o seed para infer_process
     )
     # Remover silêncios
     if remove_silence:
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
         spectrogram_path = tmp_spectrogram.name
         save_spectrogram(combined_spectrogram, spectrogram_path)
+    return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed # Retornando o seed
 # Estilos CSS
 custom_css = """
     with gr.Tabs():
         with gr.Tab("TTS Básico"):
             gr.Markdown("# TTS Básico com F5-TTS")
             # Entradas básicas
             ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
             gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
                     step=0.1,
                     info="Ajuste a velocidade do áudio.",
                 )
                 cross_fade_duration_slider = gr.Slider(
                     label="Duração do Cross-fade (s)",
                     minimum=0.0,
                     step=1,
                     info="Ajuste NFE Step.",
                 )
+                seed_input = gr.Number(label="Seed", value=-1, minimum=-1)  # Seed na seção avançada
             analyzer = SentenceAnalyzer()
                 speed_slider,
                 nfe_slider,
                 chunk_size_slider,
+                seed_input,  # Passando o seed para process_chunks
             ):
                 # Dividir o texto em sentenças
                 sentences = analyzer.split_into_sentences(gen_text_input)
                 # Agrupar sentenças em chunks
                 chunks = [
                     " ".join(sentences[i : i + chunk_size_slider])
                     for i in range(0, len(sentences), chunk_size_slider)
                 ]
                 # Processar cada chunk
                 audio_segments = []
                 for chunk in chunks:
+                    audio_out, spectrogram_path, ref_text_out, seed_output = infer(  # Recebendo o seed de infer
                         ref_audio_input,
                         ref_text_input,  # Utiliza o Texto de Referência como está
                         chunk,  # Processa o chunk atual
                         cross_fade_duration_slider,
                         speed_slider,
                         nfe_slider,
+                        seed=seed_input,  # Passando o seed para infer
                     )
                     sr, audio_data = audio_out
                     audio_segments.append(audio_data)
                 # Concatenar os segmentos de áudio gerados
                 if audio_segments:
                     final_audio_data = np.concatenate(audio_segments)
                         (sr, final_audio_data),  # Áudio final
                         spectrogram_path,  # Espectrograma
                         gr.update(value=ref_text_out),  # Nenhuma mudança no Texto de Referência
+                        seed_output  # Retornando o seed
                     )
                 else:
                     gr.Warning("Nenhum áudio gerado.")
+                    return None, None, gr.update(), None  # Retornando None para o seed
             # Saídas
             gr.Markdown("### Resultados")
             audio_output = gr.Audio(label="Áudio Sintetizado")
             spectrogram_output = gr.Image(label="Espectrograma")
+            seed_output = gr.Text(label="Seed usada:")  # Saída do Seed
             # Associação do botão `generate_btn` à função `process_chunks`
             generate_btn.click(
                     speed_slider,
                     nfe_slider,
                     chunk_size_slider,
+                    seed_input,  # Passando o seed como entrada
                 ],
                 outputs=[
                     audio_output,
                     spectrogram_output,
                     ref_text_input,  # Atualiza o texto de referência, se necessário
+                    seed_output,  # Saída do Seed
                 ],
             )
+    # Código para iniciar a aplicação Gradio
+    @click.command()
+    @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
+    @click.option("--host", "-H", default=None, help="Host to run the app on")
+    @click.option(
+        "--share",
+        "-s",
+        default=False,
+        is_flag=True,
+        help="Share the app via Gradio share link",
+    )
+    @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
+    def main(port, host, share, api):
+        global app
+        print("Starting app...")
+        app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
 if __name__ == "__main__":
     if not USING_SPACES:
         main()
     else:
         app.queue().launch()

logs/sentence_analyzer_2024-12-02.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2024-12-02 18:27:53,692 - SentenceAnalyzer - DEBUG - Logger set up successfully
2	+ 2024-12-02 18:27:53,692 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully

samples/country.flac DELETED Viewed

Binary file (180 kB)

samples/main.flac DELETED Viewed

Binary file (279 kB)

samples/story.toml DELETED Viewed

@@ -1,19 +0,0 @@
-# F5-TTS | E2-TTS
-model = "F5-TTS"
-ref_audio = "samples/main.flac"
-# If an empty "", transcribes the reference audio automatically.
-ref_text = ""
-gen_text = ""
-# File with text to generate. Ignores the text above.
-gen_file = "samples/story.txt"
-remove_silence = true
-output_dir = "samples"
-[voices.town]
-ref_audio = "samples/town.flac"
-ref_text = ""
-[voices.country]
-ref_audio = "samples/country.flac"
-ref_text = ""

samples/story.txt DELETED Viewed

@@ -1 +0,0 @@

- A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land.” [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] “Goodbye,” [main] said he, [country] “I’m off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace.”

samples/town.flac DELETED Viewed

Binary file (229 kB)