Spaces:

alex16052G
/

abi

Paused

App Files Files Community

alex16052G committed on Jan 22

Commit

68d6bb1

verified ·

1 Parent(s): c0a2aa1

Update app.py

Browse files

Files changed (1) hide show

app.py +345 -37

app.py CHANGED Viewed

@@ -1,46 +1,354 @@
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import subprocess
-from f5_infer import F5TTS
-# Instalar Spanish-F5 automáticamente desde GitHub
-def install_spanish_f5():
-    subprocess.run(["pip", "install", "--upgrade", "git+https://github.com/jpgallegoar/Spanish-F5", "--no-cache-dir"], check=True)
-# Instalar Spanish-F5
-install_spanish_f5()
-# Cargar el modelo Qwen2.5-3B-Instruct
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-2.5B-Instruct")
-model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-2.5B-Instruct", device_map="auto")
-# Inicializar Spanish-F5 para síntesis de voz
-tts = F5TTS()
-# Función principal para el flujo del chat con voz
-def chat_with_voice(input_text):
-    # Generar respuesta con Qwen
-    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-    outputs = model.generate(**inputs, max_length=200)
-    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Convertir respuesta a audio usando Spanish-F5
-    audio_path = tts.generate_tts(response_text, output_path="response.wav")
-    return response_text, audio_path
-# Interfaz de Gradio
-with gr.Blocks() as demo:
-    gr.Markdown("# Chat AI con Voz (Qwen y Spanish-F5)")
-    with gr.Row():
-        input_text = gr.Textbox(label="Escribe tu mensaje:", placeholder="¿Cómo puedo ayudarte hoy?")
-    with gr.Row():
-        response_text = gr.Textbox(label="Respuesta del modelo")
-        response_audio = gr.Audio(label="Respuesta en voz", type="filepath")
-    send_btn = gr.Button("Enviar")
-    # Conectar eventos
-    send_btn.click(chat_with_voice, inputs=input_text, outputs=[response_text, response_audio])
-# Ejecutar la app
-demo.launch()

+# ruff: noqa: E402
+# Above allows ruff to ignore E402: module level import not at top of file
+import re
+import tempfile
 import gradio as gr
+import numpy as np
+import soundfile as sf
+import torchaudio
+from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+from f5_tts.model import DiT
+from f5_tts.infer.utils_infer import (
+    load_vocoder,
+    load_model,
+    preprocess_ref_audio_text,
+    infer_process,
+    remove_silence_for_generated_wav,
+)
+# Try to import 'spaces'; USING_SPACES records whether the import succeeded
+# (used below to decide whether the app is running on Hugging Face Spaces).
+try:
+    import spaces
+    USING_SPACES = True
+except ImportError:
+    USING_SPACES = False
+# Decorator that routes a function through spaces.GPU when the spaces package is present
+def gpu_decorator(func):
+    """Return ``spaces.GPU(func)`` when ``USING_SPACES`` is true, otherwise ``func`` unchanged."""
+    return spaces.GPU(func) if USING_SPACES else func
+# Load the vocoder (done once, at import time)
+vocoder = load_vocoder()
+# Load the F5-TTS model from the jpgallegoar/F5-Spanish checkpoint (also at import time)
+F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+F5TTS_ema_model = load_model(
+    DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
+)
+# Module-level cache for the chat model and tokenizer; filled in by load_chat_model()
+chat_model_state = None
+chat_tokenizer_state = None
+@gpu_decorator
+def generate_response(messages, model, tokenizer):
+    """Generate one chat reply for ``messages`` and return it as a string.
+    The messages are rendered with the tokenizer's chat template (generation
+    prompt appended), passed to ``model.generate`` with ``max_new_tokens=512``,
+    ``temperature=0.7`` and ``top_p=0.95``, and only the newly generated tokens
+    of the first sequence are decoded.
+    """
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
+    full_sequences = model.generate(**encoded, max_new_tokens=512, temperature=0.7, top_p=0.95)
+    # Drop the prompt tokens from the front of each sequence so only the reply is decoded.
+    reply_ids = []
+    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
+        reply_ids.append(sequence[len(prompt_ids):])
+    decoded = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)
+    return decoded[0]
+def traducir_numero_a_texto(texto):
+    """Replace every standalone integer in ``texto`` with its Spanish words.
+    Digits glued to letters are first separated by a space (``"abc123"`` becomes
+    ``"abc 123"``), then each run of digits is passed to ``num2words`` with
+    ``lang='es'``. Text without any digit is returned unchanged.
+    """
+    # Fast path: nothing to translate, so num2words is not needed at all.
+    if re.search(r'\d', texto) is None:
+        return texto
+    # Imported here because the module has no file-level import of num2words;
+    # without it the callback below raises NameError on the first number.
+    from num2words import num2words
+    # [^\W\d_] is "word character that is neither a decimal digit nor an underscore",
+    # so accented letters and "ñ" are split from digits as well as plain ASCII letters.
+    texto_separado = re.sub(r'([^\W\d_])(\d)', r'\1 \2', texto)
+    texto_separado = re.sub(r'(\d)([^\W\d_])', r'\1 \2', texto_separado)
+    def reemplazar_numero(match):
+        return num2words(int(match.group()), lang='es')
+    return re.sub(r'\b\d+\b', reemplazar_numero, texto_separado)
+@gpu_decorator
+def infer(
+    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
+):
+    """Synthesise ``gen_text`` using the reference clip and return the audio.
+    Args:
+        ref_audio_orig: Reference audio, passed to ``preprocess_ref_audio_text``.
+        ref_text: Transcript of the reference audio, passed alongside it.
+        gen_text: Text to speak. It is padded with a leading space and a trailing
+            ". ", lower-cased, and run through ``traducir_numero_a_texto``.
+        model: Unused; ``F5TTS_ema_model`` is always the model that runs.
+        remove_silence: When true, the result is round-tripped through a temporary
+            WAV file so ``remove_silence_for_generated_wav`` can process it.
+        cross_fade_duration: Forwarded to ``infer_process``.
+        speed: Forwarded to ``infer_process``.
+        show_info: Callback forwarded to the preprocessing and inference helpers.
+    Returns:
+        A ``(sample_rate, waveform)`` tuple.
+    """
+    import os  # local import: only needed to delete the temporary WAV file
+    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
+    ema_model = F5TTS_ema_model
+    if not gen_text.startswith(" "):
+        gen_text = " " + gen_text
+    if not gen_text.endswith(". "):
+        gen_text += ". "
+    gen_text = gen_text.lower()
+    gen_text = traducir_numero_a_texto(gen_text)
+    final_wave, final_sample_rate, combined_spectrogram = infer_process(
+        ref_audio,
+        ref_text,
+        gen_text,
+        ema_model,
+        vocoder,
+        cross_fade_duration=cross_fade_duration,
+        speed=speed,
+        show_info=show_info,
+        progress=gr.Progress(),
+    )
+    # Remove silences if requested
+    if remove_silence:
+        # Only reserve a path here; the handle is closed before anything writes to it.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+            tmp_path = f.name
+        try:
+            sf.write(tmp_path, final_wave, final_sample_rate)
+            remove_silence_for_generated_wav(tmp_path)
+            final_wave, _ = torchaudio.load(tmp_path)
+        finally:
+            # delete=False means nobody else cleans this up; without the remove,
+            # every generated reply leaves a WAV file behind in the temp directory.
+            os.remove(tmp_path)
+        final_wave = final_wave.squeeze().cpu().numpy()
+    return (final_sample_rate, final_wave)
+def load_chat_model():
+    """Load the chat model and tokenizer once and cache them in module globals.
+    Returns the cached ``(model, tokenizer)`` pair; later calls reuse it.
+    """
+    global chat_model_state, chat_tokenizer_state
+    if chat_model_state is None or chat_tokenizer_state is None:
+        model_name = "Qwen/Qwen2.5-3B-Instruct"
+        # Load into locals first and publish both together, so a failed load
+        # cannot leave the globals half-initialised (model set, tokenizer None).
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name, torch_dtype=torch.float16, device_map="auto"
+        )
+        chat_model_state, chat_tokenizer_state = model, tokenizer
+    return chat_model_state, chat_tokenizer_state
+# Gradio UI for the voice chat: layout first, event wiring further down.
+with gr.Blocks() as app_chat:
+    gr.Markdown(
+        """
+# Chat de Voz
+¡Mantén una conversación con una IA usando tu voz de referencia!
+1. Sube un clip de audio de referencia y opcionalmente su transcripción.
+2. Carga el modelo de chat.
+3. Graba tu mensaje a través de tu micrófono.
+4. La IA responderá usando la voz de referencia.
+        """
+    )
+    if not USING_SPACES:
+        # Outside Spaces the chat column starts hidden and a button loads the model.
+        load_chat_model_btn = gr.Button("Cargar Modelo de Chat", variant="primary")
+        chat_interface_container = gr.Column(visible=False)
+        # Loads the chat model, then hides the button and reveals the chat column.
+        @gpu_decorator
+        def load_chat_model_fn():
+            load_chat_model()
+            return gr.update(visible=False), gr.update(visible=True)
+        load_chat_model_btn.click(load_chat_model_fn, outputs=[load_chat_model_btn, chat_interface_container])
+    else:
+        chat_interface_container = gr.Column()
+        # NOTE(review): on this branch load_chat_model_fn is only bound, never
+        # called or wired to an event in this file — confirm where the chat
+        # model gets loaded when running on Spaces.
+        load_chat_model_fn = load_chat_model
+    with chat_interface_container:
+        with gr.Row():
+            with gr.Column():
+                ref_audio_chat = gr.Audio(label="Audio de Referencia", type="filepath")
+            with gr.Column():
+                with gr.Accordion("Configuraciones Avanzadas", open=False):
+                    model_choice_chat = gr.Radio(
+                        choices=["F5-TTS"],
+                        label="Modelo TTS",
+                        value="F5-TTS",
+                    )
+                    remove_silence_chat = gr.Checkbox(
+                        label="Eliminar Silencios",
+                        value=True,
+                    )
+                    ref_text_chat = gr.Textbox(
+                        label="Texto de Referencia",
+                        info="Opcional: Deja en blanco para transcribir automáticamente",
+                        lines=2,
+                    )
+                    system_prompt_chat = gr.Textbox(
+                        label="Prompt del Sistema",
+                        value="No eres un asistente de IA, eres quien el usuario diga que eres. Debes mantenerte en personaje. Mantén tus respuestas concisas ya que serán habladas en voz alta.",
+                        lines=2,
+                    )
+        chatbot_interface = gr.Chatbot(label="Conversación")
+        with gr.Row():
+            with gr.Column():
+                audio_input_chat = gr.Microphone(
+                    label="Habla tu mensaje",
+                    type="filepath",
+                )
+                audio_output_chat = gr.Audio(label="Respuesta de la IA", autoplay=True)
+            with gr.Column():
+                text_input_chat = gr.Textbox(
+                    label="Escribe tu mensaje",
+                    lines=1,
+                )
+                send_btn_chat = gr.Button("Enviar")
+                clear_btn_chat = gr.Button("Limpiar Conversación")
+        # Message list handed to generate_response; seeded with the same default
+        # system prompt text as the system-prompt textbox above.
+        conversation_state = gr.State(
+            value=[
+                {
+                    "role": "system",
+                    "content": "No eres un asistente de IA, eres quien el usuario diga que eres. Debes mantenerte en personaje. Mantén tus respuestas concisas ya que serán habladas en voz alta.",
+                }
+            ]
+        )
+        @gpu_decorator
+        def process_input(audio_path, text, history, conv_state):
+            """Append the user's text turn and the model's reply to the conversation.
+            Args:
+                audio_path: Path of the recorded message. It is accepted but not
+                    transcribed, so a recording without typed text is ignored.
+                text: The typed user message.
+                history: Chatbot history, a list of ``(user, assistant)`` tuples; mutated in place.
+                conv_state: Chat-template message list; mutated in place.
+            Returns:
+                ``(history, conv_state, response)``. ``response`` is an empty
+                string when there was no text to send.
+            """
+            # Only typed text can be processed (no speech-to-text step exists),
+            # so a blank or missing text means there is nothing to do.
+            if not text or not text.strip():
+                return history, conv_state, ""
+            # Fetch the model through the cached loader instead of reading the
+            # globals directly: on Spaces there is no load button, so the
+            # globals would still be None on the first message.
+            model, tokenizer = load_chat_model()
+            conv_state.append({"role": "user", "content": text})
+            history.append((text, None))
+            response = generate_response(conv_state, model, tokenizer)
+            conv_state.append({"role": "assistant", "content": response})
+            history[-1] = (text, response)
+            return history, conv_state, response
+        @gpu_decorator
+        def generate_audio_response(response, ref_audio, ref_text, model, remove_silence):
+            """Synthesise the assistant's reply with the reference voice.
+            Args:
+                response: Text to speak; nothing is generated when it is empty.
+                ref_audio: Reference audio; nothing is generated when it is missing.
+                ref_text: Transcript of the reference audio, forwarded to ``infer``.
+                model: TTS model choice from the UI, forwarded to ``infer``.
+                remove_silence: Forwarded to ``infer``.
+            Returns:
+                The ``(sample_rate, waveform)`` tuple produced by ``infer``, or
+                ``None`` when there is nothing to synthesise.
+            """
+            if not response or not ref_audio:
+                return None
+            # infer() returns one (sample_rate, waveform) tuple. Return it whole:
+            # unpacking it into two names would hand the audio component only
+            # the integer sample rate and discard the waveform.
+            return infer(
+                ref_audio,
+                ref_text,
+                response,
+                model,
+                remove_silence,
+                cross_fade_duration=0.15,
+                speed=1.0,
+                show_info=print,
+            )
+        def clear_conversation_fn():
+            """Reset the chat: an empty visible history plus a conversation holding only the default system prompt."""
+            default_system_message = dict(
+                role="system",
+                content="No eres un asistente de IA, eres quien el usuario diga que eres. Debes mantenerte en personaje. Mantén tus respuestas concisas ya que serán habladas en voz alta.",
+            )
+            return [], [default_system_message]
+        def update_system_prompt_fn(new_prompt):
+            """Restart the conversation with ``new_prompt`` as its only (system) message and clear the visible history."""
+            system_message = {"role": "system", "content": new_prompt}
+            return [], [system_message]
+        # Handle audio input: runs when a microphone recording stops.
+        # All three message chains below share the same shape:
+        #   1. process_input updates the chat and writes the assistant reply
+        #      into text_input_chat (its third output);
+        #   2. generate_audio_response reads text_input_chat as the text to speak;
+        #   3. a final step blanks one input component.
+        # This chain blanks the microphone, so the reply text stays in the textbox.
+        audio_input_chat.stop_recording(
+            process_input,
+            inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+            outputs=[chatbot_interface, conversation_state, text_input_chat],
+        ).then(
+            generate_audio_response,
+            inputs=[text_input_chat, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+            outputs=[audio_output_chat],
+        ).then(
+            lambda: None,
+            None,
+            audio_input_chat,
+        )
+        # Handle text input (Enter in the textbox); the last step clears the textbox.
+        text_input_chat.submit(
+            process_input,
+            inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+            outputs=[chatbot_interface, conversation_state, text_input_chat],
+        ).then(
+            generate_audio_response,
+            inputs=[text_input_chat, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+            outputs=[audio_output_chat],
+        ).then(
+            lambda: None,
+            None,
+            text_input_chat,
+        )
+        # Handle the send button; same chain as the textbox submit above.
+        send_btn_chat.click(
+            process_input,
+            inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+            outputs=[chatbot_interface, conversation_state, text_input_chat],
+        ).then(
+            generate_audio_response,
+            inputs=[text_input_chat, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+            outputs=[audio_output_chat],
+        ).then(
+            lambda: None,
+            None,
+            text_input_chat,
+        )
+        # Handle the clear-conversation button
+        clear_btn_chat.click(
+            clear_conversation_fn,
+            outputs=[chatbot_interface, conversation_state],
+        )
+        # Handle changes to the system prompt: any edit restarts the conversation
+        system_prompt_chat.change(
+            update_system_prompt_fn,
+            inputs=system_prompt_chat,
+            outputs=[chatbot_interface, conversation_state],
+        )
+def main():
+    """Launch the Gradio app.
+    Outside Spaces a click command line is exposed (port, host, share, api);
+    on Spaces the app is queued and launched with default settings.
+    """
+    if not USING_SPACES:
+        import click
+        @click.command()
+        @click.option("--port", "-p", default=None, type=int, help="Puerto para ejecutar la aplicación")
+        @click.option("--host", "-H", default=None, help="Host para ejecutar la aplicación")
+        @click.option(
+            "--share",
+            "-s",
+            default=False,
+            is_flag=True,
+            help="Compartir la aplicación a través de un enlace compartido de Gradio",
+        )
+        # Declared as an on/off pair: a plain is_flag option whose default is
+        # already True could never be switched off from the command line.
+        @click.option("--api/--no-api", "-a", default=True, help="Permitir acceso a la API")
+        def run_app(port, host, share, api):
+            print("Iniciando la aplicación de Chat AI...")
+            app_chat.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
+        run_app()
+    else:
+        app_chat.queue().launch()
+if __name__ == "__main__":
+    main()