Spaces:

gitgato
/

speecht-to-speech

Runtime error

App Files Community

gitgato committed on May 10, 2024

Commit

4492d96

verified ·

1 Parent(s): d697ccc

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -29

app.py CHANGED Viewed

@@ -1,60 +1,54 @@
 import torch
 from transformers import pipeline
 from datasets import load_dataset
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-base"
 )
 def translate(audio):
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     return outputs["text"]
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("gitgato/mabama")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 embeddings_dataset = load_dataset("ovieyra21/mabama-v5", split="train")
-speaker_embeddings = torch.tensor("gitgato/mabama").unsqueeze(0)
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(
-        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
-    )
-    return speech.cpu()
-import numpy as np
 target_dtype = np.int16
 max_range = np.iinfo(target_dtype).max
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
     return 16000, synthesised_speech
-import gradio as gr
-demo = gr.Blocks()
-mic_translate = gr.Interface(
-    fn=speech_to_speech_translation,
-    inputs=gr.Audio(sources=["microphone"], type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
-)
-file_translate = gr.Interface(
     fn=speech_to_speech_translation,
-    inputs=gr.Audio(sources=["upload"], type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
 )
-with demo:
-    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
-demo.launch(debug=True)

 import torch
 from transformers import pipeline
 from datasets import load_dataset
+from transformers import SpeechT5Processor, SpeechT5ForConditionalGeneration, SpeechT5HifiGan
+import numpy as np
+import gradio as gr
+# Configurar el pipeline de reconocimiento automático de voz
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-base"
 )
+# Función para transcribir el audio y traducirlo a texto
 def translate(audio):
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     return outputs["text"]
+# Cargar el procesador y el modelo de SpeechT5
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForConditionalGeneration.from_pretrained("gitgato/mabama")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# Cargar los datos de embeddings del hablante
 embeddings_dataset = load_dataset("ovieyra21/mabama-v5", split="train")
+speaker_embeddings = torch.tensor(embeddings_dataset[0]["speaker_embedding"]).unsqueeze(0)
+# Función para sintetizar el habla
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate(inputs["input_ids"], speaker_embedding=speaker_embeddings, vocoder=vocoder)
+    return speech.numpy()
+# Configuración para el tipo de audio de salida
 target_dtype = np.int16
 max_range = np.iinfo(target_dtype).max
+# Función para traducción de habla a habla
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
+    synthesised_speech = (synthesised_speech * max_range).astype(np.int16)
     return 16000, synthesised_speech
+# Interfaz de Gradio
+demo = gr.Interface(
     fn=speech_to_speech_translation,
+    inputs=gr.Audio(sources=["microphone"], type="file", label="Input Audio"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    title="Speech-to-Speech Translation",
+    description="Translate speech input to synthesized speech output."
 )
+# Lanzar la interfaz
+demo.launch(debug=True)