NicolasDenier committed on
Commit f4703fc
1 Parent(s): c63bae6

Update app.py

Files changed (1)
  1. app.py +11 -7
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 import numpy as np
 import torch
 from datasets import load_dataset
-
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
 
@@ -12,7 +11,7 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+processor = SpeechT5Processor.from_pretrained("Sandiago21/speecht5_finetuned_facebook_voxpopuli_french") # "microsoft/speecht5_tts"
 
 model = SpeechT5ForTextToSpeech.from_pretrained("Sandiago21/speecht5_finetuned_facebook_voxpopuli_french").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
@@ -21,9 +20,13 @@ def npy_loader(path):
     np_sample = np.transpose(np.load(path))
     sample = torch.from_numpy(np_sample)
     return sample
-#embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-xvector_path = "xvectors/french_recording-bernard-candide_segment_182.npy"
-speaker_embeddings = torch.tensor(npy_loader(xvector_path))
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation", streaming=True)
+speaker_embeddings = torch.tensor(next(iter(embeddings_dataset))["xvector"]).unsqueeze(0)
+
+#xvector_path = "xvectors/french_recording-bernard-candide_segment_182.npy"
+#speaker_embeddings = torch.tensor(npy_loader(xvector_path)[0]).unsqueeze(0)
+#print("speaker_embeddings shape", speaker_embeddings.shape)
 
 
 def translate(audio):
@@ -39,6 +42,7 @@ def synthesise(text):
 
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
+    print("translated_text", translated_text)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
@@ -46,8 +50,8 @@ def speech_to_speech_translation(audio):
 
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in french. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
+[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech [finetuned for french language](https://huggingface.co/Sandiago21/speecht5_finetuned_facebook_voxpopuli_french):
 
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """