speech-to-speech-translation

Runtime error

pknayak commited on Aug 31, 2023

Commit

35fac88

1 Parent(s): 5c60490

Updated app.py

Added the modifications to translate and synthesize method to convert into Spanish and then synthesize using the SPeechT5 model on Dutch language

Files changed (1) hide show

app.py +8 -6

app.py CHANGED Viewed

@@ -12,7 +12,8 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 # load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
@@ -22,11 +23,12 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     return outputs["text"]
-def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
@@ -34,15 +36,15 @@ def synthesise(text):
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
-    synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """

 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 # load text-to-speech checkpoint and speaker embeddings
+#processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 def translate(audio):
+    # outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task":"translate"})
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
     return outputs["text"]
+def synthesize(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
+    synthesised_speech = synthesize(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 title = "Cascaded STST"
 description = """
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Fine tuned version of Microsoft's
+[SpeechT5 TTS](https://huggingface.co/sanchit-gandhi/speecht5_tts_vox_nl) model by [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi) for text-to-speech:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """