Updated app.py
Added modifications to the translate and synthesize methods: Whisper now transcribes the source speech into Spanish, which is then synthesized with a SpeechT5 model fine-tuned on Dutch.
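In short, the translate step now asks Whisper for a Spanish transcription instead of an English translation. A minimal sketch of the new call, assuming the pipeline object is the asr_pipe loaded near the top of app.py (the diff below writes pipe, so that name has to match whatever ASR pipeline the file actually defines):

```python
from transformers import pipeline

# Whisper checkpoint used by this Space
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def translate(audio):
    # Force a Spanish transcription rather than Whisper's translate-to-English task
    outputs = asr_pipe(audio, max_new_tokens=256,
                       generate_kwargs={"task": "transcribe", "language": "es"})
    return outputs["text"]
```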
app.py
CHANGED
@@ -12,7 +12,8 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+#processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
 
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
@@ -22,11 +23,12 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
 
 
 def translate(audio):
-    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task":"translate"})
+    # outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task":"translate"})
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
     return outputs["text"]
 
 
-def synthesise(text):
+def synthesize(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
@@ -34,15 +36,15 @@ def synthesise(text):
 
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
-    synthesised_speech = synthesise(translated_text)
+    synthesised_speech = synthesize(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 
 
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Fine tuned version of Microsoft's
+[SpeechT5 TTS](https://huggingface.co/sanchit-gandhi/speecht5_tts_vox_nl) model by [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi) for text-to-speech:
 
 
 """
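For reference, a minimal sketch of the synthesis side and the full cascade after this commit, building on the translate() sketched above. The diff only shows the changed hunks, so two details here are assumptions: the speaker x-vector is loaded from the Matthijs/cmu-arctic-xvectors dataset (only index 7306 is visible in the hunk header), and the ASR pipeline is referred to as asr_pipe as at the top of the file, whereas the committed translate() calls pipe and would raise a NameError unless a pipe object is defined elsewhere in app.py.

```python
import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Processor (tokenizer) from the Dutch fine-tune; the diff keeps the acoustic model
# and vocoder on the original Microsoft checkpoints
processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Speaker embedding; the dataset name is an assumption, index 7306 comes from the diff
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def synthesize(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"].to(device),
                                   speaker_embeddings.to(device), vocoder=vocoder)
    return speech.cpu()

def speech_to_speech_translation(audio):
    translated_text = translate(audio)  # Spanish text from Whisper (see sketch above)
    synthesised_speech = synthesize(translated_text)
    # Scale the float waveform in [-1, 1] to 16-bit PCM; SpeechT5 generates 16 kHz audio
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech
```

The commit swaps only the processor to the Dutch fine-tune while the acoustic model stays on microsoft/speecht5_tts; whether to load the model weights from the same fine-tuned checkpoint is a design choice the diff leaves as is.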