pknayak committed on
Commit
35fac88
·
1 Parent(s): 5c60490

Updated app.py

Browse files

Modified the translate and synthesize methods to convert the input speech into Spanish text and then synthesize it using the SpeechT5 model fine-tuned on the Dutch language

Files changed (1) hide show
  1. app.py +8 -6
app.py CHANGED
@@ -12,7 +12,8 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 
16
 
17
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
@@ -22,11 +23,12 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
22
 
23
 
24
  def translate(audio):
25
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
 
26
  return outputs["text"]
27
 
28
 
29
- def synthesise(text):
30
  inputs = processor(text=text, return_tensors="pt")
31
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
32
  return speech.cpu()
@@ -34,15 +36,15 @@ def synthesise(text):
34
 
35
  def speech_to_speech_translation(audio):
36
  translated_text = translate(audio)
37
- synthesised_speech = synthesise(translated_text)
38
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
39
  return 16000, synthesised_speech
40
 
41
 
42
  title = "Cascaded STST"
43
  description = """
44
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
45
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
46
 
47
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
48
  """
 
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
+ #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
+ processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
17
 
18
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
19
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
23
 
24
 
25
  def translate(audio):
26
+ # outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task":"translate"})
27
+ outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
28
  return outputs["text"]
29
 
30
 
31
+ def synthesize(text):
32
  inputs = processor(text=text, return_tensors="pt")
33
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
34
  return speech.cpu()
 
36
 
37
  def speech_to_speech_translation(audio):
38
  translated_text = translate(audio)
39
+ synthesised_speech = synthesize(translated_text)
40
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
41
  return 16000, synthesised_speech
42
 
43
 
44
  title = "Cascaded STST"
45
  description = """
46
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Fine tuned version of Microsoft's
47
+ [SpeechT5 TTS](https://huggingface.co/sanchit-gandhi/speecht5_tts_vox_nl) model by [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi) for text-to-speech:
48
 
49
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
50
  """