Luasmontesinos committed · verified
Commit f8616d8 · 1 Parent(s): 09d2db8

Update app.py

Files changed (1)
  1. app.py +38 -0
app.py CHANGED
@@ -1,3 +1,41 @@
+
+def translate(audio):
+    outputs = pipe(audio, generate_kwargs={"task": "translate", "max_new_tokens": 256})
+    return outputs["text"]
+
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+model.to(device)
+vocoder.to(device)
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[6000]["xvector"]).unsqueeze(0)
+
+
+def synthesise(text):
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(
+        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+    )
+    return speech.cpu()
+
+import numpy as np
+
+target_dtype = np.int16
+max_range = np.iinfo(target_dtype).max
+
+
+def speech_to_speech_translation(audio):
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
+    return 16000, synthesised_speech
+
+
 import gradio as gr
 
 demo = gr.Interface(
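
Note: the added code uses pipe, device, torch, and load_dataset without defining or importing them in this hunk, and the trailing demo = gr.Interface( context line is cut off at the hunk boundary. Below is a minimal sketch of the surrounding setup the code appears to assume; the Whisper checkpoint (openai/whisper-base) and the exact Gradio wiring are assumptions, not part of this commit.

# Minimal sketch (not part of this commit) of the setup the added code assumes:
# translate() needs an ASR pipeline named `pipe`, the models need a `device`,
# and torch / load_dataset must be imported elsewhere in app.py.
import torch
import gradio as gr
from datasets import load_dataset
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech-translation front end used by translate(); the checkpoint is an assumption.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# Possible completion of the truncated gr.Interface( call: take uploaded or
# recorded audio as a filepath, run speech_to_speech_translation from the hunk
# above, and play back the returned (sampling_rate, int16 waveform) tuple.
demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Translated speech", type="numpy"),
)
demo.launch()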