benjipeng committed on
Commit
fe24b43
·
1 Parent(s): b96156b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -165
app.py CHANGED
@@ -2,194 +2,58 @@ import gradio as gr
2
  import numpy as np
3
  import torch
4
  from datasets import load_dataset
5
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
6
 
 
 
7
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
  # load speech translation checkpoint
11
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
12
- greek_translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-el")
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
- model_id = "microsoft/speecht5_tts" # update with your model id
16
- # pipe = pipeline("automatic-speech-recognition", model=model_id)
17
- model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
18
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
19
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
20
- speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
21
-
22
- processor = SpeechT5Processor.from_pretrained(model_id)
23
-
24
- model_id_greek = "Sandiago21/speecht5_finetuned_google_fleurs_greek"
25
- model_greek = SpeechT5ForTextToSpeech.from_pretrained(model_id_greek)
26
- processor_greek = SpeechT5Processor.from_pretrained(model_id_greek)
27
-
28
- replacements = [
29
- ("á", "a"),
30
- ("â", "a"),
31
- ("ã", "a"),
32
- ("í", "i"),
33
- ("á", "a"),
34
- ("í", "i"),
35
- ("ñ", "n"),
36
- ("ó", "o"),
37
- ("ú", "u"),
38
- ("ü", "u"),
39
- ("á", "a"),
40
- ("ç", "c"),
41
- ("è", "e"),
42
- ("ì", "i"),
43
- ("í", "i"),
44
- ("ò", "o"),
45
- ("ó", "o"),
46
- ("ù", "u"),
47
- ("ú", "u"),
48
- ("š", "s"),
49
- ("ï", "i"),
50
- ("à", "a"),
51
- ("â", "a"),
52
- ("ç", "c"),
53
- ("è", "e"),
54
- ("ë", "e"),
55
- ("î", "i"),
56
- ("ï", "i"),
57
- ("ô", "o"),
58
- ("ù", "u"),
59
- ("û", "u"),
60
- ("ü", "u"),
61
- ("ου", "u"),
62
- ("αυ", "af"),
63
- ("ευ", "ef"),
64
- ("ει", "i"),
65
- ("οι", "i"),
66
- ("αι", "e"),
67
- ("ού", "u"),
68
- ("εί", "i"),
69
- ("οί", "i"),
70
- ("αί", "e"),
71
- ("Ά", "A"),
72
- ("Έ", "E"),
73
- ("Ή", "H"),
74
- ("Ί", "I"),
75
- ("Ό", "O"),
76
- ("Ύ", "Y"),
77
- ("Ώ", "O"),
78
- ("ΐ", "i"),
79
- ("Α", "A"),
80
- ("Β", "B"),
81
- ("Γ", "G"),
82
- ("Δ", "L"),
83
- ("Ε", "Ε"),
84
- ("Ζ", "Z"),
85
- ("Η", "I"),
86
- ("Θ", "Th"),
87
- ("Ι", "I"),
88
- ("Κ", "K"),
89
- ("Λ", "L"),
90
- ("Μ", "M"),
91
- ("Ν", "N"),
92
- ("Ξ", "Ks"),
93
- ("Ο", "O"),
94
- ("Π", "P"),
95
- ("Ρ", "R"),
96
- ("Σ", "S"),
97
- ("Τ", "T"),
98
- ("Υ", "Y"),
99
- ("Φ", "F"),
100
- ("Χ", "X"),
101
- ("Ω", "O"),
102
- ("ά", "a"),
103
- ("έ", "e"),
104
- ("ή", "i"),
105
- ("ί", "i"),
106
- ("α", "a"),
107
- ("β", "v"),
108
- ("γ", "g"),
109
- ("δ", "d"),
110
- ("ε", "e"),
111
- ("ζ", "z"),
112
- ("η", "i"),
113
- ("θ", "th"),
114
- ("ι", "i"),
115
- ("κ", "k"),
116
- ("λ", "l"),
117
- ("μ", "m"),
118
- ("ν", "n"),
119
- ("ξ", "ks"),
120
- ("ο", "o"),
121
- ("π", "p"),
122
- ("ρ", "r"),
123
- ("ς", "s"),
124
- ("σ", "s"),
125
- ("τ", "t"),
126
- ("υ", "i"),
127
- ("φ", "f"),
128
- ("χ", "h"),
129
- ("ψ", "ps"),
130
- ("ω", "o"),
131
- ("ϊ", "i"),
132
- ("ϋ", "i"),
133
- ("ό", "o"),
134
- ("ύ", "i"),
135
- ("ώ", "o"),
136
- ("í", "i"),
137
- ("õ", "o"),
138
- ("Ε", "E"),
139
- ("Ψ", "Ps"),
140
- ]
141
-
142
- def cleanup_text(text):
143
- for src, dst in replacements:
144
- text = text.replace(src, dst)
145
- return text
146
-
147
-
148
- def synthesize_speech(text):
149
- text = cleanup_text(text)
150
- inputs = processor(text=text, return_tensors="pt")
151
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
152
 
153
- return gr.Audio.update(value=(16000, speech.cpu().numpy()))
154
 
155
 
156
- def translate_to_english(audio):
157
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "english"})
158
- return outputs["text"]
159
 
160
 
161
- def synthesise_from_english(text):
162
- text = cleanup_text(text)
163
- inputs = processor(text=text, return_tensors="pt")
164
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
165
- return speech.cpu().numpy()
 
 
 
166
 
 
 
167
 
168
- def translate_from_english_to_greek(text):
169
- return greek_translation_pipe(text)[0]["translation_text"]
170
 
 
 
 
171
 
172
- def synthesise_from_greek(text):
173
- text = cleanup_text(text)
174
- inputs = processor_greek(text=text, return_tensors="pt")
175
- speech = model_greek.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
176
  return speech.cpu()
177
 
178
 
179
  def speech_to_speech_translation(audio):
180
- translated_text = translate_to_english(audio)
181
- translated_text = translate_from_english_to_greek(translated_text)
182
- # synthesised_speech = synthesise_from_english(translated_text)
183
- # translated_text = translate_from_english_to_greek(synthesised_speech)
184
- synthesised_speech = synthesise_from_greek(translated_text)
185
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
186
- return ((16000, synthesised_speech), translated_text)
187
 
188
 
189
  title = "Cascaded STST"
190
  description = """
191
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Greek. Demo uses OpenAI's [Whisper Large v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and [Sandiago21/speecht5_finetuned_google_fleurs_greek](https://huggingface.co/Sandiago21/speecht5_finetuned_google_fleurs_greek) checkpoint for text-to-speech, which is based on Microsoft's
192
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, fine-tuned in Greek Audio dataset:
193
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
194
  """
195
 
@@ -198,7 +62,7 @@ demo = gr.Blocks()
198
  mic_translate = gr.Interface(
199
  fn=speech_to_speech_translation,
200
  inputs=gr.Audio(source="microphone", type="filepath"),
201
- outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
202
  title=title,
203
  description=description,
204
  )
@@ -206,7 +70,7 @@ mic_translate = gr.Interface(
206
  file_translate = gr.Interface(
207
  fn=speech_to_speech_translation,
208
  inputs=gr.Audio(source="upload", type="filepath"),
209
- outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
210
  examples=[["./example.wav"]],
211
  title=title,
212
  description=description,
 
2
  import numpy as np
3
  import torch
4
  from datasets import load_dataset
 
5
 
6
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
+ #from transformers import VitsModel, VitsTokenizer
8
 
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
 
11
  # load speech translation checkpoint
12
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
 
16
 
17
 
18
+ #model = VitsModel.from_pretrained("Matthijs/mms-tts-deu")
19
+ #tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu")
 
20
 
21
 
22
+ #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
23
+ processor = SpeechT5Processor.from_pretrained("kfahn/speecht5_finetuned_voxpopuli_es")
24
+ #processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
25
+
26
+ #model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl").to(device)
27
+ model = SpeechT5ForTextToSpeech.from_pretrained("kfahn/speecht5_finetuned_voxpopuli_es").to(device)
28
+ #model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
29
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
30
 
31
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
32
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
33
 
 
 
34
 
35
+ def translate(audio):
36
+ outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
37
+ return outputs["text"]
38
 
39
+
40
+ def synthesise(text):
41
+ inputs = processor(text=text, return_tensors="pt")
42
+ speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
43
  return speech.cpu()
44
 
45
 
46
  def speech_to_speech_translation(audio):
47
+ translated_text = translate(audio)
48
+ synthesised_speech = synthesise(translated_text)
 
 
 
49
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
50
+ return 16000, synthesised_speech
51
 
52
 
53
  title = "Cascaded STST"
54
  description = """
55
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
56
+ [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
57
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
58
  """
59
 
 
62
  mic_translate = gr.Interface(
63
  fn=speech_to_speech_translation,
64
  inputs=gr.Audio(source="microphone", type="filepath"),
65
+ outputs=gr.Audio(label="Generated Speech", type="numpy"),
66
  title=title,
67
  description=description,
68
  )
 
70
  file_translate = gr.Interface(
71
  fn=speech_to_speech_translation,
72
  inputs=gr.Audio(source="upload", type="filepath"),
73
+ outputs=gr.Audio(label="Generated Speech", type="numpy"),
74
  examples=[["./example.wav"]],
75
  title=title,
76
  description=description,