Jekyll2000 committed on
Commit
04815c5
·
verified ·
1 Parent(s): fc085cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -311
app.py CHANGED
@@ -1,340 +1,204 @@
1
- from functools import lru_cache
 
 
 
2
 
3
- import sherpa_onnx
4
- from huggingface_hub import hf_hub_download
5
 
 
6
 
7
- def get_file(
8
- repo_id: str,
9
- filename: str,
10
- subfolder: str = ".",
11
- ) -> str:
12
- model_filename = hf_hub_download(
13
- repo_id=repo_id,
14
- filename=filename,
15
- subfolder=subfolder,
16
- )
17
- return model_filename
18
 
 
 
19
 
20
- @lru_cache(maxsize=10)
21
- def _get_vits_vctk(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
22
- assert repo_id == "csukuangfj/vits-vctk"
23
 
24
- model = get_file(
25
- repo_id=repo_id,
26
- filename="vits-vctk.onnx",
27
- subfolder=".",
28
- )
29
 
30
- lexicon = get_file(
31
- repo_id=repo_id,
32
- filename="lexicon.txt",
33
- subfolder=".",
34
- )
 
 
 
 
 
 
 
 
35
 
36
- tokens = get_file(
37
- repo_id=repo_id,
38
- filename="tokens.txt",
39
- subfolder=".",
40
- )
41
 
42
- tts_config = sherpa_onnx.OfflineTtsConfig(
43
- model=sherpa_onnx.OfflineTtsModelConfig(
44
- vits=sherpa_onnx.OfflineTtsVitsModelConfig(
45
- model=model,
46
- lexicon=lexicon,
47
- tokens=tokens,
48
- length_scale=1.0 / speed,
49
- ),
50
- provider="cpu",
51
- debug=True,
52
- num_threads=2,
53
  )
54
- )
55
- tts = sherpa_onnx.OfflineTts(tts_config)
56
 
57
- return tts
58
 
59
 
60
- @lru_cache(maxsize=10)
61
- def _get_vits_ljs(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
62
- assert repo_id == "csukuangfj/vits-ljs"
 
 
 
 
 
63
 
64
- model = get_file(
65
- repo_id=repo_id,
66
- filename="vits-ljs.onnx",
67
- subfolder=".",
68
- )
69
 
70
- lexicon = get_file(
71
- repo_id=repo_id,
72
- filename="lexicon.txt",
73
- subfolder=".",
74
- )
75
 
76
- tokens = get_file(
77
- repo_id=repo_id,
78
- filename="tokens.txt",
79
- subfolder=".",
80
- )
81
 
82
- tts_config = sherpa_onnx.OfflineTtsConfig(
83
- model=sherpa_onnx.OfflineTtsModelConfig(
84
- vits=sherpa_onnx.OfflineTtsVitsModelConfig(
85
- model=model,
86
- lexicon=lexicon,
87
- tokens=tokens,
88
- length_scale=1.0 / speed,
89
- ),
90
- provider="cpu",
91
- debug=True,
92
- num_threads=2,
93
  )
94
- )
95
- tts = sherpa_onnx.OfflineTts(tts_config)
96
-
97
- return tts
98
-
99
-
100
- @lru_cache(maxsize=10)
101
- def _get_vits_piper(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
102
- data_dir = "/tmp/espeak-ng-data"
103
- if "coqui" in repo_id or "vits-mms" in repo_id:
104
- name = "model"
105
- elif "piper" in repo_id:
106
- n = len("vits-piper-")
107
- name = repo_id.split("/")[1][n:]
108
- elif "mimic3" in repo_id:
109
- n = len("vits-mimic3-")
110
- name = repo_id.split("/")[1][n:]
111
- else:
112
- raise ValueError(f"Unsupported {repo_id}")
113
-
114
- if "vits-coqui-uk-mai" in repo_id or "vits-mms" in repo_id:
115
- data_dir = ""
116
-
117
- model = get_file(
118
- repo_id=repo_id,
119
- filename=f"{name}.onnx",
120
- subfolder=".",
121
- )
122
 
123
- tokens = get_file(
124
- repo_id=repo_id,
125
- filename="tokens.txt",
126
- subfolder=".",
127
- )
128
 
129
- tts_config = sherpa_onnx.OfflineTtsConfig(
130
- model=sherpa_onnx.OfflineTtsModelConfig(
131
- vits=sherpa_onnx.OfflineTtsVitsModelConfig(
132
- model=model,
133
- lexicon="",
134
- data_dir=data_dir,
135
- tokens=tokens,
136
- length_scale=1.0 / speed,
137
- ),
138
- provider="cpu",
139
- debug=True,
140
- num_threads=2,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  )
142
- )
143
- tts = sherpa_onnx.OfflineTts(tts_config)
144
 
145
- return tts
146
 
147
 
148
- @lru_cache(maxsize=10)
149
- def _get_vits_mms(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
150
- return _get_vits_piper(repo_id, speed)
151
-
152
-
153
- @lru_cache(maxsize=10)
154
- def _get_vits_zh_aishell3(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
155
- assert repo_id == "csukuangfj/vits-zh-aishell3"
156
-
157
- model = get_file(
158
- repo_id=repo_id,
159
- filename="vits-aishell3.onnx",
160
- subfolder=".",
161
  )
162
 
163
- lexicon = get_file(
164
- repo_id=repo_id,
165
- filename="lexicon.txt",
166
- subfolder=".",
167
- )
168
 
169
- tokens = get_file(
170
- repo_id=repo_id,
171
- filename="tokens.txt",
172
- subfolder=".",
173
- )
174
-
175
- rule_fst = get_file(
176
- repo_id=repo_id,
177
- filename="rule.fst",
178
- subfolder=".",
179
- )
180
-
181
- tts_config = sherpa_onnx.OfflineTtsConfig(
182
- model=sherpa_onnx.OfflineTtsModelConfig(
183
- vits=sherpa_onnx.OfflineTtsVitsModelConfig(
184
- model=model,
185
- lexicon=lexicon,
186
- tokens=tokens,
187
- length_scale=1.0 / speed,
188
- ),
189
- provider="cpu",
190
- debug=True,
191
- num_threads=2,
192
- ),
193
- rule_fsts=rule_fst,
194
- )
195
- tts = sherpa_onnx.OfflineTts(tts_config)
196
-
197
- return tts
198
-
199
-
200
- @lru_cache(maxsize=10)
201
- def _get_vits_hf(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
202
- if "fanchen" in repo_id or "vits-cantonese-hf-xiaomaiiwn" in repo_id:
203
- model = repo_id.split("/")[-1]
204
- else:
205
- model = repo_id.split("-")[-1]
206
 
207
- model = get_file(
208
- repo_id=repo_id,
209
- filename=f"{model}.onnx",
210
- subfolder=".",
211
- )
212
-
213
- lexicon = get_file(
214
- repo_id=repo_id,
215
- filename="lexicon.txt",
216
- subfolder=".",
217
- )
218
-
219
- tokens = get_file(
220
- repo_id=repo_id,
221
- filename="tokens.txt",
222
- subfolder=".",
223
- )
224
-
225
- rule_fst = get_file(
226
- repo_id=repo_id,
227
- filename="rule.fst",
228
- subfolder=".",
229
- )
230
 
231
- tts_config = sherpa_onnx.OfflineTtsConfig(
232
- model=sherpa_onnx.OfflineTtsModelConfig(
233
- vits=sherpa_onnx.OfflineTtsVitsModelConfig(
234
- model=model,
235
- lexicon=lexicon,
236
- tokens=tokens,
237
- length_scale=1.0 / speed,
238
- ),
239
- provider="cpu",
240
- debug=True,
241
- num_threads=2,
242
- ),
243
- rule_fsts=rule_fst,
244
- )
245
- tts = sherpa_onnx.OfflineTts(tts_config)
246
-
247
- return tts
248
-
249
-
250
- @lru_cache(maxsize=10)
251
- def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
252
- if repo_id in english_models:
253
- return english_models[repo_id](repo_id, speed)
254
- elif repo_id in arabic_models:
255
- return arabic_models[repo_id](repo_id, speed)
256
- elif repo_id in turkish_models:
257
- return turkish_models[repo_id](repo_id, speed)
258
- elif repo_id in persian_models:
259
- return persian_models[repo_id](repo_id, speed)
260
- elif repo_id in hindi_models:
261
- return hindi_models[repo_id](repo_id, speed)
262
- elif repo_id in gujarati_models:
263
- return gujarati_models[repo_id](repo_id, speed)
264
- else:
265
- raise ValueError(f"Unsupported repo_id: {repo_id}")
266
-
267
-
268
- english_models = {
269
- "csukuangfj/vits-piper-en_US-glados": _get_vits_piper,
270
- # coqui-ai
271
- "csukuangfj/vits-coqui-en-ljspeech": _get_vits_piper,
272
- "csukuangfj/vits-coqui-en-ljspeech-neon": _get_vits_piper,
273
- "csukuangfj/vits-coqui-en-vctk": _get_vits_piper,
274
- # piper, US
275
- "csukuangfj/vits-piper-en_GB-sweetbbak-amy": _get_vits_piper,
276
- "csukuangfj/vits-piper-en_US-amy-low": _get_vits_piper,
277
- "csukuangfj/vits-piper-en_US-amy-medium": _get_vits_piper,
278
- "csukuangfj/vits-piper-en_US-arctic-medium": _get_vits_piper, # 18 speakers
279
- "csukuangfj/vits-piper-en_US-danny-low": _get_vits_piper,
280
- "csukuangfj/vits-piper-en_US-hfc_male-medium": _get_vits_piper,
281
- "csukuangfj/vits-piper-en_US-joe-medium": _get_vits_piper,
282
- "csukuangfj/vits-piper-en_US-kathleen-low": _get_vits_piper,
283
- "csukuangfj/vits-piper-en_US-kusal-medium": _get_vits_piper,
284
- "csukuangfj/vits-piper-en_US-l2arctic-medium": _get_vits_piper, # 24 speakers
285
- "csukuangfj/vits-piper-en_US-lessac-low": _get_vits_piper,
286
- "csukuangfj/vits-piper-en_US-lessac-medium": _get_vits_piper,
287
- "csukuangfj/vits-piper-en_US-lessac-high": _get_vits_piper,
288
- "csukuangfj/vits-piper-en_US-libritts-high": _get_vits_piper, # 904 speakers
289
- "csukuangfj/vits-piper-en_US-libritts_r-medium": _get_vits_piper, # 904 speakers
290
- "csukuangfj/vits-piper-en_US-ryan-low": _get_vits_piper,
291
- "csukuangfj/vits-piper-en_US-ryan-medium": _get_vits_piper,
292
- "csukuangfj/vits-piper-en_US-ryan-high": _get_vits_piper,
293
- # piper, GB
294
- "csukuangfj/vits-piper-en_GB-alan-low": _get_vits_piper,
295
- "csukuangfj/vits-piper-en_GB-alan-medium": _get_vits_piper,
296
- "csukuangfj/vits-piper-en_GB-alba-medium": _get_vits_piper,
297
- "csukuangfj/vits-piper-en_GB-jenny_dioco-medium": _get_vits_piper,
298
- "csukuangfj/vits-piper-en_GB-northern_english_male-medium": _get_vits_piper,
299
- "csukuangfj/vits-piper-en_GB-semaine-medium": _get_vits_piper,
300
- "csukuangfj/vits-piper-en_GB-southern_english_female-low": _get_vits_piper,
301
- "csukuangfj/vits-piper-en_GB-vctk-medium": _get_vits_piper,
302
- #
303
- "csukuangfj/vits-vctk": _get_vits_vctk, # 109 speakers
304
- "csukuangfj/vits-ljs": _get_vits_ljs,
305
- }
306
-
307
-
308
- arabic_models = {
309
- "csukuangfj/vits-piper-ar_JO-kareem-low": _get_vits_piper,
310
- "csukuangfj/vits-piper-ar_JO-kareem-medium": _get_vits_piper,
311
- }
312
-
313
- turkish_models = {
314
- "csukuangfj/vits-piper-tr_TR-dfki-medium": _get_vits_piper,
315
- "csukuangfj/vits-piper-tr_TR-fahrettin-medium": _get_vits_piper,
316
- }
317
-
318
-
319
- persian_models = {
320
- "csukuangfj/vits-piper-fa_IR-amir-medium": _get_vits_piper,
321
- "csukuangfj/vits-piper-fa_IR-gyro-medium": _get_vits_piper,
322
- "csukuangfj/vits-mimic3-fa-haaniye_low": _get_vits_piper,
323
- }
324
-
325
- gujarati_models = {
326
- "csukuangfj/vits-mimic3-gu_IN-cmu-indic_low": _get_vits_piper,
327
- }
328
- hindi_models = {
329
- "vosk-model-hi-0.22": _get_vits_piper,
330
- }
331
-
332
-
333
- language_to_models = {
334
- "English": list(english_models.keys()),
335
- "Arabic": list(arabic_models.keys()),
336
- "Hindi": list(hindi_models.keys()),
337
- "Gujarati": list(gujarati_models.keys()),
338
- "Persian": list(persian_models.keys()),
339
- "Turkish": list(turkish_models.keys()),
340
- }
 
1
+ import logging
2
+ import os
3
+ import time
4
+ import uuid
5
 
6
+ import gradio as gr
7
+ import soundfile as sf
8
 
9
+ from model import get_pretrained_model, language_to_models
10
 
11
+ title = "Text-to-speech (TTS)-by-Haseeb Ahmed"
 
 
 
 
 
 
 
 
 
 
12
 
13
+ description = """
14
+ This space shows how to convert text to speech with Next-gen Kaldi.
15
 
16
+ It is running on CPU within a docker container provided by Hugging Face.
 
 
17
 
18
+ """
 
 
 
 
19
 
20
+ # css style is copied from
21
+ # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
22
+ css = """
23
+ .result {display:flex;flex-direction:column}
24
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
25
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
26
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
27
+ """
28
+
29
+ #examples = [
30
+ # ["Min-nan (闽南话)", "csukuangfj/vits-mms-nan", "ài piaǸ chiah ē iaN̂", 0, 1.0],
31
+ # ["Thai", "csukuangfj/vits-mms-tha", "ฉันรักคุณ", 0, 1.0],
32
+ #]
33
 
 
 
 
 
 
34
 
35
+ def update_model_dropdown(language: str):
36
+ if language in language_to_models:
37
+ choices = language_to_models[language]
38
+ return gr.Dropdown(
39
+ choices=choices,
40
+ value=choices[0],
41
+ interactive=True,
 
 
 
 
42
  )
 
 
43
 
44
+ raise ValueError(f"Unsupported language: {language}")
45
 
46
 
47
+ def build_html_output(s: str, style: str = "result_item_success"):
48
+ return f"""
49
+ <div class='result'>
50
+ <div class='result_item {style}'>
51
+ {s}
52
+ </div>
53
+ </div>
54
+ """
55
 
 
 
 
 
 
56
 
57
+ def process(language: str, repo_id: str, text: str, sid: str, speed: float):
58
+ logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
59
+ sid = int(sid)
60
+ tts = get_pretrained_model(repo_id, speed)
 
61
 
62
+ start = time.time()
63
+ audio = tts.generate(text, sid=sid)
64
+ end = time.time()
 
 
65
 
66
+ if len(audio.samples) == 0:
67
+ raise ValueError(
68
+ "Error in generating audios. Please read previous error messages."
 
 
 
 
 
 
 
 
69
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ duration = len(audio.samples) / audio.sample_rate
 
 
 
 
72
 
73
+ elapsed_seconds = end - start
74
+ rtf = elapsed_seconds / duration
75
+
76
+ info = f"""
77
+ Wave duration : {duration:.3f} s <br/>
78
+ Processing time: {elapsed_seconds:.3f} s <br/>
79
+ RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
80
+ """
81
+
82
+ logging.info(info)
83
+ logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")
84
+
85
+ filename = str(uuid.uuid4())
86
+ filename = f"{filename}.wav"
87
+ sf.write(
88
+ filename,
89
+ audio.samples,
90
+ samplerate=audio.sample_rate,
91
+ subtype="PCM_16",
92
+ )
93
+
94
+ return filename, build_html_output(info)
95
+
96
+
97
+ demo = gr.Blocks(css=css)
98
+
99
+
100
+ with demo:
101
+ gr.Markdown(title)
102
+ language_choices = list(language_to_models.keys())
103
+
104
+ language_radio = gr.Radio(
105
+ label="Language",
106
+ choices=language_choices,
107
+ value=language_choices[0],
108
+ )
109
+
110
+ model_dropdown = gr.Dropdown(
111
+ choices=language_to_models[language_choices[0]],
112
+ label="Select a model",
113
+ value=language_to_models[language_choices[0]][0],
114
+ )
115
+
116
+ language_radio.change(
117
+ update_model_dropdown,
118
+ inputs=language_radio,
119
+ outputs=model_dropdown,
120
+ )
121
+
122
+ with gr.Tabs():
123
+ with gr.TabItem("Please input your text"):
124
+ input_text = gr.Textbox(
125
+ label="Input text",
126
+ info="Your text",
127
+ lines=3,
128
+ placeholder="Please input your text here",
129
+ )
130
+
131
+ # input_sid = gr.Textbox(
132
+ # label="Speaker ID",
133
+ # info="Speaker ID",
134
+ # lines=1,
135
+ # max_lines=1,
136
+ # value="0",
137
+ # placeholder="Speaker ID. Valid only for mult-speaker model",
138
+ #)
139
+
140
+ # input_speed = gr.Slider(
141
+ # minimum=0.1,
142
+ # maximum=10,
143
+ # value=1,
144
+ # step=0.1,
145
+ #label="Speed (larger->faster; smaller->slower)",
146
+ #)
147
+
148
+ input_button = gr.Button("Submit")
149
+
150
+ output_audio = gr.Audio(label="Output")
151
+
152
+ output_info = gr.HTML(label="Info")
153
+
154
+ #gr.Examples(
155
+ # examples=examples,
156
+ # fn=process,
157
+ # inputs=[
158
+ # language_radio,
159
+ # model_dropdown,
160
+ # input_text,
161
+ # input_sid,
162
+ # input_speed,
163
+ # ],
164
+ # outputs=[
165
+ # output_audio,
166
+ # output_info,
167
+ # ],
168
+ # )
169
+
170
+ input_button.click(
171
+ process,
172
+ inputs=[
173
+ language_radio,
174
+ model_dropdown,
175
+ input_text,
176
+ #input_sid,
177
+ #input_speed,
178
+ ],
179
+ outputs=[
180
+ output_audio,
181
+ output_info,
182
+ ],
183
  )
 
 
184
 
185
+ gr.Markdown(description)
186
 
187
 
188
+ def download_espeak_ng_data():
189
+ os.system(
190
+ """
191
+ cd /tmp
192
+ wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
193
+ tar xf espeak-ng-data.tar.bz2
194
+ """
 
 
 
 
 
 
195
  )
196
 
 
 
 
 
 
197
 
198
+ if __name__ == "__main__":
199
+ download_espeak_ng_data()
200
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
+ logging.basicConfig(format=formatter, level=logging.INFO)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
+ demo.launch()