waysolong committed
Commit 5c87166 · verified · 1 Parent(s): 279cfc3

Create app.py

Files changed (1):
  1. app.py +276 -0
app.py ADDED
@@ -0,0 +1,276 @@
+ #!/usr/bin/env python3
+ #
+ # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # References:
+ # https://gradio.app/docs/#dropdown
+
+ import logging
+ import os
+ import time
+ import uuid
+
+ import gradio as gr
+ import soundfile as sf
+
+ from model import get_pretrained_model, language_to_models
+
+ title = "# Next-gen Kaldi: Text-to-speech (TTS)"
+
+ description = """
+ This space shows how to convert text to speech with Next-gen Kaldi.
+ It is running on CPU within a docker container provided by Hugging Face.
+ See more information by visiting the following links:
+ - <https://github.com/k2-fsa/sherpa-onnx>
+ If you want to deploy it locally, please see
+ <https://k2-fsa.github.io/sherpa/>
+ If you want to use Android APKs, please see
+ <https://k2-fsa.github.io/sherpa/onnx/tts/apk.html>
+ If you want to use Android text-to-speech engine APKs, please see
+ <https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html>
+ If you want to download an all-in-one exe for Windows, please see
+ <https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models>
+ """
+
+ # css style is copied from
+ # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+ css = """
+ .result {display:flex;flex-direction:column}
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
+ """
+
+ examples = [
+     ["Min-nan (闽南话)", "csukuangfj/vits-mms-nan", "ài piaǸ chiah ē iaN̂", 0, 1.0],
+     ["Thai", "csukuangfj/vits-mms-tha", "ฉันรักคุณ", 0, 1.0],
+ ]
+
+
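+ # Callback for the language radio button: repopulate the model dropdown with
+ # the models available for the newly selected language.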
+ def update_model_dropdown(language: str):
+     if language in language_to_models:
+         choices = language_to_models[language]
+         return gr.Dropdown(
+             choices=choices,
+             value=choices[0],
+             interactive=True,
+         )
+
+     raise ValueError(f"Unsupported language: {language}")
+
+
+ def build_html_output(s: str, style: str = "result_item_success"):
+     return f"""
+     <div class='result'>
+         <div class='result_item {style}'>
+             {s}
+         </div>
+     </div>
+     """
+
+
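+ # Main synthesis callback: load the TTS model for the selected repo, generate
+ # audio, report the real-time factor (RTF = processing time / audio duration;
+ # values below 1 mean faster than real time), and save a 16-bit PCM wav file.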
+ def process(language: str, repo_id: str, text: str, sid: str, speed: float):
+     logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
+     sid = int(sid)
+     tts = get_pretrained_model(repo_id, speed)
+
+     start = time.time()
+     audio = tts.generate(text, sid=sid)
+     end = time.time()
+
+     if len(audio.samples) == 0:
+         raise ValueError(
+             "Error in generating audio. Please read previous error messages."
+         )
+
+     duration = len(audio.samples) / audio.sample_rate
+
+     elapsed_seconds = end - start
+     rtf = elapsed_seconds / duration
+
+     info = f"""
+     Wave duration : {duration:.3f} s <br/>
+     Processing time: {elapsed_seconds:.3f} s <br/>
+     RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
+     """
+
+     logging.info(info)
+     logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")
+
+     filename = str(uuid.uuid4())
+     filename = f"{filename}.wav"
+     sf.write(
+         filename,
+         audio.samples,
+         samplerate=audio.sample_rate,
+         subtype="PCM_16",
+     )
+
+     return filename, build_html_output(info)
+
+
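+ # Build the Gradio UI: a language selector, a model dropdown that follows it,
+ # text / speaker-ID / speed inputs, and audio + HTML info outputs.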
+ demo = gr.Blocks(css=css)
+
+
+ with demo:
+     gr.Markdown(title)
+     language_choices = list(language_to_models.keys())
+
+     language_radio = gr.Radio(
+         label="Language",
+         choices=language_choices,
+         value=language_choices[0],
+     )
+
+     model_dropdown = gr.Dropdown(
+         choices=language_to_models[language_choices[0]],
+         label="Select a model",
+         value=language_to_models[language_choices[0]][0],
+     )
+
+     language_radio.change(
+         update_model_dropdown,
+         inputs=language_radio,
+         outputs=model_dropdown,
+     )
+
+     with gr.Tabs():
+         with gr.TabItem("Please input your text"):
+             input_text = gr.Textbox(
+                 label="Input text",
+                 info="Your text",
+                 lines=3,
+                 placeholder="Please input your text here",
+             )
+
+             input_sid = gr.Textbox(
+                 label="Speaker ID",
+                 info="Speaker ID",
+                 lines=1,
+                 max_lines=1,
+                 value="0",
+                 placeholder="Speaker ID. Valid only for multi-speaker models",
+             )
+
+             input_speed = gr.Slider(
+                 minimum=0.1,
+                 maximum=10,
+                 value=1,
+                 step=0.1,
+                 label="Speed (larger->faster; smaller->slower)",
+             )
+
+             input_button = gr.Button("Submit")
+
+             output_audio = gr.Audio(label="Output")
+
+             output_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 fn=process,
+                 inputs=[
+                     language_radio,
+                     model_dropdown,
+                     input_text,
+                     input_sid,
+                     input_speed,
+                 ],
+                 outputs=[
+                     output_audio,
+                     output_info,
+                 ],
+             )
+
+     input_button.click(
+         process,
+         inputs=[
+             language_radio,
+             model_dropdown,
+             input_text,
+             input_sid,
+             input_speed,
+         ],
+         outputs=[
+             output_audio,
+             output_info,
+         ],
+     )
+
+     gr.Markdown(description)
+
+
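+ # Download the espeak-ng data files into /tmp before the app starts; models
+ # that phonemize their input with espeak-ng need this directory.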
+ def download_espeak_ng_data():
+     os.system(
+         """
+         cd /tmp
+         wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+         tar xf espeak-ng-data.tar.bz2
+         """
+     )
+
+
+ if __name__ == "__main__":
+     download_espeak_ng_data()
+     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+     logging.basicConfig(format=formatter, level=logging.INFO)
+
+     demo.launch()
+
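Note on model.py: app.py imports get_pretrained_model and language_to_models from model.py, which is not included in this commit. The code below is only a minimal sketch of the interface app.py assumes, written against sherpa-onnx's offline TTS API; the cache size, the VITS config fields used, and the per-repo file names (model.onnx, tokens.txt) are assumptions, not the Space's actual code.

# model.py -- illustrative sketch only; see the note above.
from functools import lru_cache

import sherpa_onnx
from huggingface_hub import hf_hub_download

# Language shown in the UI -> list of model repos offered in the dropdown.
language_to_models = {
    "Min-nan (闽南话)": ["csukuangfj/vits-mms-nan"],
    "Thai": ["csukuangfj/vits-mms-tha"],
}


@lru_cache(maxsize=10)
def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    # The file names below are assumptions about what each model repo contains.
    model = hf_hub_download(repo_id, "model.onnx")
    tokens = hf_hub_download(repo_id, "tokens.txt")

    config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=model,
                tokens=tokens,
                length_scale=1.0 / speed,  # larger speed -> faster speech
            ),
            num_threads=2,
        )
    )
    return sherpa_onnx.OfflineTts(config)

With a model.py of this shape, process() above receives an object whose generate(text, sid=...) returns audio exposing .samples and .sample_rate, which it then writes out as a 16-bit PCM wav.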