Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,6 +22,8 @@ from mel_processing import spectrogram_torch
|
|
22 |
import translators.server as tss
|
23 |
import psutil
|
24 |
from datetime import datetime
|
|
|
|
|
25 |
|
26 |
def audio_postprocess(self, y):
|
27 |
if y is None:
|
@@ -44,7 +46,7 @@ def audio_postprocess(self, y):
|
|
44 |
gr.Audio.postprocess = audio_postprocess
|
45 |
|
46 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
47 |
-
languages = ['日本語', '简体中文', 'English']
|
48 |
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
|
49 |
'4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
|
50 |
'8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
|
@@ -126,19 +128,73 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, i
|
|
126 |
text = tss.google(text_raw, from_language='zh', to_language='ja')
|
127 |
elif language == 'English':
|
128 |
text = tss.google(text_raw, from_language='en', to_language='ja')
|
|
|
|
|
129 |
char_id = int(character.split(':')[0])
|
130 |
stn_tst = get_text(text, hps, is_symbol)
|
131 |
with torch.no_grad():
|
132 |
x_tst = stn_tst.unsqueeze(0)
|
133 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
134 |
sid = torch.LongTensor([char_id])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
136 |
currentDateAndTime = datetime.now()
|
137 |
print(f"Character {character} inference successful: {text}\n")
|
138 |
if language != '日本語':
|
139 |
print(f"translate from {language}: {text_raw}")
|
140 |
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
141 |
-
return (text,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
download_audio_js = """
|
144 |
() =>{{
|
@@ -173,7 +229,8 @@ if __name__ == "__main__":
|
|
173 |
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
174 |
"This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
|
175 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
176 |
-
"
|
|
|
177 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
178 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
179 |
)
|
@@ -181,7 +238,7 @@ if __name__ == "__main__":
|
|
181 |
with gr.Column():
|
182 |
# We instantiate the Textbox class
|
183 |
textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
|
184 |
-
with gr.Accordion(label="
|
185 |
temp_text_var = gr.Variable()
|
186 |
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
187 |
symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
|
@@ -226,9 +283,23 @@ if __name__ == "__main__":
|
|
226 |
text_output = gr.Textbox(label="Output Text")
|
227 |
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
228 |
btn = gr.Button("Generate!")
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
download = gr.Button("Download Audio")
|
233 |
download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
|
234 |
examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
|
@@ -246,16 +317,24 @@ if __name__ == "__main__":
|
|
246 |
fn=infer
|
247 |
)
|
248 |
gr.Markdown("# Updates Logs 更新日志:\n\n"
|
|
|
|
|
|
|
249 |
"2023/1/13:\n\n"
|
250 |
"增加了音素输入的example(米浴喘气)\n\n"
|
|
|
251 |
"2023/1/12:\n\n"
|
252 |
"增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
|
|
|
253 |
"调整了UI的布局。\n\n"
|
|
|
254 |
"2023/1/10:\n\n"
|
255 |
"数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
|
|
|
256 |
"2023/1/9:\n\n"
|
257 |
-
"人物全是特别周的bug已修复,对此带来的不便感到十分抱歉。\n\n"
|
258 |
"模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
|
|
|
259 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
|
|
260 |
)
|
261 |
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|
|
|
22 |
import translators.server as tss
|
23 |
import psutil
|
24 |
from datetime import datetime
|
25 |
+
import romajitable
|
26 |
+
from text.cleaners import japanese_cleaners
|
27 |
|
28 |
def audio_postprocess(self, y):
|
29 |
if y is None:
|
|
|
46 |
gr.Audio.postprocess = audio_postprocess
|
47 |
|
48 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
49 |
+
languages = ['日本語', '简体中文', 'English', 'English2Katakana']
|
50 |
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
|
51 |
'4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
|
52 |
'8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
|
|
|
128 |
text = tss.google(text_raw, from_language='zh', to_language='ja')
|
129 |
elif language == 'English':
|
130 |
text = tss.google(text_raw, from_language='en', to_language='ja')
|
131 |
+
elif language == "English2Katakana":
|
132 |
+
text = romajitable.to_kana(text_raw).katakana
|
133 |
char_id = int(character.split(':')[0])
|
134 |
stn_tst = get_text(text, hps, is_symbol)
|
135 |
with torch.no_grad():
|
136 |
x_tst = stn_tst.unsqueeze(0)
|
137 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
138 |
sid = torch.LongTensor([char_id])
|
139 |
+
jp2phoneme = japanese_cleaners(text)
|
140 |
+
durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
141 |
+
noise_scale_w=noise_scale_w, length_scale=duration)
|
142 |
+
char_dur_list = []
|
143 |
+
for i, char in enumerate(jp2phoneme):
|
144 |
+
char_pos = i * 2 + 1
|
145 |
+
char_dur = durations[char_pos]
|
146 |
+
char_dur_list.append(char_dur)
|
147 |
+
char_spacing_dur_list = []
|
148 |
+
char_spacings = []
|
149 |
+
for i in range(len(durations)):
|
150 |
+
if i % 2 == 0: # spacing
|
151 |
+
char_spacings.append("spacing")
|
152 |
+
elif i % 2 == 1: # char
|
153 |
+
char_spacings.append(jp2phoneme[int((i - 1) / 2)])
|
154 |
+
char_spacing_dur_list.append(int(durations[i]))
|
155 |
+
# convert duration information to string
|
156 |
+
duration_info_str = ""
|
157 |
+
for i in range(len(char_spacings)):
|
158 |
+
if char_spacings[i] == "spacing":
|
159 |
+
duration_info_str += str(char_spacing_dur_list[i])
|
160 |
+
else:
|
161 |
+
duration_info_str += "{" + char_spacings[i] + ":" + str(char_spacing_dur_list[i]) + "}"
|
162 |
+
if i != len(char_spacings)-1:
|
163 |
+
duration_info_str += ", "
|
164 |
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
165 |
currentDateAndTime = datetime.now()
|
166 |
print(f"Character {character} inference successful: {text}\n")
|
167 |
if language != '日本語':
|
168 |
print(f"translate from {language}: {text_raw}")
|
169 |
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
170 |
+
return (text,(22050, audio), jp2phoneme, duration_info_str)
|
171 |
+
|
172 |
+
def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
|
173 |
+
try:
|
174 |
+
phonemes = duration_info_str.split(", ")
|
175 |
+
recons_durs = []
|
176 |
+
recons_phonemes = ""
|
177 |
+
for item in phonemes:
|
178 |
+
if "{" not in item: # spacing
|
179 |
+
recons_durs.append(int(item))
|
180 |
+
else:
|
181 |
+
recons_phonemes += item.strip("{}").split(":")[0]
|
182 |
+
recons_durs.append(int(item.strip("{}").split(":")[1]))
|
183 |
+
except ValueError:
|
184 |
+
return ("Error: Format must not be changed!", None)
|
185 |
+
except AssertionError:
|
186 |
+
return ("Error: Format must not be changed!", None)
|
187 |
+
char_id = int(character.split(':')[0])
|
188 |
+
stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
|
189 |
+
with torch.no_grad():
|
190 |
+
x_tst = stn_tst.unsqueeze(0)
|
191 |
+
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
192 |
+
sid = torch.LongTensor([char_id])
|
193 |
+
print(len(recons_durs))
|
194 |
+
print(x_tst.shape[1])
|
195 |
+
audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
|
196 |
+
length_scale=duration)[0][0, 0].data.cpu().float().numpy()
|
197 |
+
return (recons_phonemes, (22050, audio))
|
198 |
|
199 |
download_audio_js = """
|
200 |
() =>{{
|
|
|
229 |
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
230 |
"This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
|
231 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
232 |
+
"If you have any suggestions or bug reports, feel free to open discussion in Community.\n\n"
|
233 |
+
"若有bug反馈或建议,请在Community下开启一个新的Discussion。 \n\n"
|
234 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
235 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
236 |
)
|
|
|
238 |
with gr.Column():
|
239 |
# We instantiate the Textbox class
|
240 |
textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
|
241 |
+
with gr.Accordion(label="Phoneme Input", open=False):
|
242 |
temp_text_var = gr.Variable()
|
243 |
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
244 |
symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
|
|
|
283 |
text_output = gr.Textbox(label="Output Text")
|
284 |
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
285 |
btn = gr.Button("Generate!")
|
286 |
+
with gr.Accordion(label="Speaking Pace Control", open=True):
|
287 |
+
phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
|
288 |
+
duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here. You can edit phoneme durations here and click regenerate for more precise control.",
|
289 |
+
interactive = True)
|
290 |
+
gr.Markdown(
|
291 |
+
"\{ \}内的数字代表每个音素在生成的音频中的长度,\{ \}外的数字代表音素之间间隔的长度。"
|
292 |
+
"您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
|
293 |
+
"注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
|
294 |
+
"The numbers inside \{ \} represent the length for each phoneme in the generated audio, while the numbers out of \{ \} represent the length of spacings between phonemes."
|
295 |
+
"You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
|
296 |
+
"Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
|
297 |
+
)
|
298 |
+
cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
|
299 |
+
btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
|
300 |
+
outputs=[text_output, audio_output, phoneme_output, duration_output])
|
301 |
+
cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
|
302 |
+
outputs=[phoneme_output, audio_output])
|
303 |
download = gr.Button("Download Audio")
|
304 |
download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
|
305 |
examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
|
|
|
317 |
fn=infer
|
318 |
)
|
319 |
gr.Markdown("# Updates Logs 更新日志:\n\n"
|
320 |
+
"2023/1/24:\n\n"
|
321 |
+
"增加了对说话节奏的音素级控制。\n\n"
|
322 |
+
"Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
|
323 |
"2023/1/13:\n\n"
|
324 |
"增加了音素输入的example(米浴喘气)\n\n"
|
325 |
+
"Added one example of phoneme input.\n\n"
|
326 |
"2023/1/12:\n\n"
|
327 |
"增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
|
328 |
+
"Added phoneme input, which enables more precise control on output audio.\n\n"
|
329 |
"调整了UI的布局。\n\n"
|
330 |
+
"Adjusted UI arrangements.\n\n"
|
331 |
"2023/1/10:\n\n"
|
332 |
"数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
|
333 |
+
"Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
|
334 |
"2023/1/9:\n\n"
|
|
|
335 |
"模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
|
336 |
+
"Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
|
337 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
338 |
+
"Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
|
339 |
)
|
340 |
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|