File size: 20,835 Bytes
a956529 19a8f04 08a99f7 19a8f04 210468c 19a8f04 210468c 19a8f04 210468c 19a8f04 210468c 19a8f04 0c80792 eac9d77 887727d 39c9cf7 887727d 19a8f04 887727d 19a8f04 887727d dffc7d4 a36cbf2 b2d8161 27e967e d83b987 d65174e d83b987 27e967e e7373fa 210468c e7373fa 210468c e7373fa 08a99f7 210468c 63d9e41 210468c 08a99f7 d82c773 e7373fa b2d8161 40b5ae0 4ff2b18 b2d8161 40b5ae0 4ff2b18 dffc7d4 e7373fa dffc7d4 40b5ae0 4ff2b18 b2d8161 40b5ae0 4ff2b18 e7373fa 40b5ae0 210468c 40b5ae0 210468c 40b5ae0 210468c e7373fa 210468c 9b9ae6e d8c03e3 5d2aa5c d83b987 33fa82e 5d2aa5c 4ff2b18 39c9cf7 b89ec7e 39c9cf7 b89ec7e 08a99f7 887727d 4c68682 40b5ae0 39c9cf7 b89ec7e 39c9cf7 b89ec7e 39c9cf7 4c68682 39c9cf7 210468c 19a8f04 27e967e 74242f2 27e967e ac19060 27e967e 08a99f7 b89ec7e 27e967e e7373fa 39c9cf7 e7373fa b33cb65 e7373fa b33cb65 e7373fa b33cb65 e7373fa 27e967e b2d8161 27e967e e7373fa 27e967e 6edba4c 19a8f04 5051298 f5b3018 01c8f17 f5b3018 39c9cf7 6edba4c f5b3018 39c9cf7 7916677 39c9cf7 b89ec7e 39c9cf7 01c8f17 f5b3018 5876035 e7373fa 5876035 27e967e e7373fa 27e967e 5b0b194 39c9cf7 b89ec7e 39c9cf7 b89ec7e 5876035 39c9cf7 b89ec7e e7373fa 39c9cf7 b89ec7e 39c9cf7 b89ec7e ac19060 39c9cf7 b89ec7e 5b0b194 39c9cf7 b89ec7e 39c9cf7 b89ec7e 5b0b194 74242f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 |
import argparse
import json
import os
import re
import tempfile
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import ONNXVITS_infer
import librosa
import numpy as np
import torch
from torch import no_grad, LongTensor
import commons
import utils
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
from text.symbols import symbols
from mel_processing import spectrogram_torch
import translators.server as tss
import psutil
from datetime import datetime
from text.cleaners import japanese_cleaners
def audio_postprocess(self, y):
    """Convert an audio value into the base64 payload sent to the browser.

    Replacement for ``gr.Audio.postprocess``.

    Args:
        y: ``None``, a URL, a ``(sample_rate, data)`` tuple, or any other
            value accepted by ``create_tmp_copy_of_file``.

    Returns:
        ``None`` when ``y`` is ``None``; otherwise the result of
        ``encode_url_or_file_to_base64`` for a temporary file created in
        ``self.temp_dir``.
    """
    if y is None:
        return None

    if gr_utils.validate_url(y):
        # Remote audio: fetch it into the component's temp directory.
        out_file = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
    elif not isinstance(y, tuple):
        # Anything else is handed to gradio to copy into the temp directory.
        out_file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)
    else:
        # Raw samples: write them to a fresh .wav file that is kept on disk.
        rate, samples = y
        out_file = tempfile.NamedTemporaryFile(
            suffix=".wav", dir=self.temp_dir, delete=False
        )
        gr_processing_utils.audio_to_file(rate, samples, out_file.name)

    return gr_processing_utils.encode_url_or_file_to_base64(out_file.name)
# Monkeypatch: every gr.Audio component now post-processes its value with
# audio_postprocess defined above.
gr.Audio.postprocess = audio_postprocess
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
# Input languages offered in the UI; infer() rejects any value not listed here.
languages = ['日本語', '简体中文', 'English']
# Selectable voices, each written as "<id>:<name>". infer() parses the integer
# before the colon and passes it to the model as the speaker id.
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
    '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
    '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
    '12:目白麦昆', '13:神鹰', '14:好歌剧', '15:成田白仁',
    '16:鲁道夫象征', '17:气槽', '18:爱丽数码', '19:青云天空',
    '20:玉藻十字', '21:美妙姿势', '22:琵琶晨光', '23:重炮',
    '24:曼城茶座', '25:美普波旁', '26:目白雷恩', '27:菱曙',
    '28:雪之美人', '29:米浴', '30:艾尼斯风神', '31:爱丽速子',
    '32:爱慕织姬', '33:稻荷一', '34:胜利奖券', '35:空中神宫',
    '36:荣进闪耀', '37:真机伶', '38:川上公主', '39:黄金城市',
    '40:樱花进王', '41:采珠', '42:新光风', '43:东商变革',
    '44:超级小溪', '45:醒目飞鹰', '46:荒漠英雄', '47:东瀛佐敦',
    '48:中山庆典', '49:成田大进', '50:西野花', '51:春乌拉拉',
    '52:青竹回忆', '53:微光飞驹', '54:美丽周日', '55:待兼福来',
    '56:Mr.C.B', '57:名将怒涛', '58:目白多伯', '59:优秀素质',
    '60:帝王光环', '61:待兼诗歌剧', '62:生野狄杜斯', '63:目白善信',
    '64:大拓太阳神', '65:双涡轮', '66:里见光钻', '67:北部玄驹',
    '68:樱花千代王', '69:天狼星象征', '70:目白阿尔丹', '71:八重无敌',
    '72:鹤丸刚志', '73:目白光明', '74:樱花桂冠', '75:成田路',
    '76:也文摄辉', '77:吉兆', '78:谷野美酒', '79:第一红宝石',
    '80:真弓快车', '81:骏川手纲', '82:凯斯奇迹', '83:小林历奇',
    '84:北港火山', '85:奇锐骏', '86:秋川理事长']
def show_memory_info(hint):
    """Print the current process's RSS memory in MB, prefixed with ``hint``."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    memory = rss_bytes / 1024.0 / 1024  # bytes -> MB
    print("{} 内存占用: {} MB".format(hint, memory))
def text_to_phoneme(text, symbols, is_symbol):
    """Return the characters of ``text`` that belong to ``symbols``.

    Args:
        text: Input text. When ``is_symbol`` is false it is first passed
            through ``japanese_cleaners``.
        symbols: Iterable of the symbols to keep.
        is_symbol: True when ``text`` is already a symbol string and must
            not be cleaned.

    Returns:
        A string made of the kept characters, in their original order.
        Characters not found in ``symbols`` are dropped silently.
    """
    known_symbols = set(symbols)
    clean_text = text if is_symbol else japanese_cleaners(text)
    return "".join(ch for ch in clean_text if ch in known_symbols)
def get_text(text, hps, is_symbol):
    """Encode ``text`` as a ``LongTensor`` of symbol ids.

    No cleaners are applied when ``is_symbol`` is true; otherwise the
    cleaners named in ``hps.data.text_cleaners`` are used. When
    ``hps.data.add_blank`` is set, the ids are interspersed with 0.
    """
    cleaner_names = [] if is_symbol else hps.data.text_cleaners
    symbol_ids = text_to_sequence(text, hps.symbols, cleaner_names)
    if hps.data.add_blank:
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return LongTensor(symbol_ids)
# Load the hyper-parameters from the uma87 config file.
hps = utils.get_hparams_from_file("./configs/uma87.json")
# Rebind the module-level name `symbols` (imported from text.symbols at the top
# of the file) to the symbol table carried by the loaded config.
symbols = hps.symbols
# Build the synthesizer; its sizes are derived from the loaded config.
net_g = ONNXVITS_infer.SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
# Switch to eval mode and load the checkpoint weights; the return values of
# both calls are not needed.
_ = net_g.eval()
_ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g)
def to_symbol_fn(is_symbol_input, input_text, temp_text):
    """Compute the new (textbox value, stashed text) pair for the symbol toggle.

    When symbol input is on, the textbox receives the cleaned form of
    ``input_text`` and the raw ``input_text`` is stashed. When it is off,
    the stashed ``temp_text`` is returned for both positions.
    """
    if not is_symbol_input:
        return (temp_text, temp_text)
    cleaned = _clean_text(input_text, hps.data.text_cleaners)
    return (cleaned, input_text)
def _format_duration_info(phonemes, durations):
    """Render durations as the editable ``(s), p:d(s), p:d(s)`` string.

    Even positions of ``durations`` are spacings and odd positions belong to
    ``phonemes[(i - 1) // 2]``. The final entry is always written as a bare
    ``(d)`` with no trailing separator.
    """
    last = len(durations) - 1
    pieces = []
    for i, raw in enumerate(durations):
        is_phoneme = i % 2 == 1
        # Look the phoneme up for every odd slot so a mismatch between the
        # two sequences still surfaces as an IndexError.
        label = phonemes[(i - 1) // 2] if is_phoneme else None
        dur = int(raw)
        if i == last:
            pieces.append(f"({dur})")
        elif is_phoneme:
            pieces.append(f"{label}:{dur}")
        else:
            pieces.append(f"({dur}), ")
    return "".join(pieces)


def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, is_symbol):
    """Synthesize speech for ``text_raw`` in the voice of ``character``.

    Args:
        text_raw: Text typed by the user, or a symbol string when
            ``is_symbol`` is true.
        character: One of the entries of ``characters`` ("<id>:<name>").
        language: One of the entries of ``languages``. Non-Japanese text is
            translated to Japanese with ``tss.google`` unless ``is_symbol``.
        duration: Forwarded to the model as ``length_scale``.
        noise_scale: Forwarded to the model unchanged.
        noise_scale_w: Forwarded to the model unchanged.
        is_symbol: True when ``text_raw`` is already a symbol string.

    Returns:
        On success ``(text, (22050, audio), phonemes, duration_info)``;
        on refusal ``(error_message, None, None, None)``.
    """
    # Check the character and language parameters.
    if language not in languages:
        print("Error: No such language\n")
        return "Error: No such language", None, None, None
    if character not in characters:
        print("Error: No such character\n")
        return "Error: No such character", None, None, None

    # Check the text length (only enforced when the limitation flag is set).
    if limitation:
        if is_symbol:
            text_len = len(text_raw)
        else:
            # Two-letter tags such as "[JA]" do not count towards the limit.
            text_len = len(re.sub(r"\[([A-Z]{2})\]", "", text_raw))
        max_len = 450 if is_symbol else 150
        if text_len > max_len:
            print(f"Refused: Text too long ({text_len}).")
            return "Error: Text is too long", None, None, None
        if text_len == 0:
            print("Refused: Text length is zero.")
            return "Error: Please input text!", None, None, None

    if is_symbol or language == '日本語':
        text = text_raw
    elif language == '简体中文':
        text = tss.google(text_raw, from_language='zh', to_language='ja')
    elif language == 'English':
        text = tss.google(text_raw, from_language='en', to_language='ja')

    char_id = int(character.split(':')[0])
    stn_tst = get_text(text, hps, is_symbol)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([char_id])
        try:
            jp2phoneme = text_to_phoneme(text, hps.symbols, is_symbol)
            durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                               noise_scale_w=noise_scale_w, length_scale=duration)
        except IndexError:
            too_few_durations = True
        else:
            # Phoneme i lives at index 2*i + 1, so the last phoneme needs
            # index 2*len - 1 to exist.
            too_few_durations = len(durations) < 2 * len(jp2phoneme)
        if too_few_durations:
            print("Refused: Phoneme input contains non-phoneme character.")
            return "Error: You can only input phoneme under phoneme input model", None, None, None

        # Convert the duration information to its editable string form.
        duration_info_str = _format_duration_info(jp2phoneme, durations)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                            noise_scale_w=noise_scale_w, length_scale=duration)[0][0, 0].data.float().numpy()

    finished_at = datetime.now()
    print(f"\nCharacter {character} inference successful: {text}")
    if language != '日本語':
        print(f"translate from {language}: {text_raw}")
    show_memory_info(str(finished_at) + " infer调用后")
    return (text, (22050, audio), jp2phoneme, duration_info_str)
def _parse_duration_info(duration_info_str):
    """Parse ``"(s0), p1:d1(s1), p2:d2(s2)"`` into phonemes and durations.

    Returns:
        ``(phonemes, durations)`` where ``phonemes`` is the concatenated
        phoneme string and ``durations`` is the flat list
        ``[s0, d1, s1, d2, s2, ...]`` of integers.

    Raises:
        ValueError: If the input is not a string or does not follow the
            format.
    """
    if not isinstance(duration_info_str, str):
        raise ValueError("duration info must be a string")
    items = duration_info_str.split(", ")
    # The first item is only the leading spacing, e.g. "(3)".
    durations = [int(items[0].strip("()"))]
    phonemes = []
    for item in items[1:]:
        # Unpacking raises ValueError unless there is exactly one "(".
        phoneme_n_dur, spacing_dur = item.split("(")
        fields = phoneme_n_dur.split(":")
        if len(fields) < 2:
            raise ValueError(f"missing ':' in {item!r}")
        phonemes.append(fields[0])
        durations.append(int(fields[1]))
        durations.append(int(spacing_dur.strip(")")))
    return "".join(phonemes), durations


def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
    """Re-synthesize audio from a user-edited phoneme/duration string.

    Args:
        duration_info_str: String in the ``(s), p:d(s), ...`` format that
            ``infer`` returns as its fourth value.
        character: One of the entries of ``characters`` ("<id>:<name>").
        duration: Forwarded to the model as ``length_scale``.
        noise_scale: Forwarded to the model unchanged.
        noise_scale_w: Forwarded to the model unchanged.

    Returns:
        ``(phonemes, (22050, audio))`` on success, or
        ``(error_message, None)`` when the input is rejected.
    """
    try:
        recons_phonemes, recons_durs = _parse_duration_info(duration_info_str)
    except ValueError:
        return ("Error: Format must not be changed!", None)
    # Same character check as infer(), so a bad value is reported rather
    # than crashing while parsing the speaker id.
    if character not in characters:
        print("Error: No such character\n")
        return ("Error: No such character", None)

    char_id = int(character.split(':')[0])
    stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([char_id])
        audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid,
                                          noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                          length_scale=duration)[0][0, 0].data.cpu().float().numpy()
    print(f"\nCharacter {character} inference successful: {recons_phonemes}, from {duration_info_str}")
    return (recons_phonemes, (22050, audio))
# JavaScript template run in the browser by the "Download Audio" button.
# It is filled in with str.format(audio_id=...), which is why the literal
# braces of the JS function body are doubled. The script looks up the <audio>
# tag inside the element with that id and clicks a temporary download link
# pointing at its src, using a random number as the .wav file name.
download_audio_js = """
() =>{{
let root = document.querySelector("body > gradio-app");
if (root.shadowRoot != null)
root = root.shadowRoot;
let audio = root.querySelector("#{audio_id}").querySelector("audio");
if (audio == undefined)
return;
audio = audio.src;
let oA = document.createElement("a");
oA.download = Math.floor(Math.random()*100000000)+'.wav';
oA.href = audio;
document.body.appendChild(oA);
oA.click();
oA.remove();
}}
"""
if __name__ == "__main__":
    # Script entry point: parse CLI flags, build the Gradio UI and launch it.
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from the statement order - confirm against the deployed UI.
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()
    app = gr.Blocks()
    with app:
        # Page header and usage notes.
        gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n"
                    "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
                    "This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
                    "这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。[Dataset Link](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
                    "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
                    "You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
                    "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
                    "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
                    "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
                    "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
                    "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
                    )
        with gr.Row():
            with gr.Column():
                # Text input; its elem_id is looked up by the symbol-list JS below.
                textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id="tts-input")
                with gr.Accordion(label="Phoneme Input", open=False):
                    # Holds the raw text while the textbox shows cleaned symbols.
                    temp_text_var = gr.Variable()
                    symbol_input = gr.Checkbox(value=False, label="Symbol input")
                    symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
                                             samples=[[x] for x in symbols],
                                             elem_id="symbol-list")
                    symbol_list_json = gr.Json(value=symbols, visible=False)
                    symbol_input.change(to_symbol_fn,
                                        [symbol_input, textbox, temp_text_var],
                                        [textbox, temp_text_var])
                    # Client-side only: insert the clicked symbol at the
                    # textarea's current selection.
                    symbol_list.click(None, [symbol_list, symbol_list_json], textbox,
                                      _js=f"""
(i, symbols, text) => {{
let root = document.querySelector("body > gradio-app");
if (root.shadowRoot != null)
root = root.shadowRoot;
let text_input = root.querySelector("#tts-input").querySelector("textarea");
let startPos = text_input.selectionStart;
let endPos = text_input.selectionEnd;
let oldTxt = text_input.value;
let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
text_input.value = result;
let x = window.scrollX, y = window.scrollY;
text_input.focus();
text_input.selectionStart = startPos + symbols[i].length;
text_input.selectionEnd = startPos + symbols[i].length;
text_input.blur();
window.scrollTo(x, y);
text = text_input.value;
return text;
}}""")
                # select character
                char_dropdown = gr.Dropdown(choices=characters, value = "0:特别周", label='character')
                language_dropdown = gr.Dropdown(choices=languages, value = "日本語", label='language')
                duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='时长 Duration')
                noise_scale_slider = gr.Slider(minimum=0.1, maximum=5, value=0.667, step=0.001, label='噪声比例 noise_scale')
                noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
            with gr.Column():
                text_output = gr.Textbox(label="Output Text")
                phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
                audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                btn = gr.Button("Generate!")
                cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
                download = gr.Button("Download Audio")
                download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
                with gr.Accordion(label="Speaking Pace Control", open=True):
                    duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
                                                 interactive = True)
                    gr.Markdown(
                        "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the lenght of spacing between each phoneme and its next phoneme. "
                        "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
                        "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
                        "音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
                        "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
                        "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
                    )
                btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
                          outputs=[text_output, audio_output, phoneme_output, duration_output])
                cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
                                     outputs=[phoneme_output, audio_output])
        examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
                    ['お疲れ様です,トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
                    ['張り切っていこう!', '67:北部玄驹', '日本語', 1, 0.667, 0.8, False],
                    ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', '10:草上飞', '日本語', 1, 0.667, 0.8, False],
                    ['授業中に出しだら,学校生活終わるですわ。', '12:目白麦昆', '日本語', 1, 0.667, 0.8, False],
                    ['お帰りなさい,お兄様!', '29:米浴', '日本語', 1, 0.667, 0.8, False],
                    ['私の処女をもらっでください!', '29:米浴', '日本語', 1, 0.667, 0.8, False]]
        gr.Examples(
            examples=examples,
            inputs=[textbox, char_dropdown, language_dropdown,
                    duration_slider, noise_scale_slider,noise_scale_w_slider, symbol_input],
            # infer returns four values, so all four output components are
            # listed here, matching the btn.click wiring.
            outputs=[text_output, audio_output, phoneme_output, duration_output],
            fn=infer
        )
        # Changelog shown at the bottom of the page.
        gr.Markdown("# Updates Logs 更新日志:\n\n"
                    "2023/1/24:\n\n"
                    "Improved the format of phoneme length control.\n\n"
                    "改善了音素控制的格式。\n\n"
                    "2023/1/24:\n\n"
                    "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
                    "增加了对说话节奏的音素级控制。\n\n"
                    "2023/1/13:\n\n"
                    "Added one example of phoneme input.\n\n"
                    "增加了音素输入的example(米浴喘气)\n\n"
                    "2023/1/12:\n\n"
                    "Added phoneme input, which enables more precise control on output audio.\n\n"
                    "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
                    "Adjusted UI arrangements.\n\n"
                    "调整了UI的布局。\n\n"
                    "2023/1/10:\n\n"
                    "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
                    "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
                    "2023/1/9:\n\n"
                    "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
                    "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
                    "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
                    "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
                    )
    app.queue(concurrency_count=3).launch(show_api=False, share=args.share)