vits_onnx / app /main.py
chocolatedesue
init
223aff6
# from multiprocessing import Process
import numpy as np
from .util import find_path_by_suffix, time_it
from loguru import logger
from .util import intersperse
from .config import Config
from .text import text_to_sequence
import gradio as gr
# import sys
# sys.path.append('..')
def text_to_seq(text: str):
text = Config.pattern.sub(' ', text).strip()
text_norm = text_to_sequence(
text, Config.hps.symbols, Config.hps.data.text_cleaners)
if Config.hps.data.add_blank:
text_norm = intersperse(text_norm, 0)
return text_norm
@time_it
@logger.catch
def tts_fn(text, speaker_id, speed=1.0):
if len(text) > 300:
return "Error: Text is too long, please down it to 300 characters", None
if not Config.model_is_ok:
return "Error: model not loaded, please wait for a while or look the log", None
seq = text_to_seq(text)
x = np.array([seq], dtype=np.int64)
x_len = np.array([x.shape[1]], dtype=np.int64)
sid = np.array([speaker_id], dtype=np.int64)
speed = 1/speed
scales = np.array([0.667, speed, 0.8], dtype=np.float32)
scales.resize(1, 3)
ort_inputs = {
'input': x,
'input_lengths': x_len,
'scales': scales,
'sid': sid
}
audio = np.squeeze(Config.ort_sess.run(None, ort_inputs))
audio *= 32767.0 / max(0.01, np.max(np.abs(audio))) * 0.6
audio = np.clip(audio, -32767.0, 32767.0)
return "success", (Config.hps.data.sampling_rate, audio.astype(np.int16))
def set_gradio_view():
app = gr.Blocks()
with app:
gr.Markdown(
"a demo of web service of vits, thanks to @CjangCjengh, copy from [link](https://huggingface.co/spaces/skytnt/moe-japanese-tts)")
with gr.Tabs():
with gr.TabItem("TTS"):
with gr.Column():
tts_input1 = gr.TextArea(
label="TTS_text", value="ใ‚ใŸใ—ใฎ่ถฃๅ‘ณใฏใŸใใ•ใ‚“ใ‚ใ‚Šใพใ™ใ€‚ใงใ‚‚ใ€ไธ€็•ชๅฅฝใใชไบ‹ใฏๅ†™็œŸใ‚’ใจใ‚‹ใ“ใจใงใ™ใ€‚")
tts_input2 = gr.Dropdown(
label="Speaker", choices=Config.speaker_choices, type="index", value=Config.speaker_choices[0])
tts_input3 = gr.Slider(
label="Speed", value=1, minimum=0.2, maximum=3, step=0.1)
tts_submit = gr.Button("Generate", variant="primary")
tts_output1 = gr.Textbox(label="Output Message")
tts_output2 = gr.Audio(label="Output Audio")
inputs = [
tts_input1, tts_input2, tts_input3
]
outputs = [
tts_output1, tts_output2]
tts_submit.click(tts_fn, inputs=inputs, outputs=outputs)
app.queue(concurrency_count=3)
gr.close_all()
app.launch(server_name='0.0.0.0', show_api=False,
share=False, server_port=7860)
def main():
# p = Process(target=Config.init)
# p.start()
Config.init()
set_gradio_view()
if __name__ == '__main__':
main()