Spaces:

innnky
/

vits-nyaru

Running

App Files Community

rcell committed on Aug 18, 2022

Commit

cb3140f

1 Parent(s): 7bbcb75

update advance

Browse files

Files changed (1) hide show

app.py +37 -9

app.py CHANGED Viewed

@@ -15,8 +15,8 @@ import utils
 from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
 from models import SynthesizerTrn
 from text.symbols import symbols
-from text import text_to_sequence
 from scipy.io.wavfile import write
@@ -64,15 +64,43 @@ def tts(text):
         # print(stn_tst.size())
         audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
             0, 0].data.float().numpy()
-    return "成功", (hps.data.sampling_rate, audio)
 app = gr.Blocks()
 with app:
-    tts_input1 = gr.TextArea(label="请输入日语文本", value="こんにちは。")
-    # tts_input2 = gr.Dropdown(label="Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
-    tts_submit = gr.Button("Generate", variant="primary")
-    tts_output1 = gr.Textbox(label="Output Message")
-    tts_output2 = gr.Audio(label="Output Audio")
-    tts_submit.click(tts, [tts_input1], [tts_output1, tts_output2])
     app.launch()

 from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
 from models import SynthesizerTrn
 from text.symbols import symbols
+from text import text_to_sequence, cleaned_text_to_sequence
+from text.cleaners import japanese_cleaners
 from scipy.io.wavfile import write
         # print(stn_tst.size())
         audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
             0, 0].data.float().numpy()
+    return (hps.data.sampling_rate, audio)
+def clean_text(text):
+    return japanese_cleaners(text)
+def generate_from_clean(text):
+    sid = torch.LongTensor([2])  # speaker identity
+    text_norm = cleaned_text_to_sequence(text)
+    if hps.data.add_blank:
+        text_norm = commons.intersperse(text_norm, 0)
+    stn_tst = torch.LongTensor(text_norm)
+    with torch.no_grad():
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
+            0, 0].data.float().numpy()
+    return (hps.data.sampling_rate, audio)
 app = gr.Blocks()
 with app:
+    with gr.Tabs():
+        with gr.TabItem("基本"):
+            tts_input1 = gr.TextArea(label="请输入日语文本", value="こんにちは。")
+            # tts_input2 = gr.Dropdown(label="Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
+            tts_submit = gr.Button("生成", variant="primary")
+            # tts_output1 = gr.Textbox(label="Output Message")
+            tts_output2 = gr.Audio(label="输出")
+            tts_submit.click(tts, [tts_input1], [tts_output2])
+        with gr.TabItem("高级"):
+            tts_input3 = gr.TextArea(label="请输入日语文本", value="こんにちは。")
+            tts_s1 = gr.Button("清理", variant="primary")
+            tts_input4 = gr.TextArea(label="调整调形", value="ko↑Nniʧiwa.")
+            tts_s2 = gr.Button("生成", variant="primary")
+            tts_o = gr.Audio(label="输出")
+            tts_s1.click(clean_text, [tts_input3], [ tts_input4])
+            tts_s2.click(generate_from_clean, [tts_input4], [tts_o])
     app.launch()