Spaces:
Running
Running
Katock
committed on
Commit
·
a73a5ce
1
Parent(s):
054c3da
tts
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import argparse
|
|
2 |
import io
|
3 |
import logging
|
4 |
import os
|
5 |
-
|
6 |
import gradio as gr
|
7 |
import gradio.processing_utils as gr_processing_utils
|
8 |
import librosa
|
@@ -19,36 +19,16 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
|
19 |
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
20 |
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
#
|
25 |
-
# def audio_postprocess(self, y):
|
26 |
-
# data = audio_postprocess_ori(self, y)
|
27 |
-
# if data is None:
|
28 |
-
# return None
|
29 |
-
# return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
|
30 |
-
|
31 |
-
|
32 |
-
# gr.Audio.postprocess = audio_postprocess
|
33 |
-
|
34 |
-
|
35 |
-
def create_vc_fn(model, spk):
|
36 |
-
def vc_fn(input_audio, vc_transform, auto_f0, f0p):
|
37 |
if input_audio is None:
|
38 |
return "请先上传音频", None
|
39 |
sampling_rate, audio = input_audio
|
40 |
-
|
41 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
42 |
if len(audio.shape) > 1:
|
43 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
44 |
-
|
45 |
-
# raw_audio_path = io.BytesIO()
|
46 |
-
# soundfile.write(raw_audio_path, audio, sampling_rate, format="wav")
|
47 |
-
# raw_audio_path.seek(0)
|
48 |
-
|
49 |
temp_path = "temp.wav"
|
50 |
soundfile.write(temp_path, audio, sampling_rate, format="wav")
|
51 |
-
|
52 |
out_audio = model.slice_inference(raw_audio_path=temp_path,
|
53 |
spk=spk,
|
54 |
slice_db=-40,
|
@@ -58,12 +38,38 @@ def create_vc_fn(model, spk):
|
|
58 |
tran=vc_transform,
|
59 |
f0_predictor=f0p,
|
60 |
auto_predict_f0=auto_f0)
|
61 |
-
|
62 |
os.remove(temp_path)
|
|
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
return 44100, out_audio
|
65 |
|
66 |
-
return
|
67 |
|
68 |
|
69 |
if __name__ == '__main__':
|
@@ -78,24 +84,36 @@ if __name__ == '__main__':
|
|
78 |
name = f
|
79 |
model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
|
80 |
cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
|
81 |
-
models.append((name, cover,
|
82 |
with gr.Blocks() as app:
|
83 |
gr.Markdown(
|
84 |
"# <center> GTASA人物SOVITS4.1(升级中,可能有bug)\n"
|
85 |
"## <center> 模型作者:B站Cyber蝈蝈总\n"
|
86 |
-
"<center> 使用此处资源创作的作品,请显著标明模型出处(B站Cyber
|
87 |
)
|
88 |
with gr.Tabs():
|
89 |
-
for (name, cover,
|
90 |
with gr.TabItem(name):
|
91 |
with gr.Row():
|
92 |
with gr.Column():
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
with gr.Column():
|
101 |
gr.Markdown(
|
@@ -104,5 +122,6 @@ if __name__ == '__main__':
|
|
104 |
'</div>'
|
105 |
)
|
106 |
vc_output = gr.Audio(label="输出音频")
|
107 |
-
vc_submit.click(
|
|
|
108 |
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|
|
|
2 |
import io
|
3 |
import logging
|
4 |
import os
|
5 |
+
import subprocess
|
6 |
import gradio as gr
|
7 |
import gradio.processing_utils as gr_processing_utils
|
8 |
import librosa
|
|
|
19 |
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
20 |
|
21 |
|
22 |
def create_fn(model, spk):
    """Build the two Gradio callbacks (voice conversion and TTS) for one model.

    Parameters
    ----------
    model : Svc
        Loaded so-vits-svc model; only its ``slice_inference`` method is used.
    spk : str
        Speaker name forwarded to ``slice_inference``.

    Returns
    -------
    tuple
        ``(svc_fn, tts_fn)`` — the audio-to-audio and text-to-audio callbacks.
    """

    def _infer(audio_path, vc_transform, auto_f0, f0p):
        # Shared inference call; always removes the scratch file, even when
        # slice_inference raises (the original leaked temp.wav on failure).
        try:
            return model.slice_inference(raw_audio_path=audio_path,
                                         spk=spk,
                                         slice_db=-40,
                                         # NOTE(review): the next three kwargs are hidden in the
                                         # diff view for svc_fn; taken from the parallel tts_fn
                                         # call — confirm against the full file.
                                         cluster_infer_ratio=0,
                                         noice_scale=0.4,
                                         clip_seconds=20,
                                         tran=vc_transform,
                                         f0_predictor=f0p,
                                         auto_predict_f0=auto_f0)
        finally:
            os.remove(audio_path)

    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
        """Voice-convert an uploaded recording; returns (sample_rate, audio)."""
        if input_audio is None:
            return "请先上传音频", None
        sampling_rate, audio = input_audio
        # Normalise integer PCM to float32 in [-1, 1] and downmix to mono.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")
        out_audio = _infer(temp_path, vc_transform, auto_f0, f0p)
        return 44100, out_audio

    def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
        """Synthesise ``input_text`` with edge-tts, then voice-convert it."""
        voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
        output_file = "temp.wav"
        # edge-tts wants a signed percentage such as "+10%" / "-5%";
        # "{:.0%}" already emits the minus sign for negative rates,
        # so only the plus sign needs to be added explicitly.
        # (The original if/elif left ratestr unbound for NaN input.)
        if tts_rate >= 0:
            ratestr = "+{:.0%}".format(tts_rate)
        else:
            ratestr = "{:.0%}".format(tts_rate)

        # Pass arguments as a list with shell=False: the original built a
        # shell command string from user-supplied text (shell=True), which
        # allowed command injection. check=True surfaces edge-tts failures
        # instead of silently proceeding with a missing/partial file.
        subprocess.run(["edge-tts",
                        "--text", input_text,
                        "--write-media", output_file,
                        "--voice", voice,
                        "--rate=" + ratestr],
                       check=True)
        out_audio = _infer(output_file, vc_transform, auto_f0, f0p)
        return 44100, out_audio

    return svc_fn, tts_fn
|
73 |
|
74 |
|
75 |
if __name__ == '__main__':
|
|
|
84 |
name = f
|
85 |
model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
|
86 |
cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
|
87 |
+
models.append((name, cover, create_fn(model, name)))
|
88 |
with gr.Blocks() as app:
|
89 |
gr.Markdown(
|
90 |
"# <center> GTASA人物SOVITS4.1(升级中,可能有bug)\n"
|
91 |
"## <center> 模型作者:B站Cyber蝈蝈总\n"
|
92 |
+
"<center> 使用此处资源创作的作品,请显著标明模型出处(B站Cyber蝈蝈总)\n"
|
93 |
)
|
94 |
with gr.Tabs():
|
95 |
+
for (name, cover, svc_fn, tts_fn) in models:
|
96 |
with gr.TabItem(name):
|
97 |
with gr.Row():
|
98 |
with gr.Column():
|
99 |
+
mode = gr.Radio(label='模式', value='音频转音频', choices=['文字转音频', '音频转音频'])
|
100 |
+
if mode == '音频转音频':
|
101 |
+
svc_input = gr.Audio(label="上传干声 (已支持无限长音频,处理时间约为原音频时间的5倍)")
|
102 |
+
vc_transform = gr.Number(label="音高调整 (支持正负半音,12为一个八度)", value=0)
|
103 |
+
auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
|
104 |
+
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
105 |
+
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
106 |
+
vc_submit = gr.Button("生成", variant="primary")
|
107 |
+
else:
|
108 |
+
text_input = gr.Textbox(label='说话内容', value='',
|
109 |
+
placeholder='请输入说话内容,(已支持无限长内容,处理时间约为说完原内容时间的5倍)')
|
110 |
+
gender = gr.Radio(label='说话人性别', value='男', choices=['男', '女'])
|
111 |
+
tts_rate = gr.Number(label='语速(正负百分比)', value=0)
|
112 |
+
vc_transform = gr.Number(label="音高调整 (支持正负半音,12为一个八度)", value=0)
|
113 |
+
auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
|
114 |
+
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
115 |
+
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
116 |
+
tts_submit = gr.Button("生成", variant="primary")
|
117 |
|
118 |
with gr.Column():
|
119 |
gr.Markdown(
|
|
|
122 |
'</div>'
|
123 |
)
|
124 |
vc_output = gr.Audio(label="输出音频")
|
125 |
+
vc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
|
126 |
+
tts_submit.click(tts_fn, [text_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor], vc_output)
|
127 |
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|